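Override environment variables for EPLB when specific conditions are met: set `NCCL_MAX_CTAS` to 8 when data parallelism (`data_parallel_size > 1`), EPLB, async EPLB, and the `deepep_low_latency` all2all backend are all in use, unless `NCCL_MAX_CTAS` is already set to a numeric value.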
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `parallel_config` | `ParallelConfig` | The parallel configuration object. | required |
Source code in vllm/distributed/eplb/eplb_utils.py
def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:
    """
    Cap NCCL's CTA usage when async EPLB runs with DeepEP low-latency.

    When data parallelism, EPLB, async EPLB and the ``deepep_low_latency``
    all2all backend are all active, ``NCCL_MAX_CTAS`` is set to ``8`` in the
    process environment and the change is logged once. A value the user
    already exported is kept as long as it consists only of digits.

    Args:
        parallel_config: The parallel configuration object.
    """
    # Why the cap is needed (see
    # https://github.com/deepseek-ai/DeepEP/issues/496):
    #
    # Async EPLB exchanges weights with NCCL collectives from a background
    # thread, while DeepEP low-latency (LL) kernels are launched from the
    # main thread. DeepEP LL uses a cooperative launch that tries to reserve
    # a large fraction of the GPU's SMs and blocks until enough are free.
    #
    # Two ranks may interleave these launches differently: rank A enters
    # DeepEP LL and waits for SMs held by NCCL, while rank B sits inside the
    # NCCL collective waiting for rank A to join it. Neither can make
    # progress, so the job deadlocks.
    #
    # Bounding NCCL's occupancy through NCCL_MAX_CTAS leaves room for the
    # cooperative kernel to launch and finish, which breaks the cycle.

    # All four settings are read up front, in this order, before deciding.
    needs_nccl_cap = all(
        (
            parallel_config.data_parallel_size > 1,
            parallel_config.enable_eplb,
            parallel_config.eplb_config.use_async,
            parallel_config.all2all_backend == "deepep_low_latency",
        )
    )
    if not needs_nccl_cap:
        return

    # Respect an explicit, all-digit user setting; anything else (unset,
    # empty, or non-numeric) is replaced below.
    user_setting = os.getenv("NCCL_MAX_CTAS")
    if user_setting and user_setting.isdigit():
        return

    nccl_max_ctas = 8
    os.environ["NCCL_MAX_CTAS"] = str(nccl_max_ctas)
    logger.info_once(
        f"EPLB: Setting NCCL_MAX_CTAS={nccl_max_ctas} "
        "for expert parallel with EPLB and deepep_low_latency backend",
        scope="global",
    )
|