vllm.model_executor.layers.fused_moe.oracle.unquantized

select_unquantized_moe_backend

select_unquantized_moe_backend(
    moe_config: FusedMoEConfig, use_ep: bool, use_dp: bool
) -> UnquantizedMoeBackend

Select the primary unquantized MoE backend. Note: shape-specific fallbacks may still occur at runtime.
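
A minimal usage sketch, assuming an existing FusedMoEConfig already assembled by the FusedMoE layer (its construction is not shown in this module, so moe_config below is a placeholder):

from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
    select_unquantized_moe_backend,
)

# moe_config is assumed to be a FusedMoEConfig built elsewhere by the
# FusedMoE layer; the flags mirror the layer's parallel configuration.
backend = select_unquantized_moe_backend(
    moe_config=moe_config,
    use_ep=True,   # expert parallelism enabled
    use_dp=False,  # data parallelism disabled
)
print(backend)  # e.g. UnquantizedMoeBackend.FLASHINFER_CUTLASS on Hopper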

Source code in vllm/model_executor/layers/fused_moe/oracle/unquantized.py
def select_unquantized_moe_backend(
    moe_config: FusedMoEConfig,
    use_ep: bool,
    use_dp: bool,
) -> UnquantizedMoeBackend:
    """
    Select the primary Unquantized MoE backend
    Note: Shape-specific fallbacks may still occur at runtime.
    """

    def _make_log_backend(backend: UnquantizedMoeBackend):
        return f"Using {backend.value} backend for Unquantized MoE"

    rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()

    activation_format = (
        mk.FusedMoEActivationFormat.BatchedExperts
        if moe_config.moe_parallel_config.use_batched_activation_format
        else mk.FusedMoEActivationFormat.Standard
    )

    # Check if FlashInfer TRTLLM BF16 MoE is supported
    trtllm_supported, _ = is_supported_config_trtllm_bf16(
        moe_config=moe_config,
        activation_format=activation_format,
    )
    flashinfer_trtllm_moe_enabled = (
        has_flashinfer()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and trtllm_supported
        and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency"
    )
    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
    flashinfer_cutlass_moe_enabled = (
        has_flashinfer_cutlass_fused_moe()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and use_ep
        and (not use_dp)
        and current_platform.has_device_capability(90)
    )
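    # Platform dispatch: each block below handles one platform. On CUDA,
    # FlashInfer paths are preferred when enabled; otherwise Triton is used.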
    if current_platform.is_rocm():
        if rocm_aiter_moe_enabled:
            backend = UnquantizedMoeBackend.AITER
        else:
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_cuda():
        if flashinfer_trtllm_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM
        elif flashinfer_cutlass_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
            if trtllm_supported:
                logger.info_once(
                    "FlashInfer TRTLLM MoE is available but not enabled, "
                    "consider setting VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "to enable it for better performance.",
                    scope="local",
                )
        else:
            if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported:
                logger.info_once(
                    "FlashInfer TRTLLM MoE is available but not enabled, "
                    "consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 "
                    "and VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "to enable it for better performance.",
                    scope="local",
                )
            elif use_ep and (not use_dp):
                logger.info_once(
                    "FlashInfer MoE is available for EP"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",
                )
            elif use_dp:
                logger.info_once(
                    "FlashInfer CUTLASS MoE is currently not available for DP.",
                    scope="local",
                )
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_xpu():
        backend = UnquantizedMoeBackend.XPU
    if current_platform.is_cpu():
        backend = UnquantizedMoeBackend.CPU
    if current_platform.is_tpu():
        backend = UnquantizedMoeBackend.TPU
    if current_platform.is_out_of_tree():
        backend = UnquantizedMoeBackend.OOT

    logger.info_once(_make_log_backend(backend), scope="local")
    return backend