vllm.v1.attention.ops.triton_unified_attention

_get_tile_size

_get_tile_size(
    head_size: int,
    sliding_window: int,
    element_size: int,
    is_prefill: bool,
) -> int

Select tile size with Gemma3-specific optimization.

For Gemma3, use 32 for both prefill and decode to better utilize the larger head dimension (128/256). For other models, use the default vLLM behavior.

Source code in vllm/v1/attention/ops/triton_unified_attention.py
def _get_tile_size(
    head_size: int,
    sliding_window: int,
    element_size: int,
    is_prefill: bool,
) -> int:
    """Select tile size with Gemma3-specific optimization.

    For Gemma3, use 32 for both prefill and decode to better utilize
    the larger head dimension (128/256). For other models, use
    the default vLLM behavior.
    """
    if _is_gemma3_attention(head_size, sliding_window):
        # Gemma3: use 32 for both prefill and decode (decode default is 16)
        return 32

    # Default behavior
    if is_prefill:
        return 32
    return 16 if element_size >= 2 else 32
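
The snippet below is a minimal sketch (not part of the module) illustrating how the selector behaves for a few representative configurations. It assumes the private helper is importable from this module path exactly as shown in the listing above; the non-Gemma3 head sizes and element sizes are illustrative only.

# Illustrative check of the tile-size selection logic shown above.
from vllm.v1.attention.ops.triton_unified_attention import _get_tile_size

# Gemma3 27B-style config (head_size=128, sliding_window=1024):
# tile size 32 for both prefill and decode.
assert _get_tile_size(128, 1024, element_size=2, is_prefill=False) == 32
assert _get_tile_size(128, 1024, element_size=2, is_prefill=True) == 32

# Non-Gemma3 SWA config (e.g. a 4096-token window): default behavior,
# 16 for a 2-byte element decode, 32 for prefill.
assert _get_tile_size(128, 4096, element_size=2, is_prefill=False) == 16
assert _get_tile_size(128, 4096, element_size=2, is_prefill=True) == 32

# 1-byte elements fall back to tile size 32 even on the decode path.
assert _get_tile_size(128, 4096, element_size=1, is_prefill=False) == 32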

_is_gemma3_attention

_is_gemma3_attention(
    head_size: int, sliding_window: int
) -> bool

Detect Gemma3 models via unique (head_size, sliding_window) signature.

Gemma3 models are the only ones using sliding_window=1024 with head_size 128 (27B) or 256 (1B, 4B, 12B). Other SWA models use different window sizes (Mistral=4096, Phi-3=2047).

Source code in vllm/v1/attention/ops/triton_unified_attention.py
def _is_gemma3_attention(head_size: int, sliding_window: int) -> bool:
    """Detect Gemma3 models via unique (head_size, sliding_window) signature.

    Gemma3 models are the only ones using sliding_window=1024 with
    head_size 128 (27B) or 256 (1B, 4B, 12B). Other SWA models use
    different window sizes (Mistral=4096, Phi-3=2047).
    """
    return sliding_window == 1024 and head_size in (128, 256)
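
A quick illustration of the detection heuristic follows; it is a sketch, not part of the module, and assumes the private helper is importable from the same path. The non-Gemma3 head sizes are placeholders chosen only to exercise the window-size comparison.

# Illustrative checks of the (head_size, sliding_window) signature match.
from vllm.v1.attention.ops.triton_unified_attention import _is_gemma3_attention

assert _is_gemma3_attention(256, 1024)      # Gemma3 1B/4B/12B signature
assert _is_gemma3_attention(128, 1024)      # Gemma3 27B signature
assert not _is_gemma3_attention(128, 4096)  # Mistral-style window size
assert not _is_gemma3_attention(96, 2047)   # Phi-3-style window size
assert not _is_gemma3_attention(64, 1024)   # window matches, head size does not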