vllm.model_executor.model_loader.tensorizer

TensorizerArgs dataclass

Source code in vllm/model_executor/model_loader/tensorizer.py
@dataclass
class TensorizerArgs:
    tensorizer_uri: str | None = None
    tensorizer_dir: str | None = None
    encryption_keyfile: str | None = None

    def __init__(self, tensorizer_config: TensorizerConfig):
        for k, v in tensorizer_config.items():
            setattr(self, k, v)
        self.file_obj = tensorizer_config.tensorizer_uri
        self.s3_access_key_id = (
            tensorizer_config.s3_access_key_id or envs.S3_ACCESS_KEY_ID
        )
        self.s3_secret_access_key = (
            tensorizer_config.s3_secret_access_key or envs.S3_SECRET_ACCESS_KEY
        )
        self.s3_endpoint = tensorizer_config.s3_endpoint or envs.S3_ENDPOINT_URL

        self.stream_kwargs = {
            "s3_access_key_id": tensorizer_config.s3_access_key_id,
            "s3_secret_access_key": tensorizer_config.s3_secret_access_key,
            "s3_endpoint": tensorizer_config.s3_endpoint,
            **(tensorizer_config.stream_kwargs or {}),
        }

        self.deserialization_kwargs = {
            "verify_hash": tensorizer_config.verify_hash,
            "encryption": tensorizer_config.encryption_keyfile,
            "num_readers": tensorizer_config.num_readers,
            **(tensorizer_config.deserialization_kwargs or {}),
        }

        if self.encryption_keyfile:
            with open_stream(
                tensorizer_config.encryption_keyfile,
                **self.stream_kwargs,
            ) as stream:
                key = stream.read()
                decryption_params = DecryptionParams.from_key(key)
                self.deserialization_kwargs["encryption"] = decryption_params

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Tensorizer CLI arguments"""

        # Tensorizer options arg group
        group = parser.add_argument_group(
            "tensorizer options",
            description=(
                "Options for configuring the behavior of the"
                " tensorizer deserializer when "
                "load_format=tensorizer is specified when "
                "initializing an LLMEngine, either via the CLI "
                "when running the vLLM OpenAI inference server "
                "with a JSON string passed to "
                "--model-loader-extra-config or as arguments given "
                "to TensorizerConfig when passed to "
                "model_loader_extra_config in the constructor "
                "for LLMEngine."
            ),
        )

        group.add_argument(
            "--tensorizer-uri",
            type=str,
            help="Path to serialized model tensors. Can be a local file path,"
            " or an HTTP(S) or S3 URI.",
        )
        group.add_argument(
            "--verify-hash",
            action="store_true",
            help="If enabled, the hashes of each tensor will be verified"
            " against the hashes stored in the file metadata. An exception"
            " will be raised if any of the hashes do not match.",
        )
        group.add_argument(
            "--encryption-keyfile",
            type=str,
            default=None,
            help="The file path to a binary file containing a binary key to "
            "use for decryption. Can be a file path or S3 network URI.",
        )
        group.add_argument(
            "--num-readers",
            default=None,
            type=int,
            help="Controls how many threads are allowed to read concurrently "
            "from the source file. Default is `None`, which will dynamically "
            "set the number of readers based on the available resources "
            "and model size. This greatly increases performance.",
        )
        group.add_argument(
            "--s3-access-key-id",
            type=str,
            default=None,
            help="The access key for the S3 bucket. Can also be set via the "
            "S3_ACCESS_KEY_ID environment variable.",
        )
        group.add_argument(
            "--s3-secret-access-key",
            type=str,
            default=None,
            help="The secret access key for the S3 bucket. Can also be set via "
            "the S3_SECRET_ACCESS_KEY environment variable.",
        )
        group.add_argument(
            "--s3-endpoint",
            type=str,
            default=None,
            help="The endpoint for the S3 bucket. Can also be set via the "
            "S3_ENDPOINT_URL environment variable.",
        )

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        tensorizer_args = cls(
            **{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)}
        )
        return tensorizer_args
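
A minimal usage sketch (the S3 URI below is hypothetical; S3 credentials fall back to the S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, and S3_ENDPOINT_URL environment variables when unset):

from vllm.model_executor.model_loader.tensorizer import (
    TensorizerArgs,
    TensorizerConfig,
)

config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/vllm/model.tensors",  # hypothetical URI
    num_readers=4,
)

# Equivalent to config._construct_tensorizer_args()
args = TensorizerArgs(config)
print(args.stream_kwargs, args.deserialization_kwargs)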

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
) -> FlexibleArgumentParser

Tensorizer CLI arguments

Source code in vllm/model_executor/model_loader/tensorizer.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Tensorizer CLI arguments"""

    # Tensorizer options arg group
    group = parser.add_argument_group(
        "tensorizer options",
        description=(
            "Options for configuring the behavior of the"
            " tensorizer deserializer when "
            "load_format=tensorizer is specified when "
            "initializing an LLMEngine, either via the CLI "
            "when running the vLLM OpenAI inference server "
            "with a JSON string passed to "
            "--model-loader-extra-config or as arguments given "
            "to TensorizerConfig when passed to "
            "model_loader_extra_config in the constructor "
            "for LLMEngine."
        ),
    )

    group.add_argument(
        "--tensorizer-uri",
        type=str,
        help="Path to serialized model tensors. Can be a local file path,"
        " or an HTTP(S) or S3 URI.",
    )
    group.add_argument(
        "--verify-hash",
        action="store_true",
        help="If enabled, the hashes of each tensor will be verified"
        " against the hashes stored in the file metadata. An exception"
        " will be raised if any of the hashes do not match.",
    )
    group.add_argument(
        "--encryption-keyfile",
        type=str,
        default=None,
        help="The file path to a binary file containing a binary key to "
        "use for decryption. Can be a file path or S3 network URI.",
    )
    group.add_argument(
        "--num-readers",
        default=None,
        type=int,
        help="Controls how many threads are allowed to read concurrently "
        "from the source file. Default is `None`, which will dynamically "
        "set the number of readers based on the available resources "
        "and model size. This greatly increases performance.",
    )
    group.add_argument(
        "--s3-access-key-id",
        type=str,
        default=None,
        help="The access key for the S3 bucket. Can also be set via the "
        "S3_ACCESS_KEY_ID environment variable.",
    )
    group.add_argument(
        "--s3-secret-access-key",
        type=str,
        default=None,
        help="The secret access key for the S3 bucket. Can also be set via "
        "the S3_SECRET_ACCESS_KEY environment variable.",
    )
    group.add_argument(
        "--s3-endpoint",
        type=str,
        default=None,
        help="The endpoint for the S3 bucket. Can also be set via the "
        "S3_ENDPOINT_URL environment variable.",
    )

    return parser
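
A short sketch of wiring these options into a standalone script; the FlexibleArgumentParser import path is assumed to be vllm.utils, and the example URI is hypothetical:

from vllm.utils import FlexibleArgumentParser
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerArgs,
    TensorizerConfig,
)

parser = FlexibleArgumentParser(description="Deserialize a tensorized model")
parser = TensorizerArgs.add_cli_args(parser)

# e.g. --tensorizer-uri s3://my-bucket/vllm/model.tensors --num-readers 4
args = parser.parse_args()

# Build a TensorizerConfig from the parsed namespace; the argparse dest
# names mirror the TensorizerConfig field names.
config = TensorizerConfig(
    tensorizer_uri=args.tensorizer_uri,
    verify_hash=args.verify_hash,
    encryption_keyfile=args.encryption_keyfile,
    num_readers=args.num_readers,
)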

TensorizerConfig dataclass

Bases: MutableMapping

Source code in vllm/model_executor/model_loader/tensorizer.py
@dataclass
class TensorizerConfig(MutableMapping):
    tensorizer_uri: str | None = None
    tensorizer_dir: str | None = None
    vllm_tensorized: bool | None = None
    verify_hash: bool | None = None
    num_readers: int | None = None
    encryption_keyfile: str | None = None
    s3_access_key_id: str | None = None
    s3_secret_access_key: str | None = None
    s3_endpoint: str | None = None
    lora_dir: str | None = None
    stream_kwargs: dict[str, Any] | None = None
    serialization_kwargs: dict[str, Any] | None = None
    deserialization_kwargs: dict[str, Any] | None = None
    _extra_serialization_attrs: dict[str, Any] | None = field(init=False, default=None)
    model_class: type[torch.nn.Module] | None = field(init=False, default=None)
    hf_config: PretrainedConfig | None = field(init=False, default=None)
    dtype: str | torch.dtype | None = field(init=False, default=None)
    _is_sharded: bool = field(init=False, default=False)
    _fields: ClassVar[tuple[str, ...]]
    _keys: ClassVar[frozenset[str]]
    """Configuration class for Tensorizer settings.

    These settings configure the behavior of model serialization and 
    deserialization using Tensorizer.

    Attributes:
        tensorizer_uri: Path to serialized model tensors. Can be a local file
            path or an S3 URI. Required unless a `tensorizer_dir` or
            `lora_dir` is passed to this object's initializer, e.g. when
            the config is meant to be used for the
            `tensorize_lora_adapter` function.
        tensorizer_dir: Path to a directory containing serialized model tensors,
            and all other potential model artifacts to load the model, such as 
            configs and tokenizer files. Can be passed instead of 
            `tensorizer_uri` where the `model.tensors` file will be assumed 
            to be in this directory.
        vllm_tensorized: If True, indicates that the serialized model is a 
            vLLM model. This is used to determine the behavior of the 
            TensorDeserializer when loading tensors from a serialized model.
            It is far faster to deserialize a vLLM model as it utilizes
            tensorizer's optimized GPU loading. Note that this is now
            deprecated, as serialized vLLM models are now automatically
            inferred as vLLM models.
        verify_hash: If True, the hashes of each tensor will be verified 
            against the hashes stored in the metadata. A `HashMismatchError` 
            will be raised if any of the hashes do not match.
        num_readers: Controls how many threads are allowed to read concurrently
            from the source file. Default is `None`, which will dynamically set
            the number of readers based on the number of available 
            resources and model size. This greatly increases performance.
        encryption_keyfile: File path to a binary file containing a
            binary key to use for decryption. `None` (the default) means
            no decryption. See the example script in
            examples/others/tensorize_vllm_model.py.
        s3_access_key_id: The access key for the S3 bucket. Can also be set via
            the S3_ACCESS_KEY_ID environment variable.
        s3_secret_access_key: The secret access key for the S3 bucket. Can also
            be set via the S3_SECRET_ACCESS_KEY environment variable.
        s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
            S3_ENDPOINT_URL environment variable.
        lora_dir: Path to a directory containing LoRA adapter artifacts for 
            serialization or deserialization. When serializing LoRA adapters 
            this is the only necessary parameter to pass to this object's 
            initializer.
    """

    def __post_init__(self):
        # check if the configuration is for a sharded vLLM model
        self._is_sharded = (
            isinstance(self.tensorizer_uri, str)
            and re.search(r"%0\dd", self.tensorizer_uri) is not None
        )

        if self.tensorizer_dir and self.lora_dir:
            raise ValueError(
                "Only one of tensorizer_dir or lora_dir may be specified. "
                "Use lora_dir exclusively when serializing LoRA adapters, "
                "and tensorizer_dir or tensorizer_uri otherwise."
            )
        if self.tensorizer_dir and self.tensorizer_uri:
            logger.warning_once(
                "Provided both tensorizer_dir and tensorizer_uri. "
                "Inferring tensorizer_dir from tensorizer_uri as the "
                "latter takes precedence."
            )
            self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
        if not self.tensorizer_uri:
            if self.lora_dir:
                self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
            elif self.tensorizer_dir:
                self.tensorizer_uri = f"{self.tensorizer_dir}/model.tensors"
            else:
                raise ValueError(
                    "Unable to resolve tensorizer_uri. "
                    "A valid tensorizer_uri or tensorizer_dir "
                    "must be provided for deserialization, and a "
                    "valid tensorizer_uri, tensorizer_uri, or "
                    "lora_dir for serialization."
                )
        else:
            self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)

        if not self.serialization_kwargs:
            self.serialization_kwargs = {}
        if not self.deserialization_kwargs:
            self.deserialization_kwargs = {}

    def to_serializable(self) -> dict[str, Any]:
        # Due to TensorizerConfig needing to be msgpack-serializable, it needs
        # support for morphing back and forth between itself and its dict
        # representation

        # TensorizerConfig's representation as a dictionary is meant to be
        # linked to TensorizerConfig in such a way that the following is
        # technically initializable:
        # TensorizerConfig(**my_tensorizer_cfg.to_serializable())

        # This means the dict must not retain non-initializable parameters
        # and post-init attribute states

        # Also don't want to retain private or unset parameters, so only
        # retain non-None values and public attributes

        raw_tc_dict = asdict(self)
        blacklisted = []

        if "tensorizer_uri" in raw_tc_dict and "tensorizer_dir" in raw_tc_dict:
            blacklisted.append("tensorizer_dir")

        if "tensorizer_dir" in raw_tc_dict and "lora_dir" in raw_tc_dict:
            blacklisted.append("tensorizer_dir")

        tc_dict = {}
        for k, v in raw_tc_dict.items():
            if (
                k not in blacklisted
                and k not in tc_dict
                and not k.startswith("_")
                and v is not None
            ):
                tc_dict[k] = v

        return tc_dict

    def _construct_tensorizer_args(self) -> "TensorizerArgs":
        return TensorizerArgs(self)  # type: ignore

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        if parallel_config.tensor_parallel_size > 1 and not self._is_sharded:
            raise ValueError(
                "For a sharded model, tensorizer_uri should include a"
                " string format template like '%04d' to be formatted"
                " with the rank of the shard"
            )

    def verify_with_model_config(self, model_config: "ModelConfig") -> None:
        if model_config.quantization is not None and self.tensorizer_uri is not None:
            logger.warning(
                "Loading a model using Tensorizer with quantization on vLLM"
                " is unstable and may lead to errors."
            )

    def open_stream(self, tensorizer_args: "TensorizerArgs | None" = None):
        if tensorizer_args is None:
            tensorizer_args = self._construct_tensorizer_args()

        return open_stream(self.tensorizer_uri, **tensorizer_args.stream_kwargs)

    def keys(self):
        return self._keys

    def __len__(self):
        return len(fields(self))

    def __iter__(self):
        return iter(self._fields)

    def __getitem__(self, item: str) -> Any:
        if item not in self.keys():
            raise KeyError(item)
        return getattr(self, item)

    def __setitem__(self, key: str, value: Any) -> None:
        if key not in self.keys():
            # Disallow modifying invalid keys
            raise KeyError(key)
        setattr(self, key, value)

    def __delitem__(self, key, /):
        if key not in self.keys():
            raise KeyError(key)
        delattr(self, key)
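
Because TensorizerConfig is a MutableMapping, instances support dict-style access, and to_serializable() produces a plain dict that can round-trip through the constructor. A minimal sketch (the URI is hypothetical):

config = TensorizerConfig(tensorizer_uri="s3://my-bucket/vllm/model.tensors")

# Mapping-style access is backed by the dataclass fields.
assert config["tensorizer_uri"] == config.tensorizer_uri

# Round-trip through the msgpack-friendly dict representation.
restored = TensorizerConfig(**config.to_serializable())
assert restored.tensorizer_uri == config.tensorizer_uri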

Configuration class for Tensorizer settings.

These settings configure the behavior of model serialization and deserialization using Tensorizer.

Attributes:

tensorizer_uri: Path to serialized model tensors. Can be a local file path or an S3 URI. Required unless a tensorizer_dir or lora_dir is passed to this object's initializer, e.g. when the config is meant to be used for the tensorize_lora_adapter function.

tensorizer_dir: Path to a directory containing serialized model tensors and all other potential model artifacts needed to load the model, such as configs and tokenizer files. Can be passed instead of tensorizer_uri, in which case the model.tensors file is assumed to be in this directory.

vllm_tensorized: If True, indicates that the serialized model is a vLLM model. This is used to determine the behavior of the TensorDeserializer when loading tensors from a serialized model. It is far faster to deserialize a vLLM model as it utilizes tensorizer's optimized GPU loading. Note that this flag is now deprecated, as serialized vLLM models are automatically inferred as vLLM models.

verify_hash: If True, the hashes of each tensor will be verified against the hashes stored in the metadata. A HashMismatchError will be raised if any of the hashes do not match.

num_readers: Controls how many threads are allowed to read concurrently from the source file. Default is None, which will dynamically set the number of readers based on the available resources and model size. This greatly increases performance.

encryption_keyfile: File path to a binary file containing a binary key to use for decryption. None (the default) means no decryption. See the example script in examples/others/tensorize_vllm_model.py.

s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable.

s3_secret_access_key: The secret access key for the S3 bucket. Can also be set via the S3_SECRET_ACCESS_KEY environment variable.

s3_endpoint: The endpoint for the S3 bucket. Can also be set via the S3_ENDPOINT_URL environment variable.

lora_dir: Path to a directory containing LoRA adapter artifacts for serialization or deserialization. When serializing LoRA adapters, this is the only necessary parameter to pass to this object's initializer.
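
For end-to-end deserialization, the config is typically supplied through model_loader_extra_config. A hedged sketch, assuming a model has already been serialized to the (hypothetical) URI below:

from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    load_format="tensorizer",
    model_loader_extra_config={
        "tensorizer_uri": "s3://my-bucket/vllm/opt-125m/model.tensors",
    },
)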

_resize_lora_embeddings

_resize_lora_embeddings(model: Module)

Modify LoRA embedding layers to use bigger tensors to allow for adapter added tokens.

Source code in vllm/model_executor/model_loader/tensorizer.py
def _resize_lora_embeddings(model: nn.Module):
    """Modify LoRA embedding layers to use bigger tensors
    to allow for adapter added tokens."""
    for child in model.modules():
        if (
            isinstance(child, VocabParallelEmbedding)
            and child.weight.shape[0] < child.num_embeddings_per_partition
        ):
            new_weight = torch.empty(
                child.num_embeddings_per_partition,
                child.embedding_dim,
                dtype=child.weight.dtype,
                device=child.weight.device,
            )
            new_weight[: child.weight.shape[0]].copy_(child.weight.data)
            new_weight[child.weight.shape[0] :].fill_(0)
            child.weight.data = new_weight
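
The padding logic is easy to see in isolation; a standalone sketch with plain tensors (shapes are arbitrary):

import torch

# Original embedding weight has 4 rows; the partition expects 6.
weight = torch.randn(4, 8)
num_embeddings_per_partition = 6

new_weight = torch.empty(num_embeddings_per_partition, 8, dtype=weight.dtype)
new_weight[: weight.shape[0]].copy_(weight)    # keep the existing rows
new_weight[weight.shape[0] :].fill_(0)         # zero-fill the added rows
assert new_weight.shape == (6, 8)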

is_vllm_tensorized

is_vllm_tensorized(
    tensorizer_config: TensorizerConfig,
) -> bool

Infer if the model is a vLLM model by checking the weights for a vLLM tensorized marker.

Parameters:

tensorizer_config (TensorizerConfig, required): The TensorizerConfig object containing the tensorizer_uri to the serialized model.

Returns:

bool: True if the model is a vLLM model, False otherwise.

Source code in vllm/model_executor/model_loader/tensorizer.py
def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
    """
    Infer if the model is a vLLM model by checking the weights for
    a vLLM tensorized marker.

    Args:
        tensorizer_config: The TensorizerConfig object containing the
            tensorizer_uri to the serialized model.

    Returns:
        bool: True if the model is a vLLM model, False otherwise.
    """
    tensorizer_args = tensorizer_config._construct_tensorizer_args()
    deserializer = TensorDeserializer(
        open_stream(tensorizer_args.tensorizer_uri, **tensorizer_args.stream_kwargs),
        **tensorizer_args.deserialization_kwargs,
        lazy_load=True,
    )
    if tensorizer_config.vllm_tensorized:
        logger.warning(
            "Please note that newly serialized vLLM models are automatically "
            "inferred as vLLM models, so setting vllm_tensorized=True is "
            "only necessary for models serialized prior to this change."
        )
        return True
    return ".vllm_tensorized_marker" in deserializer

tensorize_lora_adapter

tensorize_lora_adapter(
    lora_path: str, tensorizer_config: TensorizerConfig
)

Uses tensorizer to serialize a LoRA adapter. Assumes that the files needed to load a LoRA adapter are a safetensors-format file called adapter_model.safetensors and a JSON config file called adapter_config.json.

Serializes the files to tensorizer_config.tensorizer_dir.

Source code in vllm/model_executor/model_loader/tensorizer.py
def tensorize_lora_adapter(lora_path: str, tensorizer_config: TensorizerConfig):
    """
    Uses tensorizer to serialize a LoRA adapter. Assumes that the files
    needed to load a LoRA adapter are a safetensors-format file called
    adapter_model.safetensors and a JSON config file called
    adapter_config.json.

    Serializes the files to tensorizer_config.tensorizer_dir.
    """
    import safetensors.torch

    from vllm.lora.utils import get_adapter_absolute_path

    lora_dir = get_adapter_absolute_path(lora_path)

    tensor_path = config_path = ""

    for file in os.listdir(lora_dir):
        if file.startswith("adapter_model"):
            tensor_path = lora_dir + "/" + file
        if file.startswith("adapter_config"):
            config_path = lora_dir + "/" + file
        if tensor_path and config_path:
            break

    if tensor_path.endswith(".safetensors"):
        tensors = safetensors.torch.load_file(tensor_path)
    elif tensor_path.endswith(".bin"):
        tensors = torch.load(tensor_path, weights_only=True)
    else:
        raise ValueError(
            f"Unsupported adapter model file: {tensor_path}. "
            f"Must be a .safetensors or .bin file."
        )

    with open(config_path) as f:
        config = json.load(f)

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    with open_stream(
        f"{tensorizer_config.tensorizer_dir}/adapter_config.json",
        mode="wb+",
        **tensorizer_args.stream_kwargs,
    ) as f:
        f.write(json.dumps(config).encode("utf-8"))

    lora_uri = f"{tensorizer_config.tensorizer_dir}/adapter_model.tensors"
    with open_stream(lora_uri, mode="wb+", **tensorizer_args.stream_kwargs) as f:
        serializer = TensorSerializer(f)
        serializer.write_state_dict(tensors)
        serializer.close()

    logger.info(
        "Successfully serialized LoRA files to %s",
        str(tensorizer_config.tensorizer_dir),
    )
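
A hedged usage sketch; the adapter reference and destination directory are hypothetical:

from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig,
    tensorize_lora_adapter,
)

# lora_dir is the only required parameter when serializing adapters;
# adapter_model.tensors and adapter_config.json are written there.
config = TensorizerConfig(lora_dir="s3://my-bucket/vllm/lora/my-adapter")
tensorize_lora_adapter("some-user/my-lora-adapter", config)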

tensorize_vllm_model

tensorize_vllm_model(
    engine_args: EngineArgs,
    tensorizer_config: TensorizerConfig,
    generate_keyfile: bool = True,
)

Utility to load a model and then serialize it with Tensorizer

Intended to be used separately from running a vLLM server since it creates its own Engine instance.

Source code in vllm/model_executor/model_loader/tensorizer.py
def tensorize_vllm_model(
    engine_args: "EngineArgs",
    tensorizer_config: TensorizerConfig,
    generate_keyfile: bool = True,
):
    """Utility to load a model and then serialize it with Tensorizer

    Intended to be used separately from running a vLLM server since it
    creates its own Engine instance.
    """
    engine_config = engine_args.create_engine_config()
    tensorizer_config.verify_with_model_config(engine_config.model_config)
    tensorizer_config.verify_with_parallel_config(engine_config.parallel_config)

    # generate the encryption key before creating the engine to support sharding
    if (
        generate_keyfile
        and (keyfile := tensorizer_config.encryption_keyfile) is not None
    ):
        encryption_params = EncryptionParams.random()
        with open_stream(
            keyfile,
            mode="wb+",
            s3_access_key_id=tensorizer_config.s3_access_key_id,
            s3_secret_access_key=tensorizer_config.s3_secret_access_key,
            s3_endpoint=tensorizer_config.s3_endpoint,
        ) as stream:
            stream.write(encryption_params.key)

    from vllm.v1.engine.llm_engine import LLMEngine

    engine = LLMEngine.from_vllm_config(engine_config)
    engine.collective_rpc(
        "save_tensorized_model",
        kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
    )
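
An end-to-end serialization sketch (the destination URIs are hypothetical). With encryption_keyfile set, a random key is generated and written to the keyfile before the engine is created:

from vllm import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig,
    tensorize_vllm_model,
)

engine_args = EngineArgs(model="facebook/opt-125m")
config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/vllm/opt-125m/model.tensors",
    encryption_keyfile="s3://my-bucket/vllm/opt-125m/model.key",
)
tensorize_vllm_model(engine_args, config)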