vllm.transformers_utils.configs.hunyuan_vl

HunYuanVLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/hunyuan_vl.py
class HunYuanVLConfig(PretrainedConfig):
    model_type = "hunyuan_vl"
    sub_configs = {
        "vision_config": HunYuanVLVisionConfig,
        "text_config": HunYuanVLTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        im_start_id=120118,
        im_end_id=120119,
        image_token_id=120120,
        im_newline_id=120121,
        video_start_id=120122,
        video_end_id=120123,
        **kwargs,
    ):
        # We need to init super() here so that it does not reset values
        # that are in the text config to the BaseClass defaults. The base
        # config has many text-related defaults, and not all of them are
        # the same as for `HunYuanVLTextConfig`.
        super().__init__(**kwargs)

        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)

        self.image_token_id = image_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.im_newline_id = im_newline_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id

        self.vision_config.text_hidden_size = self.text_config.hidden_size

        # Attention implementation to use. It is applied recursively to
        # sub-configs, so we set it again here at the end.
        self._attn_implementation = kwargs.pop("attn_implementation", None)

    def __setattr__(self, key, value):
        if (
            (text_config := super().__getattribute__("__dict__").get("text_config"))
            is not None
            and key not in ["dtype", "_attn_implementation_internal"]
            and key in text_config.__dict__
        ):
            setattr(text_config, key, value)
        else:
            super().__setattr__(key, value)

    def __getattribute__(self, key):
        if "text_config" in super().__getattribute__("__dict__") and key not in [
            "_name_or_path",
            "model_type",
            "dtype",
            "_attn_implementation_internal",
        ]:
            text_config = super().__getattribute__("text_config")
            if key in text_config.__dict__:
                return getattr(text_config, key)

        return super().__getattribute__(key)
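
A minimal usage sketch (assuming vLLM is installed and importable): dict arguments are promoted to their config classes via `sub_configs`, omitted sub-configs fall back to their defaults, and the constructor wires `vision_config.text_hidden_size` to the text tower's `hidden_size`.

from vllm.transformers_utils.configs.hunyuan_vl import HunYuanVLConfig

# Sub-config dicts are converted through `sub_configs` into config objects.
config = HunYuanVLConfig(
    text_config={"hidden_size": 4096, "num_hidden_layers": 32},
    vision_config={"hidden_size": 1152, "patch_size": 16},
)

# The constructor links the towers: the vision config records the text width.
assert config.vision_config.text_hidden_size == config.text_config.hidden_size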

_attn_implementation instance-attribute

_attn_implementation = pop('attn_implementation', None)

im_end_id instance-attribute

im_end_id = im_end_id

im_newline_id instance-attribute

im_newline_id = im_newline_id

im_start_id instance-attribute

im_start_id = im_start_id

image_token_id instance-attribute

image_token_id = image_token_id

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

model_type class-attribute instance-attribute

model_type = 'hunyuan_vl'

sub_configs class-attribute instance-attribute

sub_configs = {
    "vision_config": HunYuanVLVisionConfig,
    "text_config": HunYuanVLTextConfig,
}

text_config instance-attribute

text_config = sub_configs['text_config'](**text_config)

video_end_id instance-attribute

video_end_id = video_end_id

video_start_id instance-attribute

video_start_id = video_start_id

vision_config instance-attribute

vision_config = sub_configs["vision_config"](
    **vision_config
)

__getattribute__

__getattribute__(key)
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def __getattribute__(self, key):
    if "text_config" in super().__getattribute__("__dict__") and key not in [
        "_name_or_path",
        "model_type",
        "dtype",
        "_attn_implementation_internal",
    ]:
        text_config = super().__getattribute__("text_config")
        if key in text_config.__dict__:
            return getattr(text_config, key)

    return super().__getattribute__(key)

__init__

__init__(
    text_config=None,
    vision_config=None,
    im_start_id=120118,
    im_end_id=120119,
    image_token_id=120120,
    im_newline_id=120121,
    video_start_id=120122,
    video_end_id=120123,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def __init__(
    self,
    text_config=None,
    vision_config=None,
    im_start_id=120118,
    im_end_id=120119,
    image_token_id=120120,
    im_newline_id=120121,
    video_start_id=120122,
    video_end_id=120123,
    **kwargs,
):
    # We need to init super() here so that it does not reset values
    # that are in the text config to the BaseClass defaults. The base
    # config has many text-related defaults, and not all of them are
    # the same as for `HunYuanVLTextConfig`.
    super().__init__(**kwargs)

    if isinstance(vision_config, dict):
        self.vision_config = self.sub_configs["vision_config"](**vision_config)
    elif vision_config is None:
        self.vision_config = self.sub_configs["vision_config"]()

    if isinstance(text_config, dict):
        self.text_config = self.sub_configs["text_config"](**text_config)
    elif text_config is None:
        # For BC use all kwargs to init `TextConfig`
        self.text_config = self.sub_configs["text_config"](**kwargs)

    self.image_token_id = image_token_id
    self.im_start_id = im_start_id
    self.im_end_id = im_end_id
    self.im_newline_id = im_newline_id
    self.video_start_id = video_start_id
    self.video_end_id = video_end_id

    self.vision_config.text_hidden_size = self.text_config.hidden_size

    # Attention implementation to use. It is applied recursively to
    # sub-configs, so we set it again here at the end.
    self._attn_implementation = kwargs.pop("attn_implementation", None)

__setattr__

__setattr__(key, value)
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def __setattr__(self, key, value):
    if (
        (text_config := super().__getattribute__("__dict__").get("text_config"))
        is not None
        and key not in ["dtype", "_attn_implementation_internal"]
        and key in text_config.__dict__
    ):
        setattr(text_config, key, value)
    else:
        super().__setattr__(key, value)
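
Together, `__getattribute__` and `__setattr__` make the composite config transparently delegate shared attributes to `text_config`: once `text_config` exists, reads and writes of any key that lives in its `__dict__` (other than the excluded keys) are forwarded to it rather than stored on the top-level object. A short illustration (assuming vLLM is installed):

from vllm.transformers_utils.configs.hunyuan_vl import HunYuanVLConfig

config = HunYuanVLConfig()

# Reads of text-level keys are forwarded to the text sub-config.
assert config.hidden_size == config.text_config.hidden_size

# Writes are forwarded too, so the two views never diverge.
config.hidden_size = 2048
assert config.text_config.hidden_size == 2048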

HunYuanVLTextConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [HunYuanVLTextConfig]. It is used to instantiate a HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of HunYuan-7B, e.g. [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int`, *optional* | Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`HunYuanVLTextConfig`]. | `290943` |
| `hidden_size` | `int`, *optional* | Dimension of the hidden representations. | `4096` |
| `intermediate_size` | `int`, *optional* | Dimension of the MLP representations or shared MLP representations. | `11008` |
| `num_hidden_layers` | `int`, *optional* | Number of hidden layers in the Transformer decoder. | `32` |
| `num_attention_heads` | `int`, *optional* | Number of attention heads for each attention layer in the Transformer decoder. | `32` |
| `num_key_value_heads` | `int`, *optional* | The number of key/value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be constructed by mean-pooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. | `None` |
| `hidden_act` | `str` or `function`, *optional* | The non-linear activation function (function or string) in the decoder. | `'silu'` |
| `max_position_embeddings` | `int`, *optional* | The maximum sequence length that this model might ever be used with. | `2048` |
| `initializer_range` | `float`, *optional* | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. | `0.02` |
| `rms_norm_eps` | `float`, *optional* | The epsilon used by the RMS normalization layers. | `1e-05` |
| `use_cache` | `bool`, *optional* | Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. | `True` |
| `pad_token_id` | `int`, *optional* | Padding token id. | `0` |
| `bos_token_id` | `int`, *optional* | Beginning of stream token id. | `1` |
| `eos_token_id` | `int`, *optional* | End of stream token id. | `2` |
| `eod_token_id` | `int`, *optional* | Token id of the end-of-document marker, used to indicate the termination of a text sequence. For example, in multi-document processing this token helps the model distinguish between separate documents. | `3` |
| `pretraining_tp` | `int`, *optional* | Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). | `1` |
| `tie_word_embeddings` | `bool`, *optional* | Whether to tie the input and output word embeddings. | `False` |
| `rope_theta` | `float`, *optional* | The base period of the RoPE embeddings. | `10000.0` |
| `rope_scaling` | `Dict`, *optional* | Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update `max_position_embeddings` to the expected new maximum. See [this thread](https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/) for more information on how these scaling strategies behave. This is an experimental feature, subject to breaking API changes in future versions. | `None` |
| `attention_bias` | `bool`, *optional* | Whether to use a bias in the query, key, value and output projection layers during self-attention. | `False` |
| `attention_dropout` | `float`, *optional* | The dropout ratio for the attention probabilities. | `0.0` |
| `head_dim` | `int`, *optional* | The attention head dimension. | `None` |
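
As a sketch (assuming vLLM is installed), grouped-query attention is selected simply by setting `num_key_value_heads` below `num_attention_heads`; leaving it unset falls back to MHA for backward compatibility.

from vllm.transformers_utils.configs.hunyuan_vl import HunYuanVLTextConfig

# GQA: 32 query heads share 8 key/value heads (4 query heads per KV group).
gqa = HunYuanVLTextConfig(num_attention_heads=32, num_key_value_heads=8)
assert gqa.num_attention_heads // gqa.num_key_value_heads == 4

# Unset num_key_value_heads defaults to num_attention_heads, i.e. MHA.
mha = HunYuanVLTextConfig(num_attention_heads=32)
assert mha.num_key_value_heads == 32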
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
class HunYuanVLTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate a
    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of HunYuan-7B, e.g.
    [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 290943):
            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`HunYuanVLTextConfig`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations or shared MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key/value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be
            constructed by mean-pooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        eod_token_id (int, *optional*, defaults to 3):
            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
            Example: In multi-document processing, this token helps the model distinguish between separate documents.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the input and output word embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to `None`):
            The attention head dimension.
    """  # noqa: E501

    model_type = "hunyuan_vl_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=290943,
        hidden_size=4096,
        intermediate_size: int = 11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        eod_token_id=3,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # self._rope_scaling_validation()   # TODO: Need validation?
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and "
                f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
                f"got {rope_scaling_type}"
            )
        if rope_scaling_factor is None and rope_scaling_alpha is None:
            raise ValueError(
                "`rope_scaling` must have one of the `factor` or `alpha` fields, "
                "got neither"
            )
        if rope_scaling_factor is not None and (
            not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1.0, "
                f"got {rope_scaling_factor}"
            )
        if rope_scaling_alpha is not None and (
            not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s alpha field must be a float > 1.0, "
                f"got {rope_scaling_alpha}"
            )

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

head_dim instance-attribute

head_dim = head_dim

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'hunyuan_vl_text'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

pretraining_tp instance-attribute

pretraining_tp = pretraining_tp

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=290943,
    hidden_size=4096,
    intermediate_size: int = 11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-05,
    use_cache=True,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    eod_token_id=3,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def __init__(
    self,
    vocab_size=290943,
    hidden_size=4096,
    intermediate_size: int = 11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-5,
    use_cache=True,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    eod_token_id=3,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.head_dim = head_dim
    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    # self._rope_scaling_validation()   # TODO: Need validation?
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields, `type` and "
            f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
        )
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    rope_scaling_alpha = self.rope_scaling.get("alpha", None)
    if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
        raise ValueError(
            "`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
            f"got {rope_scaling_type}"
        )
    if rope_scaling_factor is None and rope_scaling_alpha is None:
        raise ValueError(
            "`rope_scaling` must have one of the `factor` or `alpha` fields, "
            "got neither"
        )
    if rope_scaling_factor is not None and (
        not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
    ):
        raise ValueError(
            "`rope_scaling`'s factor field must be a float > 1.0, "
            f"got {rope_scaling_factor}"
        )
    if rope_scaling_alpha is not None and (
        not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
    ):
        raise ValueError(
            "`rope_scaling`'s alpha field must be a float > 1.0, "
            f"got {rope_scaling_alpha}"
        )
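
In short, `rope_scaling` must be a two-key dict whose `type` is `"linear"` or `"dynamic"`, paired with a float `factor` or `alpha` strictly greater than 1.0. Note that the constructor currently leaves the validation call commented out, so it has to be invoked explicitly, as in this sketch (assuming vLLM is installed):

from vllm.transformers_utils.configs.hunyuan_vl import HunYuanVLTextConfig

# Valid: a recognized type plus an alpha > 1.0.
ok = HunYuanVLTextConfig(rope_scaling={"type": "dynamic", "alpha": 2.0})
ok._rope_scaling_validation()  # passes silently

# Invalid: the factor must be strictly greater than 1.0.
bad = HunYuanVLTextConfig(rope_scaling={"type": "linear", "factor": 1.0})
try:
    bad._rope_scaling_validation()
except ValueError as err:
    print(err)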

HunYuanVLVisionConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/hunyuan_vl.py
class HunYuanVLVisionConfig(PretrainedConfig):
    model_type = "hunyuan_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_act="gelu",
        hidden_size=1152,
        intermediate_size=4304,
        interpolate_mode="bilinear",
        rms_norm_eps=1e-05,
        learnable_mlp_pooling_size=0,
        num_attention_heads=16,
        num_key_value_heads=None,
        num_channels=3,
        num_hidden_layers=27,
        out_hidden_size=4096,
        patch_size=16,
        remove_prenorm=True,
        spatial_merge_size=2,
        temporal_patch_size=1,
        resize_resolution=2048,
        img_max_token_num=4096,
        max_image_size=2048,
        video_max_image_size=768,
        video_min_image_size=256,
        min_image_size=512,
        anyres_vit_max_image_size=2048,
        max_vit_seq_len=16384,
        text_hidden_size=3072,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.interpolate_mode = interpolate_mode
        self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
        self.num_attention_heads = num_attention_heads
        if not num_key_value_heads:
            self.num_key_value_heads = num_attention_heads
        else:
            self.num_key_value_heads = num_key_value_heads
        self.num_channels = num_channels
        self.num_hidden_layers = num_hidden_layers
        self.out_hidden_size = out_hidden_size
        self.patch_size = patch_size
        self.remove_prenorm = remove_prenorm
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.rms_norm_eps = rms_norm_eps

        self.resize_resolution = resize_resolution
        self.img_max_token_num = img_max_token_num
        self.max_image_size = max_image_size
        self.min_image_size = min_image_size
        self.video_max_image_size = video_max_image_size
        self.video_min_image_size = video_min_image_size
        self.anyres_vit_max_image_size = anyres_vit_max_image_size
        self.max_vit_seq_len = max_vit_seq_len
        self.text_hidden_size = text_hidden_size
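
As a worked example of how the patch fields combine, assuming the conventional ViT interpretation (each `patch_size` x `patch_size` patch yields one feature, and each `spatial_merge_size` x `spatial_merge_size` block of features is merged into one token); this interpretation is an assumption about usage, not something the config class itself enforces:

from vllm.transformers_utils.configs.hunyuan_vl import HunYuanVLVisionConfig

cfg = HunYuanVLVisionConfig()  # patch_size=16, spatial_merge_size=2

def image_tokens(height: int, width: int, cfg: HunYuanVLVisionConfig) -> int:
    # Hypothetical helper: patches per image, then one token per merged block.
    patches = (height // cfg.patch_size) * (width // cfg.patch_size)
    return patches // (cfg.spatial_merge_size**2)

# A 1024x1024 image: 64 * 64 = 4096 patches -> 1024 tokens after 2x2 merging,
# comfortably under the default img_max_token_num=4096 budget.
print(image_tokens(1024, 1024, cfg))  # 1024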

anyres_vit_max_image_size instance-attribute

anyres_vit_max_image_size = anyres_vit_max_image_size

base_config_key class-attribute instance-attribute

base_config_key = 'vision_config'

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

img_max_token_num instance-attribute

img_max_token_num = img_max_token_num

intermediate_size instance-attribute

intermediate_size = intermediate_size

interpolate_mode instance-attribute

interpolate_mode = interpolate_mode

learnable_mlp_pooling_size instance-attribute

learnable_mlp_pooling_size = learnable_mlp_pooling_size

max_image_size instance-attribute

max_image_size = max_image_size

max_vit_seq_len instance-attribute

max_vit_seq_len = max_vit_seq_len

min_image_size instance-attribute

min_image_size = min_image_size

model_type class-attribute instance-attribute

model_type = 'hunyuan_vl'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_channels instance-attribute

num_channels = num_channels

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_attention_heads

out_hidden_size instance-attribute

out_hidden_size = out_hidden_size

patch_size instance-attribute

patch_size = patch_size

remove_prenorm instance-attribute

remove_prenorm = remove_prenorm

resize_resolution instance-attribute

resize_resolution = resize_resolution

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

spatial_merge_size instance-attribute

spatial_merge_size = spatial_merge_size

temporal_patch_size instance-attribute

temporal_patch_size = temporal_patch_size

text_hidden_size instance-attribute

text_hidden_size = text_hidden_size

video_max_image_size instance-attribute

video_max_image_size = video_max_image_size

video_min_image_size instance-attribute

video_min_image_size = video_min_image_size

__init__

__init__(
    hidden_act="gelu",
    hidden_size=1152,
    intermediate_size=4304,
    interpolate_mode="bilinear",
    rms_norm_eps=1e-05,
    learnable_mlp_pooling_size=0,
    num_attention_heads=16,
    num_key_value_heads=None,
    num_channels=3,
    num_hidden_layers=27,
    out_hidden_size=4096,
    patch_size=16,
    remove_prenorm=True,
    spatial_merge_size=2,
    temporal_patch_size=1,
    resize_resolution=2048,
    img_max_token_num=4096,
    max_image_size=2048,
    video_max_image_size=768,
    video_min_image_size=256,
    min_image_size=512,
    anyres_vit_max_image_size=2048,
    max_vit_seq_len=16384,
    text_hidden_size=3072,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/hunyuan_vl.py
def __init__(
    self,
    hidden_act="gelu",
    hidden_size=1152,
    intermediate_size=4304,
    interpolate_mode="bilinear",
    rms_norm_eps=1e-05,
    learnable_mlp_pooling_size=0,
    num_attention_heads=16,
    num_key_value_heads=None,
    num_channels=3,
    num_hidden_layers=27,
    out_hidden_size=4096,
    patch_size=16,
    remove_prenorm=True,
    spatial_merge_size=2,
    temporal_patch_size=1,
    resize_resolution=2048,
    img_max_token_num=4096,
    max_image_size=2048,
    video_max_image_size=768,
    video_min_image_size=256,
    min_image_size=512,
    anyres_vit_max_image_size=2048,
    max_vit_seq_len=16384,
    text_hidden_size=3072,
    **kwargs,
):
    super().__init__(**kwargs)

    self.hidden_act = hidden_act
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.interpolate_mode = interpolate_mode
    self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
    self.num_attention_heads = num_attention_heads
    if not num_key_value_heads:
        self.num_key_value_heads = num_attention_heads
    else:
        self.num_key_value_heads = num_key_value_heads
    self.num_channels = num_channels
    self.num_hidden_layers = num_hidden_layers
    self.out_hidden_size = out_hidden_size
    self.patch_size = patch_size
    self.remove_prenorm = remove_prenorm
    self.spatial_merge_size = spatial_merge_size
    self.temporal_patch_size = temporal_patch_size
    self.rms_norm_eps = rms_norm_eps

    self.resize_resolution = resize_resolution
    self.img_max_token_num = img_max_token_num
    self.max_image_size = max_image_size
    self.min_image_size = min_image_size
    self.video_max_image_size = video_max_image_size
    self.video_min_image_size = video_min_image_size
    self.anyres_vit_max_image_size = anyres_vit_max_image_size
    self.max_vit_seq_len = max_vit_seq_len
    self.text_hidden_size = text_hidden_size