# vllm/vllm/model_executor/models/__init__.py
import functools
import importlib
from typing import Dict, List, Optional, Tuple, Type

import torch.nn as nn

from vllm.logger import init_logger
from vllm.utils import is_hip

logger = init_logger(__name__)
# Architecture -> (module, class).
_GENERATION_MODELS = {
"AquilaModel": ("llama", "LlamaForCausalLM"),
"AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2
"BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b
"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b
"BloomForCausalLM": ("bloom", "BloomForCausalLM"),
"Blip2ForConditionalGeneration":
("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration":
("chameleon", "ChameleonForConditionalGeneration"),
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
"CohereForCausalLM": ("commandr", "CohereForCausalLM"),
"DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
"DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
"DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
"FalconForCausalLM": ("falcon", "FalconForCausalLM"),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
"Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
"GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
"LlavaForConditionalGeneration":
("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration":
("llava_next", "LlavaNextForConditionalGeneration"),
# For decapoda-research/llama-*
"LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
"MistralForCausalLM": ("llama", "LlamaForCausalLM"),
"MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
"QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
    # transformers' MPT class name is lowercased ("Mpt"), so accept both.
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),
"PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
"PaliGemmaForConditionalGeneration": ("paligemma",
"PaliGemmaForConditionalGeneration"),
"PhiForCausalLM": ("phi", "PhiForCausalLM"),
"Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
"RWForCausalLM": ("falcon", "FalconForCausalLM"),
"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
"ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
"XverseForCausalLM": ("xverse", "XverseForCausalLM"),
"Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
"MedusaModel": ("medusa", "Medusa"),
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
"JambaForCausalLM": ("jamba", "JambaForCausalLM")
}
_EMBEDDING_MODELS = {
"MistralModel": ("llama_embedding", "LlamaEmbeddingModel"),
}
_CONDITIONAL_GENERATION_MODELS = {
"BartModel": ("bart", "BartForConditionalGeneration"),
"BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
}
_MODELS = {
**_GENERATION_MODELS,
**_EMBEDDING_MODELS,
**_CONDITIONAL_GENERATION_MODELS
}
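
# Illustrative lookup (a sketch, not executed at import time): resolving an
# architecture name goes through this table first, e.g.
#
#   module_name, cls_name = _MODELS["LlamaForCausalLM"]
#   # -> ("llama", "LlamaForCausalLM"), i.e. the LlamaForCausalLM class in
#   # vllm.model_executor.models.llama, imported lazily by ModelRegistry.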
# Architecture -> type.
# Out-of-tree models, registered at runtime via ModelRegistry.register_model().
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}
# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS: List[str] = []
# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
"Triton flash attention. For half-precision SWA support, "
"please use CK flash attention by setting "
"`VLLM_USE_TRITON_FLASH_ATTN=0`")
_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
"Qwen2ForCausalLM":
_ROCM_SWA_REASON,
"MistralForCausalLM":
_ROCM_SWA_REASON,
"MixtralForCausalLM":
_ROCM_SWA_REASON,
"PaliGemmaForConditionalGeneration":
("ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"),
"Phi3VForCausalLM":
("ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
}
class ModelRegistry:
@staticmethod
@functools.lru_cache(maxsize=128)
def _get_model(model_arch: str):
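        """Lazily import and return the model class for ``model_arch``.

        ``lru_cache`` memoizes the import, so repeated lookups of the same
        architecture skip ``importlib``; returns None when the module does
        not define the expected class name.
        """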
module_name, model_cls_name = _MODELS[model_arch]
module = importlib.import_module(
f"vllm.model_executor.models.{module_name}")
return getattr(module, model_cls_name, None)
@staticmethod
def _try_load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
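        """Resolve ``model_arch`` to a model class, or None if unknown.

        Out-of-tree models registered via ``register_model`` take precedence
        over the built-in ``_MODELS`` table; on ROCm, unsupported
        architectures raise and partially supported ones log a warning.
        """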
if model_arch in _OOT_MODELS:
return _OOT_MODELS[model_arch]
if model_arch not in _MODELS:
return None
if is_hip():
if model_arch in _ROCM_UNSUPPORTED_MODELS:
raise ValueError(
f"Model architecture {model_arch} is not supported by "
"ROCm for now.")
if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
logger.warning(
"Model architecture %s is partially supported by ROCm: %s",
model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
return ModelRegistry._get_model(model_arch)
@staticmethod
def resolve_model_cls(
architectures: List[str]) -> Tuple[Type[nn.Module], str]:
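        """Return the first resolvable (model class, architecture) pair.

        A minimal usage sketch (the architecture name here matches an entry
        in ``_MODELS``; any registered name works the same way)::

            model_cls, arch = ModelRegistry.resolve_model_cls(
                ["LlamaForCausalLM"])
            # arch == "LlamaForCausalLM"; model_cls is the class from
            # vllm.model_executor.models.llama
        """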
for arch in architectures:
model_cls = ModelRegistry._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
@staticmethod
def get_supported_archs() -> List[str]:
return list(_MODELS.keys())
@staticmethod
def register_model(model_arch: str, model_cls: Type[nn.Module]):
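        """Register an out-of-tree model class for ``model_arch``.

        A minimal sketch of the plugin pattern this enables
        (``MyOPTForCausalLM`` is a hypothetical user-defined ``nn.Module``
        subclass, not part of this file)::

            from vllm import ModelRegistry
            ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
        """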
if model_arch in _MODELS:
logger.warning(
"Model architecture %s is already registered, and will be "
"overwritten by the new model class %s.", model_arch,
model_cls.__name__)
global _OOT_MODELS
_OOT_MODELS[model_arch] = model_cls
@staticmethod
def is_embedding_model(model_arch: str) -> bool:
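        """Return True if ``model_arch`` is an embedding-only architecture."""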
return model_arch in _EMBEDDING_MODELS
__all__ = [
"ModelRegistry",
]