# vllm/vllm/model_executor/models/__init__.py

import importlib
from typing import Dict, List, Optional, Type

import torch.nn as nn

from vllm.logger import init_logger
from vllm.utils import is_hip

logger = init_logger(__name__)

# Built-in generation models.
# Architecture -> (module, class), where `module` is the submodule name under
# `vllm.model_executor.models` that ModelRegistry.load_model_cls imports on
# demand, and `class` is the attribute looked up in that module.
# Several architecture names intentionally share one implementation (for
# example, a number of entries resolve to ("llama", "LlamaForCausalLM")).
_GENERATION_MODELS = {
    "AquilaModel": ("llama", "LlamaForCausalLM"),
    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
    "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
    "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
    "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "LlavaForConditionalGeneration":
    ("llava", "LlavaForConditionalGeneration"),
    "LlavaNextForConditionalGeneration":
    ("llava_next", "LlavaNextForConditionalGeneration"),
    # For decapoda-research/llama-*
    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
    "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
    # transformers's mpt class has lower case
    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
    "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
    "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
    "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
    "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
    "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
}

# Built-in embedding models, in the same Architecture -> (module, class)
# format. ModelRegistry.is_embedding_model tests membership in this table.
_EMBEDDING_MODELS = {
    "MistralModel": ("llama_embedding", "LlamaEmbeddingModel"),
}

# All built-in architectures. The embedding table is unpacked last, so its
# entry would win if an architecture name ever appeared in both tables.
_MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}

# Out-of-tree models: Architecture -> model class.
# Starts empty; ModelRegistry.register_model adds entries at runtime and
# ModelRegistry.load_model_cls consults this table before the built-in ones.
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}

# Models not supported by ROCm.
# ModelRegistry.load_model_cls raises ValueError for these when is_hip() is
# true. Currently empty.
_ROCM_UNSUPPORTED_MODELS: List[str] = []

# Models partially supported by ROCm.
# Architecture -> Reason. The reason text is included in the warning that
# ModelRegistry.load_model_cls logs when is_hip() is true.
_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
    "Qwen2ForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MistralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MixtralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
}


class ModelRegistry:
    """Lookup of model classes by architecture name.

    Two sources are consulted: classes registered at runtime through
    `register_model` (out-of-tree models, stored in `_OOT_MODELS`) and the
    built-in `_MODELS` table, whose modules are imported lazily on first
    lookup. All methods are static; the class is used as a namespace.
    """

    @staticmethod
    def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
        """Return the model class for `model_arch`, or None if unknown.

        Args:
            model_arch: Architecture name to resolve.

        Returns:
            The class registered out-of-tree under this name if there is
            one; otherwise the class named in `_MODELS`, loaded from
            `vllm.model_executor.models.<module>`. None when the
            architecture is in neither table, or when the imported module
            does not define the expected class.

        Raises:
            ValueError: If `is_hip()` is true and the architecture is listed
                in `_ROCM_UNSUPPORTED_MODELS`.
        """
        # Out-of-tree registrations take precedence and bypass the ROCm
        # checks below, which only describe the built-in implementations.
        if model_arch in _OOT_MODELS:
            return _OOT_MODELS[model_arch]
        if model_arch not in _MODELS:
            return None
        if is_hip():
            if model_arch in _ROCM_UNSUPPORTED_MODELS:
                raise ValueError(
                    f"Model architecture {model_arch} is not supported by "
                    "ROCm for now.")
            if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
                logger.warning(
                    "Model architecture %s is partially supported by ROCm: %s",
                    model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

        # Import the implementing module only now, so that looking up one
        # architecture does not import every model module.
        module_name, model_cls_name = _MODELS[model_arch]
        module = importlib.import_module(
            f"vllm.model_executor.models.{module_name}")
        return getattr(module, model_cls_name, None)

    @staticmethod
    def get_supported_archs() -> List[str]:
        """Return every architecture name that `load_model_cls` can resolve.

        Built-in architectures come first, in table order, followed by
        out-of-tree registrations. A name registered out-of-tree that also
        exists in the built-in table is listed once.
        """
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys([*_MODELS, *_OOT_MODELS]))

    @staticmethod
    def register_model(model_arch: str, model_cls: Type[nn.Module]):
        """Register an out-of-tree model class under `model_arch`.

        A warning is logged when the name is already known, either as a
        built-in architecture or from an earlier registration; the new class
        then takes over for subsequent `load_model_cls` calls.

        Args:
            model_arch: Architecture name to register.
            model_cls: Class to return for that architecture.
        """
        if model_arch in _MODELS or model_arch in _OOT_MODELS:
            logger.warning(
                "Model architecture %s is already registered, and will be "
                "overwritten by the new model class %s.", model_arch,
                model_cls.__name__)
        # The dict is mutated in place, so no `global` statement is needed.
        _OOT_MODELS[model_arch] = model_cls

    @staticmethod
    def is_embedding_model(model_arch: str) -> bool:
        """Return True if `model_arch` is listed in `_EMBEDDING_MODELS`.

        Only the built-in embedding table is checked; out-of-tree
        registrations are never reported as embedding models.
        """
        return model_arch in _EMBEDDING_MODELS


# Public names of this package; the lookup tables above are module-private.
__all__ = [
    "ModelRegistry",
]
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`import importlib`
[Core] enable out-of-tree model register (#3871) 2024-04-07 08:11:41 +08:00			`from typing import Dict, List, Optional, Type`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`import torch.nn as nn`

			`from vllm.logger import init_logger`
[Hardware][Neuron] Refactor neuron support (#3471) 2024-03-22 09:22:17 +08:00			`from vllm.utils import is_hip`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`logger = init_logger(__name__)`

			`# Architecture -> (module, class).`
[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734) 2024-05-12 02:30:37 +08:00			`_GENERATION_MODELS = {`
Migrate AquilaForCausalLM to LlamaForCausalLM (#2867) 2024-02-15 04:30:24 +08:00			`"AquilaModel": ("llama", "LlamaForCausalLM"),`
			`"AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b`
			`"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b`
			`"BloomForCausalLM": ("bloom", "BloomForCausalLM"),`
			`"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),`
			`"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),`
Add support for Cohere's Command-R model (#3433) Co-authored-by: José Maria Pombal <jose.pombal@unbabel.com> Co-authored-by: youkaichao <youkaichao@gmail.com> 2024-03-28 05:19:32 +08:00			`"CohereForCausalLM": ("commandr", "CohereForCausalLM"),`
[Model] Add support for DBRX (#3660) 2024-03-28 04:01:46 +08:00			`"DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),`
Added DeciLM-7b and DeciLM-7b-instruct (#2062) 2023-12-19 18:29:33 +08:00			`"DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),`
DeepseekMoE support with Fused MoE kernel (#2453) Co-authored-by: roy <jasonailu87@gmail.com> 2024-01-30 13:19:48 +08:00			`"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"FalconForCausalLM": ("falcon", "FalconForCausalLM"),`
Add Gemma model (#2964) 2024-02-22 01:34:30 +08:00			`"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),`
			`"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),`
			`"GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),`
			`"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),`
Migrate InternLMForCausalLM to LlamaForCausalLM (#2860) Co-authored-by: Roy <jasonailu87@gmail.com> 2024-02-14 09:12:05 +08:00			`"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),`
Add Internlm2 (#2666) 2024-02-02 01:27:40 +08:00			`"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),`
[🚀 Ready to be merged] Added support for Jais models (#3183) 2024-03-21 17:45:24 +08:00			`"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),`
[Feature] Add vision language model support. (#3042) 2024-03-26 05:16:30 +08:00			`"LlavaForConditionalGeneration":`
			`("llava", "LlavaForConditionalGeneration"),`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`"LlavaNextForConditionalGeneration":`
			`("llava_next", "LlavaNextForConditionalGeneration"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`# For decapoda-research/llama-*`
			`"LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),`
Migrate MistralForCausalLM to LlamaForCausalLM (#2868) 2024-02-22 10:25:05 +08:00			`"MistralForCausalLM": ("llama", "LlamaForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),`
Add quantized mixtral support (#2673) 2024-01-31 08:34:10 +08:00			`"QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`# transformers's mpt class has lower case`
			`"MptForCausalLM": ("mpt", "MPTForCausalLM"),`
			`"MPTForCausalLM": ("mpt", "MPTForCausalLM"),`
[Model] add minicpm (#3893) 2024-04-08 18:28:36 +08:00			`"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),`
[Bugfix][Model] Refactor OLMo model to support new HF format in transformers 4.40.0 (#4324) Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2024-04-26 00:35:56 +08:00			`"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"OPTForCausalLM": ("opt", "OPTForCausalLM"),`
Support Orion model (#2539) Co-authored-by: zhangdacheng <zhangdacheng@ainirobot.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2024-02-27 11:17:06 +08:00			`"OrionForCausalLM": ("orion", "OrionForCausalLM"),`
Address Phi modeling update 2 (#2428) 2024-01-13 04:16:49 +08:00			`"PhiForCausalLM": ("phi", "PhiForCausalLM"),`
[Model] Adds Phi-3 support (#4298) 2024-04-25 11:06:57 +08:00			`"Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),`
Add qwen2 (#2495) 2024-01-23 06:34:21 +08:00			`"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),`
[Model] Add support for Qwen2MoeModel (#3346) 2024-03-28 23:19:59 +08:00			`"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"RWForCausalLM": ("falcon", "FalconForCausalLM"),`
Add StableLM3B model (#2372) 2024-01-17 12:32:40 +08:00			`"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),`
Fix stablelm (#3038) 2024-02-27 10:31:10 +08:00			`"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),`
Support starcoder2 architecture (#3089) 2024-02-29 16:51:48 +08:00			`"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),`
[Model] Snowflake arctic model implementation (#4652) Co-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com> Co-authored-by: Aurick Qiao <qiao@aurick.net> Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com> Co-authored-by: Aurick Qiao <aurickq@users.noreply.github.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com> 2024-05-10 06:37:14 +08:00			`"ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),`
[Model] Add support for xverse (#3610) Co-authored-by: willhe <hexin@xverse.cn> Co-authored-by: root <root@localhost.localdomain> 2024-03-28 09:12:54 +08:00			`"XverseForCausalLM": ("xverse", "XverseForCausalLM"),`
[Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3-Small model (#4799) Co-authored-by: beagleski <yunanzhang@microsoft.com> Co-authored-by: bapatra <bapatra@microsoft.com> Co-authored-by: Barun Patra <codedecde@users.noreply.github.com> Co-authored-by: Michael Goin <michael@neuralmagic.com> 2024-05-25 13:00:52 +08:00			`"Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`}`

[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734) 2024-05-12 02:30:37 +08:00			`_EMBEDDING_MODELS = {`
			`"MistralModel": ("llama_embedding", "LlamaEmbeddingModel"),`
			`}`

			`_MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}`

[Core] enable out-of-tree model register (#3871) 2024-04-07 08:11:41 +08:00			`# Architecture -> type.`
			`# out of tree models`
			`_OOT_MODELS: Dict[str, Type[nn.Module]] = {}`

Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`# Models not supported by ROCm.`
[mypy] Enable type checking for test directory (#5017) 2024-06-15 12:45:31 +08:00			`_ROCM_UNSUPPORTED_MODELS: List[str] = []`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`# Models partially supported by ROCm.`
			`# Architecture -> Reason.`
[mypy] Enable type checking for test directory (#5017) 2024-06-15 12:45:31 +08:00			`_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {`
Add qwen2 (#2495) 2024-01-23 06:34:21 +08:00			`"Qwen2ForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"MistralForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Optimize Mixtral with expert parallelism (#2090) 2023-12-14 15:55:07 +08:00			`"MixtralForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`}`


			`class ModelRegistry:`

			`@staticmethod`
			`def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:`
[Core] enable out-of-tree model register (#3871) 2024-04-07 08:11:41 +08:00			`if model_arch in _OOT_MODELS:`
			`return _OOT_MODELS[model_arch]`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`if model_arch not in _MODELS:`
			`return None`
			`if is_hip():`
			`if model_arch in _ROCM_UNSUPPORTED_MODELS:`
			`raise ValueError(`
			`f"Model architecture {model_arch} is not supported by "`
			`"ROCm for now.")`
			`if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:`
			`logger.warning(`
[CI] Disable non-lazy string operation on logging (#4326) Co-authored-by: Danny Guinther <dguinther@neuralmagic.com> 2024-04-26 15:16:58 +08:00			`"Model architecture %s is partially supported by ROCm: %s",`
			`model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`module_name, model_cls_name = _MODELS[model_arch]`
			`module = importlib.import_module(`
			`f"vllm.model_executor.models.{module_name}")`
			`return getattr(module, model_cls_name, None)`

			`@staticmethod`
			`def get_supported_archs() -> List[str]:`
			`return list(_MODELS.keys())`

[Core] enable out-of-tree model register (#3871) 2024-04-07 08:11:41 +08:00			`@staticmethod`
			`def register_model(model_arch: str, model_cls: Type[nn.Module]):`
			`if model_arch in _MODELS:`
			`logger.warning(`
[CI] Disable non-lazy string operation on logging (#4326) Co-authored-by: Danny Guinther <dguinther@neuralmagic.com> 2024-04-26 15:16:58 +08:00			`"Model architecture %s is already registered, and will be "`
			`"overwritten by the new model class %s.", model_arch,`
			`model_cls.__name__)`
[Core] enable out-of-tree model register (#3871) 2024-04-07 08:11:41 +08:00			`global _OOT_MODELS`
			`_OOT_MODELS[model_arch] = model_cls`

[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734) 2024-05-12 02:30:37 +08:00			`@staticmethod`
			`def is_embedding_model(model_arch: str) -> bool:`
			`return model_arch in _EMBEDDING_MODELS`

Change the name to vLLM (#150) 2023-06-17 18:07:40 +08:00
			`__all__ = [`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"ModelRegistry",`
Change the name to vLLM (#150) 2023-06-17 18:07:40 +08:00			`]`