# vllm/vllm/model_executor/models/__init__.py

import importlib
from typing import List, Optional, Type

import torch.nn as nn

from vllm.logger import init_logger
from vllm.utils import is_hip, is_neuron

logger = init_logger(__name__)

# Registry of the model architectures this package can load.
# Architecture -> (module, class). `ModelRegistry.load_model_cls` imports
# `vllm.model_executor.models.<module>` on demand and looks up `<class>` on it.
# Several architecture names map to the same implementation (for example, the
# Aquila, InternLM and Mistral entries all resolve to `LlamaForCausalLM`).
_MODELS = {
    "AquilaModel": ("llama", "LlamaForCausalLM"),
    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
    "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    # For decapoda-research/llama-*
    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
    "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
    # The transformers MPT class name uses lower case ("Mpt"), so both
    # spellings are registered.
    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
    "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
}

# Models not supported by ROCm.
# `ModelRegistry.load_model_cls` raises ValueError for any architecture listed
# here when `is_hip()` is true. Currently empty.
_ROCM_UNSUPPORTED_MODELS = []

# Models partially supported by ROCm.
# Architecture -> Reason.
# When `is_hip()` is true, `ModelRegistry.load_model_cls` logs a warning that
# ends with the reason string and then loads the model as usual.
_ROCM_PARTIALLY_SUPPORTED_MODELS = {
    "Qwen2ForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MistralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MixtralForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
}

# Models supported by Neuron.
# Architecture -> module, relative to `vllm.model_executor.models`.
# When `is_neuron()` is true, `ModelRegistry.load_model_cls` imports this
# module instead of the one named in `_MODELS`; the class name still comes
# from `_MODELS`. Architectures not listed here are rejected on Neuron.
_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"}


class ModelRegistry:
    """Maps architecture names to model classes, importing them on demand."""

    @staticmethod
    def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
        """Import and return the model class registered for ``model_arch``.

        Args:
            model_arch: Architecture name, looked up as a key of ``_MODELS``.

        Returns:
            The class found on the imported module, or ``None`` when the
            architecture is not registered or the module has no attribute
            with the registered class name.

        Raises:
            ValueError: If the platform is ROCm and the architecture is in
                ``_ROCM_UNSUPPORTED_MODELS``, or the platform is Neuron (and
                not ROCm) and the architecture is missing from
                ``_NEURON_SUPPORTED_MODELS``.
        """
        registered = _MODELS.get(model_arch)
        if registered is None:
            return None

        if is_hip():
            if model_arch in _ROCM_UNSUPPORTED_MODELS:
                raise ValueError(
                    f"Model architecture {model_arch} is not supported by "
                    "ROCm for now.")
            # Partial support is not fatal: warn with the reason and go on.
            rocm_caveat = _ROCM_PARTIALLY_SUPPORTED_MODELS.get(model_arch)
            if rocm_caveat is not None:
                logger.warning(
                    f"Model architecture {model_arch} is partially supported "
                    "by ROCm: " + rocm_caveat)
        elif is_neuron() and model_arch not in _NEURON_SUPPORTED_MODELS:
            raise ValueError(
                f"Model architecture {model_arch} is not supported by "
                "Neuron for now.")

        module_name, model_cls_name = registered
        if is_neuron():
            # Neuron uses its own module; the class name is unchanged.
            module_name = _NEURON_SUPPORTED_MODELS[model_arch]

        qualified_module = f"vllm.model_executor.models.{module_name}"
        loaded_module = importlib.import_module(qualified_module)
        return getattr(loaded_module, model_cls_name, None)

    @staticmethod
    def get_supported_archs() -> List[str]:
        """Return every registered architecture name, in ``_MODELS`` order."""
        return list(_MODELS)


# Names exported by `from vllm.model_executor.models import *`; only the
# registry class is listed.
__all__ = [
    "ModelRegistry",
]
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`import importlib`
			`from typing import List, Optional, Type`

			`import torch.nn as nn`

			`from vllm.logger import init_logger`
[Neuron] Support inference with transformers-neuronx (#2569) 2024-02-29 01:34:34 +08:00			`from vllm.utils import is_hip, is_neuron`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`logger = init_logger(__name__)`

			`# Architecture -> (module, class).`
			`_MODELS = {`
Migrate AquilaForCausalLM to LlamaForCausalLM (#2867) 2024-02-15 04:30:24 +08:00			`"AquilaModel": ("llama", "LlamaForCausalLM"),`
			`"AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b`
			`"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b`
			`"BloomForCausalLM": ("bloom", "BloomForCausalLM"),`
			`"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),`
			`"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),`
Added DeciLM-7b and DeciLM-7b-instruct (#2062) 2023-12-19 18:29:33 +08:00			`"DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),`
DeepseekMoE support with Fused MoE kernel (#2453) Co-authored-by: roy <jasonailu87@gmail.com> 2024-01-30 13:19:48 +08:00			`"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"FalconForCausalLM": ("falcon", "FalconForCausalLM"),`
Add Gemma model (#2964) 2024-02-22 01:34:30 +08:00			`"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),`
			`"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),`
			`"GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),`
			`"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),`
Migrate InternLMForCausalLM to LlamaForCausalLM (#2860) Co-authored-by: Roy <jasonailu87@gmail.com> 2024-02-14 09:12:05 +08:00			`"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),`
Add Internlm2 (#2666) 2024-02-02 01:27:40 +08:00			`"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),`
			`# For decapoda-research/llama-*`
			`"LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),`
Migrate MistralForCausalLM to LlamaForCausalLM (#2868) 2024-02-22 10:25:05 +08:00			`"MistralForCausalLM": ("llama", "LlamaForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),`
Add quantized mixtral support (#2673) 2024-01-31 08:34:10 +08:00			`"QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`# transformers's mpt class has lower case`
			`"MptForCausalLM": ("mpt", "MPTForCausalLM"),`
			`"MPTForCausalLM": ("mpt", "MPTForCausalLM"),`
Support OLMo models. (#2832) 2024-02-19 13:05:15 +08:00			`"OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"OPTForCausalLM": ("opt", "OPTForCausalLM"),`
Support Orion model (#2539) Co-authored-by: zhangdacheng <zhangdacheng@ainirobot.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2024-02-27 11:17:06 +08:00			`"OrionForCausalLM": ("orion", "OrionForCausalLM"),`
Address Phi modeling update 2 (#2428) 2024-01-13 04:16:49 +08:00			`"PhiForCausalLM": ("phi", "PhiForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),`
Add qwen2 (#2495) 2024-01-23 06:34:21 +08:00			`"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"RWForCausalLM": ("falcon", "FalconForCausalLM"),`
Add StableLM3B model (#2372) 2024-01-17 12:32:40 +08:00			`"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),`
Fix stablelm (#3038) 2024-02-27 10:31:10 +08:00			`"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`}`

			`# Models not supported by ROCm.`
Optimize Mixtral with expert parallelism (#2090) 2023-12-14 15:55:07 +08:00			`_ROCM_UNSUPPORTED_MODELS = []`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`# Models partially supported by ROCm.`
			`# Architecture -> Reason.`
			`_ROCM_PARTIALLY_SUPPORTED_MODELS = {`
Add qwen2 (#2495) 2024-01-23 06:34:21 +08:00			`"Qwen2ForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"MistralForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Optimize Mixtral with expert parallelism (#2090) 2023-12-14 15:55:07 +08:00			`"MixtralForCausalLM":`
			`"Sliding window attention is not yet supported in ROCm's flash attention",`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`}`

[Neuron] Support inference with transformers-neuronx (#2569) 2024-02-29 01:34:34 +08:00			`# Models not supported by Neuron.`
			`_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"}`

Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`class ModelRegistry:`

			`@staticmethod`
			`def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:`
			`if model_arch not in _MODELS:`
			`return None`
			`if is_hip():`
			`if model_arch in _ROCM_UNSUPPORTED_MODELS:`
			`raise ValueError(`
			`f"Model architecture {model_arch} is not supported by "`
			`"ROCm for now.")`
			`if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:`
			`logger.warning(`
			`f"Model architecture {model_arch} is partially supported "`
			`"by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])`
[Neuron] Support inference with transformers-neuronx (#2569) 2024-02-29 01:34:34 +08:00			`elif is_neuron():`
			`if model_arch not in _NEURON_SUPPORTED_MODELS:`
			`raise ValueError(`
			`f"Model architecture {model_arch} is not supported by "`
			`"Neuron for now.")`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00
			`module_name, model_cls_name = _MODELS[model_arch]`
[Neuron] Support inference with transformers-neuronx (#2569) 2024-02-29 01:34:34 +08:00			`if is_neuron():`
			`module_name = _NEURON_SUPPORTED_MODELS[model_arch]`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`module = importlib.import_module(`
			`f"vllm.model_executor.models.{module_name}")`
			`return getattr(module, model_cls_name, None)`

			`@staticmethod`
			`def get_supported_archs() -> List[str]:`
			`return list(_MODELS.keys())`

Change the name to vLLM (#150) 2023-06-17 18:07:40 +08:00
			`__all__ = [`
Implement lazy model loader (#2044) 2023-12-13 14:21:45 +08:00			`"ModelRegistry",`
Change the name to vLLM (#150) 2023-06-17 18:07:40 +08:00			`]`