support bitsandbytes quantization with qwen model (#10549)

Signed-off-by: Ubuntu <zixuanzhang@bytedance.com>
2024-11-22 16:16:14 -08:00 · 2024-11-22 16:16:14 -08:00 · 948c859571
commit 948c859571
parent 97814fbf0f
1 changed files with 12 additions and 0 deletions
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1028,6 +1028,18 @@ class QWenLLM(QWenBaseModel):
    embedding_modules = {}
    embedding_padding_modules = []

+    default_bitsandbytes_target_modules = [
+        ".c_attn.",
+        ".c_proj.",
+        ".w1.",
+        ".w2.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name: (weight_name, index)
+        "w2": ("gate_up_proj", 0),
+        "w1": ("gate_up_proj", 1),
+    }
+

 class QWenVL(QWenBaseModel, SupportsMultiModal):
    packed_modules_mapping = {