[Fix] Fix quantization="gptq" when using Marlin (#3319)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
commit b167109ba1
parent 602358f8a8
@@ -168,13 +168,18 @@ class ModelConfig:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config", None)
         if hf_quant_config is not None:
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()

             # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
                     and "is_marlin_format" in hf_quant_config
                     and hf_quant_config["is_marlin_format"]):
+                logger.info("The model is serialized in Marlin format. "
+                            "Using Marlin kernel.")
                 hf_quant_method = "marlin"
+                if self.quantization == "gptq":
+                    self.quantization = hf_quant_method
+
             if self.quantization is None:
                 self.quantization = hf_quant_method
             elif self.quantization != hf_quant_method:
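For context, below is a self-contained sketch of the resolution logic after this commit. The function name resolve_quantization, the plain-dict config, and the error message are illustrative stand-ins rather than vLLM's actual API; only the branch structure mirrors the diff above. Previously, passing quantization="gptq" for a GPTQ checkpoint serialized in Marlin format fell through to the mismatch check ("gptq" != "marlin") and raised; the added branch upgrades the user's request to "marlin" first.

from typing import Optional


def resolve_quantization(requested: Optional[str],
                         hf_quant_config: Optional[dict]) -> Optional[str]:
    """Sketch of the fixed logic; not the real vLLM ModelConfig API."""
    if hf_quant_config is None:
        return requested
    hf_quant_method = str(hf_quant_config["quant_method"]).lower()
    # If the GPTQ checkpoint is serialized in Marlin format, use marlin.
    if (hf_quant_method == "gptq"
            and hf_quant_config.get("is_marlin_format")):
        hf_quant_method = "marlin"
        # The fix: an explicit "gptq" request now follows the checkpoint
        # format instead of tripping the mismatch check below.
        if requested == "gptq":
            requested = hf_quant_method
    if requested is None:
        requested = hf_quant_method
    elif requested != hf_quant_method:
        # Paraphrased error; the real message lives in vLLM's config.py.
        raise ValueError(
            f"quantization={requested!r} does not match the checkpoint's "
            f"quantization method {hf_quant_method!r}")
    return requested


marlin_cfg = {"quant_method": "gptq", "is_marlin_format": True}
assert resolve_quantization("gptq", marlin_cfg) == "marlin"  # fixed by this commit
assert resolve_quantization(None, marlin_cfg) == "marlin"    # auto-detect, unchanged
assert resolve_quantization("gptq", {"quant_method": "gptq"}) == "gptq"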