[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (#7225)
commit f9a5600649
parent fd95e026e0
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
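Each backend is now exercised twice: once through its Marlin kernel (the default path) and once with the plain kernel forced via --quantization. Outside this diff, the offload configuration these tests compare against can be reproduced with vLLM's offline API; a minimal sketch, assuming the cpu_offload_gb engine argument mirrors the --cpu-offload-gb flag used above:

# Minimal sketch (not part of this commit), assuming `cpu_offload_gb`
# mirrors the `--cpu-offload-gb` flag passed to compare_two_settings.
from vllm import LLM

# Keep roughly 1 GiB of weights on the CPU; they are streamed to the GPU
# on demand during each forward pass.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", cpu_offload_gb=1)
print(llm.generate("Hello, my name is")[0].outputs[0].text)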
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
     arg1_results = results[:n]
     arg2_results = results[n:]
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, \
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
 
 
 def init_test_distributed_environment(
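The {arg1_result=} form in the new message is the f-string debug specifier (Python 3.8+), which prints the expression text alongside its value:

# Illustration of the f-string `=` specifier used in the new assert message.
arg1_result, arg2_result = "foo", "bar"
print(f"{arg1_result=} != {arg2_result=}")
# -> arg1_result='foo' != arg2_result='bar'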
@@ -204,13 +204,7 @@ class GPTQLinearMethod(LinearMethodBase):
 
         layer.exllama_state = exllama_state
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.qweight
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
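process_weights_after_loading is the hook a quantization method exposes so the model loader can run one-time weight transforms after the checkpoint is loaded and before any forward pass; the next hunk accordingly reduces apply() to the bare GEMM. A stripped-down sketch of that protocol (an assumed shape for illustration, not the actual vLLM loader):

# Stripped-down sketch (assumed shape, not actual vLLM code) of the hook
# protocol: the loader calls process_weights_after_loading exactly once
# per module, after weights load and before any forward pass.
import torch

class FakeQuantMethod:
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # One-time, in-place transform on the persistent weights.
        layer.weight.data = layer.weight.data.contiguous()

    def apply(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
        # Pure compute; no weight mutation here.
        return x @ layer.weight.t()

layer = torch.nn.Linear(8, 8, bias=False)
method = FakeQuantMethod()
method.process_weights_after_loading(layer)  # called once by the loader
y = method.apply(layer, torch.randn(2, 8))   # forwards see ready weights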
@@ -222,6 +216,14 @@ class GPTQLinearMethod(LinearMethodBase):
             layer.exllama_state = ExllamaState.READY
             ops.gptq_shuffle(layer.qweight, layer.g_idx,
                              self.quant_config.weight_bits)
 
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
         output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                                layer.scales, layer.g_idx,
                                layer.exllama_state == ExllamaState.READY,
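Why the move fixes offloading (a plausible reading of the bug, since the diff itself only shows the relocation): with --cpu-offload-gb, offloaded weights stay on the CPU and a temporary copy is materialized on the GPU for each forward. The old code shuffled layer.qweight in place during the first apply(), so the mutation landed on a discarded copy while the READY flag persisted, leaving later forwards running unshuffled weights through the exllama kernel. A schematic, self-contained sketch of that failure mode (not vLLM code):

# Schematic sketch (not vLLM code): in-place weight mutation during
# forward is lost when the weight is a per-forward offload copy, while
# the state flag wrongly persists.
import torch

persistent_weight = torch.arange(4.0)  # stays on the "CPU" across forwards
state_ready = False

def forward_with_lazy_shuffle() -> torch.Tensor:
    global state_ready
    w = persistent_weight.clone()      # per-forward copy, as with offloading
    if not state_ready:
        w.neg_()                       # in-place "shuffle" hits the copy only
        state_ready = True             # ...but the flag outlives the copy
    return w

first = forward_with_lazy_shuffle()    # mutated: tensor([-0., -1., -2., -3.])
second = forward_with_lazy_shuffle()   # unmutated, yet the flag says READY
assert not torch.equal(first, second)  # silent divergence between forwards

Doing the shuffle once in process_weights_after_loading(), before any offload copies exist, mutates the persistent weights instead.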
@@ -251,7 +251,6 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
-                device="meta",
             ),
             requires_grad=False,
         )
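The dropped device="meta" relates to the same path: meta tensors carry only shape and dtype, no storage, so machinery that must read or copy weight data between devices cannot handle them. A small illustration (not from this diff):

# Illustration (not from this diff): meta tensors have no backing storage,
# so code that must copy their data between devices fails.
import torch

qzeros = torch.empty(4, 2, dtype=torch.int32, device="meta")
print(qzeros.is_meta)  # True: shape and dtype only, no data
try:
    qzeros.cpu()       # copying out of a meta tensor is impossible
except (RuntimeError, NotImplementedError) as err:
    print("cannot materialize:", err)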