[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (#7225)
commit f9a5600649
parent fd95e026e0
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
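Each backend is now exercised twice: once through its Marlin kernel (the default path) and once with the plain kernel forced via --quantization. Outside this diff, the offload configuration these tests compare against can be reproduced with vLLM's offline API; a minimal sketch, assuming the cpu_offload_gb engine argument mirrors the --cpu-offload-gb flag used above:

# Minimal sketch (not part of this commit), assuming `cpu_offload_gb`
# mirrors the `--cpu-offload-gb` flag passed to compare_two_settings.
from vllm import LLM

# Keep roughly 1 GiB of weights on the CPU; they are streamed to the GPU
# on demand during each forward pass.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", cpu_offload_gb=1)
print(llm.generate("Hello, my name is")[0].outputs[0].text)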
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
     arg1_results = results[:n]
     arg2_results = results[n:]
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, \
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
 
 
 def init_test_distributed_environment(
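The {arg1_result=} form in the new message is the f-string debug specifier (Python 3.8+), which prints the expression text alongside its value:

# Illustration of the f-string `=` specifier used in the new assert message.
arg1_result, arg2_result = "foo", "bar"
print(f"{arg1_result=} != {arg2_result=}")
# -> arg1_result='foo' != arg2_result='bar'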
@@ -204,13 +204,7 @@ class GPTQLinearMethod(LinearMethodBase):
 
         layer.exllama_state = exllama_state
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.qweight
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
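process_weights_after_loading is the hook a quantization method exposes so the model loader can run one-time weight transforms after the checkpoint is loaded and before any forward pass; the next hunk accordingly reduces apply() to the bare GEMM. A stripped-down sketch of that protocol (an assumed shape for illustration, not the actual vLLM loader):

# Stripped-down sketch (assumed shape, not actual vLLM code) of the hook
# protocol: the loader calls process_weights_after_loading exactly once
# per module, after weights load and before any forward pass.
import torch

class FakeQuantMethod:
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # One-time, in-place transform on the persistent weights.
        layer.weight.data = layer.weight.data.contiguous()

    def apply(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
        # Pure compute; no weight mutation here.
        return x @ layer.weight.t()

layer = torch.nn.Linear(8, 8, bias=False)
method = FakeQuantMethod()
method.process_weights_after_loading(layer)  # called once by the loader
y = method.apply(layer, torch.randn(2, 8))   # forwards see ready weights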
@@ -222,6 +216,14 @@ class GPTQLinearMethod(LinearMethodBase):
             layer.exllama_state = ExllamaState.READY
             ops.gptq_shuffle(layer.qweight, layer.g_idx,
                              self.quant_config.weight_bits)
 
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
         output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                                layer.scales, layer.g_idx,
                                layer.exllama_state == ExllamaState.READY,
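Why the move fixes offloading (a plausible reading of the bug, since the diff itself only shows the relocation): with --cpu-offload-gb, offloaded weights stay on the CPU and a temporary copy is materialized on the GPU for each forward. The old code shuffled layer.qweight in place during the first apply(), so the mutation landed on a discarded copy while the READY flag persisted, leaving later forwards running unshuffled weights through the exllama kernel. A schematic, self-contained sketch of that failure mode (not vLLM code):

# Schematic sketch (not vLLM code): in-place weight mutation during
# forward is lost when the weight is a per-forward offload copy, while
# the state flag wrongly persists.
import torch

persistent_weight = torch.arange(4.0)  # stays on the "CPU" across forwards
state_ready = False

def forward_with_lazy_shuffle() -> torch.Tensor:
    global state_ready
    w = persistent_weight.clone()      # per-forward copy, as with offloading
    if not state_ready:
        w.neg_()                       # in-place "shuffle" hits the copy only
        state_ready = True             # ...but the flag outlives the copy
    return w

first = forward_with_lazy_shuffle()    # mutated: tensor([-0., -1., -2., -3.])
second = forward_with_lazy_shuffle()   # unmutated, yet the flag says READY
assert not torch.equal(first, second)  # silent divergence between forwards

Doing the shuffle once in process_weights_after_loading(), before any offload copies exist, mutates the persistent weights instead.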
@@ -251,7 +251,6 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
-                device="meta",
             ),
             requires_grad=False,
         )
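The dropped device="meta" relates to the same path: meta tensors carry only shape and dtype, no storage, so machinery that must read or copy weight data between devices cannot handle them. A small illustration (not from this diff):

# Illustration (not from this diff): meta tensors have no backing storage,
# so code that must copy their data between devices fails.
import torch

qzeros = torch.empty(4, 2, dtype=torch.int32, device="meta")
print(qzeros.is_meta)  # True: shape and dtype only, no data
try:
    qzeros.cpu()       # copying out of a meta tensor is impossible
except (RuntimeError, NotImplementedError) as err:
    print("cannot materialize:", err)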