From 6f41f0e377708f223871c888ce84bb575bae732f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 17 Dec 2023 10:24:25 -0800
Subject: [PATCH] Disable CUDA graph for SqueezeLLM (#2161)

---
 vllm/config.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index a2b20502..353189f6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -185,10 +185,11 @@ class ModelConfig:
             self.max_context_len_to_capture = self.max_model_len
         self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                               self.max_model_len)
-        if self.quantization == "gptq" and not self.enforce_eager:
+        if (self.quantization in ["gptq", "squeezellm"]
+                and not self.enforce_eager):
             # Related issue: https://github.com/vllm-project/vllm/issues/2147
-            logger.warning("GPTQ does not support CUDA graph yet. Disabling "
-                           "CUDA graph.")
+            logger.warning(f"{self.quantization} does not support CUDA graph "
+                           "yet. Disabling CUDA graph.")
             self.enforce_eager = True

     def verify_with_parallel_config(
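
The sketch below illustrates, in isolation, the fallback behavior this patch extends: when the configured quantization method is one that cannot yet be captured in a CUDA graph, the config forces eager execution instead. The function name `maybe_force_eager` and the module-level constant are illustrative only; in vLLM the check runs inside `ModelConfig` in `vllm/config.py` and mutates `self.enforce_eager` directly, as shown in the diff above.

```python
# Minimal sketch of the CUDA-graph fallback, assuming the same logic as the
# patched ModelConfig check. Names here are illustrative, not vLLM API.
import logging
from typing import Optional

logger = logging.getLogger(__name__)

# Quantization methods that, as of this patch, cannot be used with CUDA graph
# capture (see https://github.com/vllm-project/vllm/issues/2147).
_CUDA_GRAPH_UNSUPPORTED_QUANT = ["gptq", "squeezellm"]


def maybe_force_eager(quantization: Optional[str], enforce_eager: bool) -> bool:
    """Return the effective enforce_eager flag for a quantization method."""
    if quantization in _CUDA_GRAPH_UNSUPPORTED_QUANT and not enforce_eager:
        logger.warning(f"{quantization} does not support CUDA graph "
                       "yet. Disabling CUDA graph.")
        return True
    return enforce_eager


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    # SqueezeLLM now also falls back to eager execution, like GPTQ.
    print(maybe_force_eager("squeezellm", enforce_eager=False))  # True
    print(maybe_force_eager("awq", enforce_eager=False))         # False
```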