From f952bbc8ffccdd109b5bd8936655ce3fe3220807 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Sat, 20 Jul 2024 19:11:13 -0400
Subject: [PATCH] [Misc] Fix input_scale typing in w8a8_utils.py (#6579)

---
 vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 4fbf75b2..20100c76 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -104,7 +104,7 @@ def apply_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
     input_scale_ub: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     cutlass_fp8_supported: bool = True,
@@ -192,7 +192,7 @@ def apply_int8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
 ):
     # ops.scaled_int8_quant supports both dynamic and static quant.
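
Editor's note: the patch makes input_scale Optional because, per the context
line above, the quant path supports both static quantization (a scale
calibrated offline and passed in) and dynamic quantization (no scale passed;
it is derived from the activation at runtime). Below is a minimal sketch of
that pattern in plain PyTorch, not vLLM's actual kernels; the helper name
quantize_int8 is hypothetical and stands in for ops.scaled_int8_quant.

from typing import Optional, Tuple

import torch


def quantize_int8(
    x: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Per-tensor int8 quantization; hypothetical stand-in for vLLM's kernel."""
    if scale is None:
        # Dynamic quant: derive a per-tensor scale from the runtime input.
        scale = x.abs().max().clamp(min=1e-8) / 127.0
    # Static or dynamic: quantize with whichever scale we ended up with.
    x_q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return x_q, scale


x = torch.randn(4, 8)

# Dynamic quant: input_scale omitted (None), scale computed from x.
x_q_dyn, s_dyn = quantize_int8(x)

# Static quant: a precomputed scale is passed in explicitly.
s_static = torch.tensor(0.05)
x_q_static, _ = quantize_int8(x, s_static)

With input_scale: torch.Tensor (no default), every caller on the dynamic
path had to pass None explicitly despite the annotation forbidding it;
Optional[torch.Tensor] = None makes the signature match how the function is
actually used.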