From 48abee9e5492924a69551d859d66d98874d72d60 Mon Sep 17 00:00:00 2001
From: Cherilyn Buren <88433283+NiuBlibing@users.noreply.github.com>
Date: Thu, 8 Aug 2024 14:17:29 +0800
Subject: [PATCH] [Frontend] remove max_num_batched_tokens limit for lora (#7288)

---
 vllm/config.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index ec6d587e..2ac31657 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1377,11 +1377,6 @@ class LoRAConfig:
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
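
Note: with the 65528-token cap removed, enabling LoRA together with a larger max_num_batched_tokens should no longer be rejected at startup (chunked prefill still has to stay disabled, per the check that remains in verify_with_scheduler_config). Below is a minimal sketch of such a configuration through vLLM's offline LLM entrypoint; the model name, adapter path, and token budget are illustrative placeholders, not values taken from this patch.

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholder base model and LoRA adapter; any compatible pair would do.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    # Previously rejected above 65528 whenever LoRA was enabled; after this
    # patch it is bounded only by memory and the usual scheduler checks.
    max_num_batched_tokens=81920,
    max_num_seqs=256,
)

outputs = llm.generate(
    ["Explain LoRA in one sentence."],
    SamplingParams(max_tokens=64),
    # LoRARequest(name, int id, local adapter path).
    lora_request=LoRARequest("my-adapter", 1, "/path/to/lora_adapter"),
)
print(outputs[0].outputs[0].text)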