[Frontend] remove max_num_batched_tokens limit for lora (#7288)
commit 48abee9e54
parent 746709642c
@@ -1377,11 +1377,6 @@ class LoRAConfig:
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
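For reference, a minimal usage sketch of what the removed check used to forbid. It assumes vLLM's offline LLM entry point, where enable_lora and max_num_batched_tokens are forwarded to the engine arguments; the model name and the token budget are placeholders, not part of this commit.

# Minimal sketch, assuming vLLM's offline LLM API; the model name and the
# 131072 budget are illustrative. Before this commit, combining
# enable_lora=True with max_num_batched_tokens > 65528 made
# LoRAConfig.verify_with_scheduler_config raise a ValueError; after it,
# only the chunked-prefill restriction remains.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder model
    enable_lora=True,
    max_num_batched_tokens=131072,     # previously rejected when > 65528 with LoRA
)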