[Frontend] remove max_num_batched_tokens limit for lora (#7288)

Cherilyn Buren 2024-08-08 14:17:29 +08:00 committed by GitHub
parent 746709642c
commit 48abee9e54


@@ -1377,11 +1377,6 @@ class LoRAConfig:
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")