[Frontend] remove max_num_batched_tokens limit for lora (#7288)
commit 48abee9e54
parent 746709642c
@@ -1377,11 +1377,6 @@ class LoRAConfig:
                            model_config.quantization)

     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
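Behaviorally, the diff leaves verify_with_scheduler_config with only the chunked-prefill check. Below is a minimal sketch of that resulting behavior, using simplified stand-in config classes rather than the real vLLM LoRAConfig and SchedulerConfig, which carry many more fields.

# Minimal sketch, assuming simplified stand-in configs; not the actual
# vLLM classes from vllm/config.py.
from dataclasses import dataclass


@dataclass
class SchedulerConfig:
    max_num_batched_tokens: int = 2048
    chunked_prefill_enabled: bool = False


@dataclass
class LoRAConfig:
    def verify_with_scheduler_config(
            self, scheduler_config: SchedulerConfig) -> None:
        # The 65528-token cap tied to the old custom LoRA CUDA kernel is
        # removed by this commit; only the chunked-prefill check remains.
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError(
                "LoRA is not supported with chunked prefill yet.")


# Example: a batch size above the old 65528 limit no longer raises.
LoRAConfig().verify_with_scheduler_config(
    SchedulerConfig(max_num_batched_tokens=131072))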