[Frontend] remove max_num_batched_tokens limit for lora (#7288)
commit 48abee9e54
parent 746709642c
@@ -1377,11 +1377,6 @@ class LoRAConfig:
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
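For reference, a minimal usage sketch of what the removed check used to forbid. It assumes vLLM's offline LLM entry point, where enable_lora and max_num_batched_tokens are forwarded to the engine arguments; the model name and the token budget are placeholders, not part of this commit.

# Minimal sketch, assuming vLLM's offline LLM API; the model name and the
# 131072 budget are illustrative. Before this commit, combining
# enable_lora=True with max_num_batched_tokens > 65528 made
# LoRAConfig.verify_with_scheduler_config raise a ValueError; after it,
# only the chunked-prefill restriction remains.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder model
    enable_lora=True,
    max_num_batched_tokens=131072,     # previously rejected when > 65528 with LoRA
)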