Fix hanging when prompt exceeds limit (#1029)
parent ff36139ffc
commit e21d7687a9
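In brief, as the hunks below show: when a prompt exceeded the model's length limit, the scheduler moved its sequence group to ignored_seq_groups, but if nothing else was scheduled, the `if scheduled:` guard meant no SchedulerOutputs carrying the ignored groups was ever built, so no RequestOutput was emitted for the request and callers waiting on it hung. The fix builds SchedulerOutputs whenever anything was scheduled or ignored, and makes the engine unconditionally propagate RequestOutputs for ignored groups, including on iterations that also run the model.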
@@ -175,7 +175,7 @@ class Scheduler:
                 num_curr_seqs += num_new_seqs
                 scheduled.append(seq_group)

-            if scheduled:
+            if scheduled or ignored_seq_groups:
                 scheduler_outputs = SchedulerOutputs(
                     scheduled_seq_groups=scheduled,
                     prompt_run=True,
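A minimal self-contained sketch of the control flow this guard change affects; SeqGroup, Outputs, schedule_prompts, prompt_len, and max_model_len are hypothetical stand-ins for the real Scheduler internals, while scheduled and ignored_seq_groups mirror the names in the hunk above:

    from dataclasses import dataclass
    from typing import List, Optional

    # Hypothetical stand-ins for the real vLLM types; illustration only.
    @dataclass
    class SeqGroup:
        request_id: str
        prompt_len: int

    @dataclass
    class Outputs:
        scheduled: List[SeqGroup]
        ignored: List[SeqGroup]

    def schedule_prompts(waiting: List[SeqGroup],
                         max_model_len: int) -> Optional[Outputs]:
        scheduled: List[SeqGroup] = []
        ignored_seq_groups: List[SeqGroup] = []
        for seq_group in waiting:
            if seq_group.prompt_len > max_model_len:
                # Over-long prompt: set aside so the caller can report it.
                ignored_seq_groups.append(seq_group)
                continue
            scheduled.append(seq_group)
        # The fixed guard: also emit outputs when only ignored groups
        # exist. With the old `if scheduled:`, a batch where every prompt
        # was too long produced nothing, and those requests were never
        # answered.
        if scheduled or ignored_seq_groups:
            return Outputs(scheduled, ignored_seq_groups)
        return None

    # Every prompt exceeds the 8-token cap: the old guard yields None
    # here; the fixed guard surfaces the ignored group.
    print(schedule_prompts([SeqGroup("r1", prompt_len=100)], max_model_len=8))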
@@ -294,14 +294,12 @@ class LLMEngine:
     def _schedule(
         self
     ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs,
-               Optional[List[RequestOutput]]]:
+               List[RequestOutput]]:
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
-        if scheduler_outputs.is_empty():
-            return seq_group_metadata_list, scheduler_outputs, [
-                RequestOutput.from_seq_group(seq_group)
-                for seq_group in scheduler_outputs.ignored_seq_groups
-            ]
-        return seq_group_metadata_list, scheduler_outputs, None
+        return seq_group_metadata_list, scheduler_outputs, [
+            RequestOutput.from_seq_group(seq_group)
+            for seq_group in scheduler_outputs.ignored_seq_groups
+        ]

     def _check_beam_search_early_stopping(
         self,
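Dropping the Optional makes _schedule total: it always hands back a (possibly empty) list of RequestOutputs for ignored groups rather than a None sentinel, which is what lets step() below both return it directly on idle iterations and concatenate it after a model run.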
@@ -545,10 +543,9 @@ class LLMEngine:
         and updates the scheduler with the model outputs. Finally, it decodes
         the sequences and returns the newly generated results.
         """
-        (seq_group_metadata_list, scheduler_outputs,
-         early_return) = self._schedule()
-        if early_return is not None:
-            return early_return
+        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
+        if scheduler_outputs.is_empty():
+            return ignored

         # Execute the model.
         output = self._run_workers(
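A hedged before/after sketch of the calling pattern this hunk simplifies; plain strings stand in for RequestOutput objects, and both helper names are invented for illustration:

    from typing import List, Optional

    def ignored_outputs_old(ignored: List[str],
                            idle: bool) -> Optional[List[str]]:
        # Old shape: None meant "proceed with the model step", so the
        # caller had to branch on the sentinel
        # (`if early_return is not None:`).
        return ignored if idle else None

    def ignored_outputs_new(ignored: List[str]) -> List[str]:
        # New shape: always a list; the caller returns it directly when
        # the scheduler is idle, or appends it to the generated outputs
        # otherwise.
        return ignored

    generated = ["gen-1"]
    print(generated + ignored_outputs_new([]))             # ['gen-1']
    print(generated + ignored_outputs_new(["ignored-1"]))  # ['gen-1', 'ignored-1']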
@@ -559,7 +556,7 @@ class LLMEngine:
             blocks_to_copy=scheduler_outputs.blocks_to_copy,
         )

-        return self._process_model_outputs(output, scheduler_outputs)
+        return self._process_model_outputs(output, scheduler_outputs) + ignored

     def _log_system_stats(
         self,
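The trailing + ignored covers the mixed case: an iteration that runs the model for in-limit requests while also ignoring an over-long one now reports outputs for both; previously the ignored request's output was dropped and its caller waited indefinitely. On a fully idle iteration, the is_empty() branch above returns the ignored outputs (or an empty list) instead.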