[Bugfix] Miscalculated latency led to inaccurate time_to_first_token_seconds. (#6686)
This commit is contained in:
parent 2cf0df3381, commit 40468b13fa
@@ -949,8 +949,9 @@ class LLMEngine:
             model_output: Optional[List[SamplerOutput]] = None) -> None:
         """Forced log when no requests active."""
         if self.log_stats:
+            stats = self._get_stats(scheduler_outputs, model_output)
             for logger in self.stat_loggers.values():
-                logger.log(self._get_stats(scheduler_outputs, model_output))
+                logger.log(stats)

     def _get_stats(
             self,
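The change above hoists the stats snapshot out of the logger loop: `_get_stats` is now called once per logging step and the same Stats object is handed to every registered logger, instead of being recomputed for each one. Recomputing it per logger takes a fresh timestamp on every call and can consume per-request timing state, so loggers after the first see skewed or missing time-to-first-token samples. The sketch below is a simplified, self-contained illustration of that pattern and of the fix; ToyEngine, Request, and the in-memory logger sinks are hypothetical stand-ins, not the real vLLM classes.

import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional

# Hypothetical, simplified stand-ins for the real engine/metrics classes.

@dataclass
class Request:
    arrival_time: float
    first_token_time: Optional[float] = None

@dataclass
class Stats:
    time_to_first_token: List[float] = field(default_factory=list)

class ToyEngine:
    def __init__(self) -> None:
        self.requests: List[Request] = []
        # logger name -> list of Stats snapshots it received
        self.stat_loggers: Dict[str, List[Stats]] = {"prometheus": [], "stdout": []}

    def _get_stats(self) -> Stats:
        """Build a metrics snapshot. Marks first_token_time as a side effect,
        so a second call within the same step no longer sees the sample."""
        now = time.monotonic()
        stats = Stats()
        for req in self.requests:
            if req.first_token_time is None:
                req.first_token_time = now
                stats.time_to_first_token.append(now - req.arrival_time)
        return stats

    def log_stats_buggy(self) -> None:
        # Pre-fix pattern: a fresh snapshot per logger, so only the first
        # logger ever receives the TTFT sample.
        for sink in self.stat_loggers.values():
            sink.append(self._get_stats())

    def log_stats_fixed(self) -> None:
        # Post-fix pattern: compute the snapshot once, then hand the same
        # object to every logger.
        stats = self._get_stats()
        for sink in self.stat_loggers.values():
            sink.append(stats)

def demo() -> None:
    engine = ToyEngine()
    engine.requests.append(Request(arrival_time=time.monotonic() - 0.25))
    engine.log_stats_buggy()
    print({name: s[-1].time_to_first_token for name, s in engine.stat_loggers.items()})
    # e.g. {'prometheus': [0.25...], 'stdout': []} -> inconsistent across loggers

    engine.requests.append(Request(arrival_time=time.monotonic() - 0.25))
    engine.log_stats_fixed()
    print({name: s[-1].time_to_first_token for name, s in engine.stat_loggers.items()})
    # both loggers now see the same single TTFT sample

if __name__ == "__main__":
    demo()

Running demo() shows the buggy path delivering the TTFT sample only to the first logger, while the fixed path gives every logger an identical snapshot, which mirrors the intent of the diff above.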
@@ -484,7 +484,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         for both speculation cases (num_lookahead_slots>0) and non-speculation
         cases (e.g. prefill).

-        Returns True iff there are remaining sequences to process.
+        Returns True if there are remaining sequences to process.
         """
         assert self.rank != self._driver_rank