From 40468b13faa1ebde366e7002c5752b59e1368d10 Mon Sep 17 00:00:00 2001
From: "Allen.Dou"
Date: Wed, 24 Jul 2024 23:58:42 +0800
Subject: [PATCH] [Bugfix] Miscalculated latency leads to inaccurate
 time_to_first_token_seconds. (#6686)

---
 vllm/engine/llm_engine.py              | 3 ++-
 vllm/spec_decode/spec_decode_worker.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eabe3b23..48d53058 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -949,8 +949,9 @@ class LLMEngine:
             model_output: Optional[List[SamplerOutput]] = None) -> None:
         """Forced log when no requests active."""
         if self.log_stats:
+            stats = self._get_stats(scheduler_outputs, model_output)
             for logger in self.stat_loggers.values():
-                logger.log(self._get_stats(scheduler_outputs, model_output))
+                logger.log(stats)
 
     def _get_stats(
             self,
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 8cf0aa5b..98960b88 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -484,7 +484,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
         for both speculation cases (num_lookahead_slots>0) and non-speculation
         cases (e.g. prefill).
 
-        Returns True iff there are remaining sequences to process.
+        Returns True if there are remaining sequences to process.
         """
         assert self.rank != self._driver_rank
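
The sketch below (plain Python, outside the patch) illustrates why the stats snapshot is hoisted out of the logger loop. It is a minimal illustration, not vLLM's actual classes: Stats, DemoEngine, and PrintLogger are stand-ins, and the only assumption carried over from vLLM is that the stats snapshot is stamped with the wall-clock time at which it is built. Under that assumption, calling _get_stats once per registered stat logger hands each logger a slightly later `now`, which skews time-derived metrics such as time_to_first_token_seconds; building the snapshot once and sharing it keeps every logger consistent and avoids recomputing the stats.

import time
from dataclasses import dataclass
from typing import List


@dataclass
class Stats:
    """Illustrative stand-in for a stats snapshot taken at wall-clock `now`."""
    now: float
    time_to_first_tokens: List[float]


class PrintLogger:
    """Toy stat logger that just prints the snapshot it receives."""

    def log(self, stats: Stats) -> None:
        print(f"now={stats.now:.6f} ttft={stats.time_to_first_tokens}")


class DemoEngine:
    """Toy engine contrasting the pre-fix and post-fix logging patterns."""

    def __init__(self, stat_loggers: List[PrintLogger]) -> None:
        self.stat_loggers = stat_loggers
        self._request_arrival = time.time()

    def _get_stats(self) -> Stats:
        # The snapshot is timestamped when it is built, so building it once
        # per logger gives every logger a different (later) `now`.
        now = time.time()
        return Stats(now=now,
                     time_to_first_tokens=[now - self._request_arrival])

    def do_log_stats_buggy(self) -> None:
        # Before the fix: a fresh, progressively later snapshot per logger.
        for logger in self.stat_loggers:
            logger.log(self._get_stats())

    def do_log_stats_fixed(self) -> None:
        # After the fix: one snapshot, shared by every registered logger.
        stats = self._get_stats()
        for logger in self.stat_loggers:
            logger.log(stats)


if __name__ == "__main__":
    engine = DemoEngine([PrintLogger(), PrintLogger()])
    engine.do_log_stats_buggy()  # the two loggers report different values
    engine.do_log_stats_fixed()  # both loggers report the same snapshot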