diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 7b94e1b5..b4055edd 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -332,8 +332,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
                 # Send token-by-token response for each request.n
                 delta_text = output.text[len(previous_texts[i]):]
                 previous_texts[i] = output.text
-                completion_tokens = len(output.token_ids)
-                previous_num_tokens[i] = completion_tokens
+                previous_num_tokens[i] = len(output.token_ids)
                 choice_data = ChatCompletionResponseStreamChoice(
                     index=i,
                     delta=DeltaMessage(content=delta_text),
@@ -351,8 +350,8 @@ async def create_chat_completion(request: ChatCompletionRequest,
                     prompt_tokens = len(res.prompt_token_ids)
                     final_usage = UsageInfo(
                         prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=prompt_tokens + completion_tokens,
+                        completion_tokens=previous_num_tokens[i],
+                        total_tokens=prompt_tokens + previous_num_tokens[i],
                     )
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=i, delta=[], finish_reason=output.finish_reason)
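
Below is a minimal sketch (not vLLM code) of the token-accounting pattern this patch settles on: the final usage reported for choice `i` is read from that choice's own entry in `previous_num_tokens`, rather than from a shared local `completion_tokens` variable. The `FakeOutput`, `FakeRequestOutput`, and `stream_usage` names are hypothetical stand-ins for vLLM's `CompletionOutput`, `RequestOutput`, and the streaming generator inside `create_chat_completion`.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeOutput:
    """Stand-in for vLLM's CompletionOutput: one choice's text so far."""
    text: str
    token_ids: List[int]
    finish_reason: Optional[str] = None


@dataclass
class FakeRequestOutput:
    """Stand-in for vLLM's RequestOutput: one streaming step for a request."""
    prompt_token_ids: List[int]
    outputs: List[FakeOutput]


def stream_usage(steps: List[FakeRequestOutput], n: int):
    """Yield (choice_index, usage_dict) whenever a choice finishes."""
    previous_texts = [""] * n
    previous_num_tokens = [0] * n
    for res in steps:
        for i, output in enumerate(res.outputs):
            # Mirror of the patched loop body: record this choice's own count.
            previous_texts[i] = output.text
            previous_num_tokens[i] = len(output.token_ids)
            if output.finish_reason is not None:
                prompt_tokens = len(res.prompt_token_ids)
                yield i, {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": previous_num_tokens[i],
                    "total_tokens": prompt_tokens + previous_num_tokens[i],
                }


# Two choices of different lengths finishing in the same step: each choice
# reports its own completion_tokens instead of whatever was computed last.
step = FakeRequestOutput(
    prompt_token_ids=[1, 2, 3],
    outputs=[
        FakeOutput("Hello world", [10, 11], "stop"),
        FakeOutput("Hi", [20], "stop"),
    ],
)
for i, usage in stream_usage([step], n=2):
    print(i, usage)  # choice 0 -> completion_tokens=2, choice 1 -> completion_tokens=1
```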