[Bugfix] API stream returning two stops (#3450)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
commit 0b4997e05c (parent c13ad1b7bd)
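Before this fix, a streamed completion could report finish_reason twice: once on the last content chunk and again on an extra usage-only chunk appended afterwards. The test changes below count how many chunks carry a finish_reason and require exactly one. As a rough stand-alone check of the same behaviour against a running vLLM OpenAI-compatible server (the base URL, API key, and model name are placeholders, not values from this commit):

import asyncio

import openai

# Placeholders: point these at your own vLLM OpenAI-compatible server.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")


async def check_completion_single_stop(model: str = "example-model") -> None:
    stream = await client.completions.create(model=model,
                                             prompt="Hello, my name is",
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # With the fix, exactly one chunk carries a finish_reason.
    assert finish_reason_count == 1
    print("".join(chunks))


asyncio.run(check_completion_single_stop())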
@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                              temperature=0.0,
                                              stream=True)
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
     assert chunk.usage == single_usage
     assert "".join(chunks) == single_output
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
         stream=True,
     )
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta
         if delta.role:
             assert delta.role == "assistant"
         if delta.content:
             chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
     assert "".join(chunks) == output
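The second hunk applies the same single-stop guard to the chat endpoint. A corresponding stand-alone client check, again with placeholder server and model names, might look like:

import asyncio

import openai

# Placeholders: point these at your own vLLM OpenAI-compatible server.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")


async def check_chat_single_stop(model: str = "example-model") -> None:
    stream = await client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish_reason should appear on exactly one (the last) chunk.
    assert finish_reason_count == 1
    print("".join(chunks))


asyncio.run(check_chat_single_stop())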
@@ -266,6 +266,16 @@ class OpenAIServingCompletion(OpenAIServing):
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    if output.finish_reason is not None:  # return final usage
+                        prompt_tokens = len(res.prompt_token_ids)
+                        completion_tokens = len(output.token_ids)
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        )
+                    else:
+                        final_usage = None
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -277,34 +287,10 @@ class OpenAIServingCompletion(OpenAIServing):
                                 logprobs=logprobs,
                                 finish_reason=finish_reason,
                             )
-                        ]).model_dump_json()
+                        ],
+                        usage=final_usage,
+                    ).model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
 
-                    if output.finish_reason is not None:  # return final usage
-                        logprobs = LogProbs(
-                        ) if request.logprobs is not None else None
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                        response_json = CompletionStreamResponse(
-                            id=request_id,
-                            created=created_time,
-                            model=model_name,
-                            choices=[
-                                CompletionResponseStreamChoice(
-                                    index=i,
-                                    text="",
-                                    logprobs=logprobs,
-                                    finish_reason=output.finish_reason,
-                                )
-                            ],
-                            usage=final_usage,
-                        ).model_dump_json()
-                        yield f"data: {response_json}\n\n"
-
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))
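Together, the two hunks above compute the usage block up front and attach it to the same streamed chunk that carries finish_reason, instead of emitting a second, usage-only chunk that repeated the stop. A wire-level way to confirm the fixed behaviour is to read the raw server-sent events directly; the sketch below does that with httpx (an assumed extra dependency) against placeholder server and model names.

import json

import httpx

# Placeholders: point these at your own vLLM OpenAI-compatible server and model.
BASE_URL = "http://localhost:8000/v1"
payload = {
    "model": "example-model",
    "prompt": "Hello, my name is",
    "max_tokens": 5,
    "temperature": 0.0,
    "stream": True,
}

finish_reasons = []
with httpx.stream("POST", f"{BASE_URL}/completions", json=payload,
                  timeout=None) as response:
    for line in response.iter_lines():
        # Skip SSE keep-alive blanks, non-data lines, and the [DONE] sentinel.
        if not line.startswith("data: ") or line.endswith("[DONE]"):
            continue
        chunk = json.loads(line[len("data: "):])
        reason = chunk["choices"][0].get("finish_reason")
        if reason is not None:
            finish_reasons.append(reason)
            # The chunk that ends the stream should also carry the usage block.
            assert chunk.get("usage") is not None

# Exactly one streamed chunk should report a finish_reason.
assert len(finish_reasons) == 1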