[Bugfix] API stream returning two stops (#3450)

Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
Authored by Dylan Hawk on 2024-03-25 10:14:34 -07:00, committed by GitHub
parent c13ad1b7bd
commit 0b4997e05c
2 changed files with 25 additions and 27 deletions
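
Context for the fix: the completions streaming path used to emit the chunk carrying finish_reason and then a second, empty chunk that repeated finish_reason alongside the final usage stats, so every streamed choice appeared to stop twice. The diff below folds the usage info into the single final chunk. A minimal reproduction sketch against an OpenAI-compatible vLLM endpoint (base URL, API key, and model name below are placeholders):

import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")  # placeholder endpoint
    stream = await client.completions.create(model="example-model",
                                             prompt="Hello, my name is",
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    finish_reason_count = 0
    async for chunk in stream:
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # Prints 2 on servers with the bug, 1 once this fix is applied.
    print(finish_reason_count)


asyncio.run(main())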

tests/entrypoints/test_openai_server.py

@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                              temperature=0.0,
                                              stream=True)
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
     assert chunk.usage == single_usage
     assert "".join(chunks) == single_output
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
         stream=True,
     )
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta
         if delta.role:
             assert delta.role == "assistant"
         if delta.content:
             chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
     assert "".join(chunks) == output

vllm/entrypoints/openai/serving_completion.py

@@ -266,6 +266,16 @@ class OpenAIServingCompletion(OpenAIServing):
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    if output.finish_reason is not None:  # return final usage
+                        prompt_tokens = len(res.prompt_token_ids)
+                        completion_tokens = len(output.token_ids)
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        )
+                    else:
+                        final_usage = None
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -277,34 +287,10 @@ class OpenAIServingCompletion(OpenAIServing):
                                 logprobs=logprobs,
                                 finish_reason=finish_reason,
                             )
-                        ]).model_dump_json()
+                        ],
+                        usage=final_usage,
+                    ).model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
-
-                    if output.finish_reason is not None:  # return final usage
-                        logprobs = LogProbs(
-                        ) if request.logprobs is not None else None
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                        response_json = CompletionStreamResponse(
-                            id=request_id,
-                            created=created_time,
-                            model=model_name,
-                            choices=[
-                                CompletionResponseStreamChoice(
-                                    index=i,
-                                    text="",
-                                    logprobs=logprobs,
-                                    finish_reason=output.finish_reason,
-                                )
-                            ],
-                            usage=final_usage,
-                        ).model_dump_json()
-                        yield f"data: {response_json}\n\n"
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))
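
A side note on the serializer change above: model_dump_json(exclude_unset=True) drops only fields that were never passed to the model's constructor, so usage=final_usage survives serialization even when final_usage is None. A self-contained pydantic v2 sketch of that behavior (the model here is a stand-in, not the vLLM schema):

from typing import Optional

from pydantic import BaseModel


class ExampleChunk(BaseModel):  # stand-in model, not vLLM's
    text: str
    usage: Optional[dict] = None


# Field never set -> dropped by exclude_unset.
print(ExampleChunk(text="hi").model_dump_json(exclude_unset=True))
# {"text":"hi"}

# Field explicitly set to None -> kept and serialized as null.
print(ExampleChunk(text="hi", usage=None).model_dump_json(exclude_unset=True))
# {"text":"hi","usage":null}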