[Bugfix] API stream returning two stops (#3450)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
commit 0b4997e05c (parent c13ad1b7bd)
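Before this fix, a streamed completion could report finish_reason twice: once on the last content chunk and again on an extra usage-only chunk appended afterwards. The test changes below count how many chunks carry a finish_reason and require exactly one. As a rough stand-alone check of the same behaviour against a running vLLM OpenAI-compatible server (the base URL, API key, and model name are placeholders, not values from this commit):

import asyncio

import openai

# Placeholders: point these at your own vLLM OpenAI-compatible server.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")


async def check_completion_single_stop(model: str = "example-model") -> None:
    stream = await client.completions.create(model=model,
                                             prompt="Hello, my name is",
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # With the fix, exactly one chunk carries a finish_reason.
    assert finish_reason_count == 1
    print("".join(chunks))


asyncio.run(check_completion_single_stop())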
@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                              temperature=0.0,
                                              stream=True)
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         chunks.append(chunk.choices[0].text)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == "length"
+    assert chunk.choices[0].text
     assert chunk.usage == single_usage
     assert "".join(chunks) == single_output
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
         stream=True,
     )
     chunks = []
+    finish_reason_count = 0
     async for chunk in stream:
         delta = chunk.choices[0].delta
         if delta.role:
             assert delta.role == "assistant"
         if delta.content:
             chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
     assert "".join(chunks) == output
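The second hunk applies the same single-stop guard to the chat endpoint. A corresponding stand-alone client check, again with placeholder server and model names, might look like:

import asyncio

import openai

# Placeholders: point these at your own vLLM OpenAI-compatible server.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")


async def check_chat_single_stop(model: str = "example-model") -> None:
    stream = await client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Say hello"}],
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish_reason should appear on exactly one (the last) chunk.
    assert finish_reason_count == 1
    print("".join(chunks))


asyncio.run(check_chat_single_stop())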
@@ -266,6 +266,16 @@ class OpenAIServingCompletion(OpenAIServing):
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    if output.finish_reason is not None:  # return final usage
+                        prompt_tokens = len(res.prompt_token_ids)
+                        completion_tokens = len(output.token_ids)
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        )
+                    else:
+                        final_usage = None
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -277,34 +287,10 @@ class OpenAIServingCompletion(OpenAIServing):
                                 logprobs=logprobs,
                                 finish_reason=finish_reason,
                             )
-                        ]).model_dump_json()
+                        ],
+                        usage=final_usage,
+                    ).model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
 
-                    if output.finish_reason is not None:  # return final usage
-                        logprobs = LogProbs(
-                        ) if request.logprobs is not None else None
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                        response_json = CompletionStreamResponse(
-                            id=request_id,
-                            created=created_time,
-                            model=model_name,
-                            choices=[
-                                CompletionResponseStreamChoice(
-                                    index=i,
-                                    text="",
-                                    logprobs=logprobs,
-                                    finish_reason=output.finish_reason,
-                                )
-                            ],
-                            usage=final_usage,
-                        ).model_dump_json()
-                        yield f"data: {response_json}\n\n"
-
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))
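Together, the two hunks above compute the usage block up front and attach it to the same streamed chunk that carries finish_reason, instead of emitting a second, usage-only chunk that repeated the stop. A wire-level way to confirm the fixed behaviour is to read the raw server-sent events directly; the sketch below does that with httpx (an assumed extra dependency) against placeholder server and model names.

import json

import httpx

# Placeholders: point these at your own vLLM OpenAI-compatible server and model.
BASE_URL = "http://localhost:8000/v1"
payload = {
    "model": "example-model",
    "prompt": "Hello, my name is",
    "max_tokens": 5,
    "temperature": 0.0,
    "stream": True,
}

finish_reasons = []
with httpx.stream("POST", f"{BASE_URL}/completions", json=payload,
                  timeout=None) as response:
    for line in response.iter_lines():
        # Skip SSE keep-alive blanks, non-data lines, and the [DONE] sentinel.
        if not line.startswith("data: ") or line.endswith("[DONE]"):
            continue
        chunk = json.loads(line[len("data: "):])
        reason = chunk["choices"][0].get("finish_reason")
        if reason is not None:
            finish_reasons.append(reason)
            # The chunk that ends the stream should also carry the usage block.
            assert chunk.get("usage") is not None

# Exactly one streamed chunk should report a finish_reason.
assert len(finish_reasons) == 1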