Fix the openai benchmarking requests to work with latest OpenAI apis (#2992)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
parent ff578cae54
commit 9a4548bae7
@@ -275,10 +275,80 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat API URL must end with 'v1/chat/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0
+        st = time.perf_counter()
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk in response.content:
+                        if ttft == 0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        chunk = chunk.strip()
+                        if not chunk:
+                            continue
+
+                        chunk = chunk.decode("utf-8").lstrip("data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            body = json.loads(chunk)
+                            if "content" in body["choices"][0]["delta"]:
+                                generated_text += body["choices"][0]["delta"][
+                                    "content"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.success = False
+        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+            output.success = False
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "vllm": async_request_vllm,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
 }
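
For context on the streaming loop added above: with "stream": True, the Chat Completions API returns server-sent events, one "data: ..." line per delta, terminated by "data: [DONE]". The first delta usually carries only the role, which is why the code guards for a "content" key before accumulating text. A minimal, self-contained sketch of that same parsing logic; the chunk payloads below are illustrative, not captured from a real response:

import json

# Simulated SSE stream from a Chat Completions response with "stream": True.
# The first delta typically carries only the role; later deltas carry content.
chunks = [
    b'data: {"choices": [{"delta": {"role": "assistant"}}]}',
    b'data: {"choices": [{"delta": {"content": "Hello"}}]}',
    b'data: {"choices": [{"delta": {"content": " world"}}]}',
    b'data: [DONE]',
]

generated_text = ""
for chunk in chunks:
    chunk = chunk.strip()
    if not chunk:
        continue
    # Same prefix handling as the diff; note that lstrip("data: ") strips a
    # character set rather than a literal prefix, which works here because
    # the JSON body starts with "{" and the sentinel starts with "[".
    chunk = chunk.decode("utf-8").lstrip("data: ")
    if chunk == "[DONE]":
        break
    body = json.loads(chunk)
    # Skip deltas without "content" (e.g. the initial role-only delta).
    if "content" in body["choices"][0]["delta"]:
        generated_text += body["choices"][0]["delta"]["content"]

print(generated_text)  # -> "Hello world"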
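
And a hedged sketch of driving the new function directly, outside the benchmark harness. The RequestFuncInput fields here are inferred from the attribute accesses visible in the diff (prompt, api_url, prompt_len, output_len, model, use_beam_search); the actual dataclass may take additional fields:

import asyncio

# Hypothetical direct invocation; module path and constructor fields are
# inferred from the diff above, not from the real dataclass definition.
from backend_request_func import (RequestFuncInput,
                                  async_request_openai_chat_completions)


async def main() -> None:
    request_func_input = RequestFuncInput(
        prompt="Explain TTFT in one sentence.",
        api_url="https://api.openai.com/v1/chat/completions",
        prompt_len=8,           # prompt token count (assumed value)
        output_len=64,          # forwarded as "max_tokens" in the payload
        model="gpt-3.5-turbo",
        use_beam_search=False,  # the function asserts this is False
    )
    # Requires OPENAI_API_KEY in the environment for the Bearer header.
    output = await async_request_openai_chat_completions(request_func_input)
    print(output.success, output.ttft, output.latency)
    print(output.generated_text)


if __name__ == "__main__":
    asyncio.run(main())

Within the benchmark harness itself, the new entry point is selected through the "openai-chat" key registered in ASYNC_REQUEST_FUNCS above.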