[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
Mahmoud Ashraf 2024-05-09 08:02:31 +03:00 committed by GitHub
parent e288df0632
commit 16bc0a098f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -238,17 +238,25 @@ class LLM:
# Initialize tqdm. # Initialize tqdm.
if use_tqdm: if use_tqdm:
num_requests = self.llm_engine.get_num_unfinished_requests() num_requests = self.llm_engine.get_num_unfinished_requests()
pbar = tqdm(total=num_requests, pbar = tqdm(
desc="Processed prompts", total=num_requests,
dynamic_ncols=True) desc="Processed prompts",
dynamic_ncols=True,
postfix=f"Generation Speed: {0:.2f} toks/s",
)
# Run the engine. # Run the engine.
outputs: List[RequestOutput] = [] outputs: List[RequestOutput] = []
total_toks = 0
while self.llm_engine.has_unfinished_requests(): while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step() step_outputs = self.llm_engine.step()
for output in step_outputs: for output in step_outputs:
if output.finished: if output.finished:
outputs.append(output) outputs.append(output)
if use_tqdm: if use_tqdm:
total_toks += (sum(
len(stp.token_ids) for stp in output.outputs))
spd = total_toks / pbar.format_dict["elapsed"]
pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
pbar.update(1) pbar.update(1)
if use_tqdm: if use_tqdm:
pbar.close() pbar.close()
@@ -256,4 +264,4 @@ class LLM:
# This is necessary because some requests may be finished earlier than # This is necessary because some requests may be finished earlier than
# its previous requests. # its previous requests.
outputs = sorted(outputs, key=lambda x: int(x.request_id)) outputs = sorted(outputs, key=lambda x: int(x.request_id))
return outputs return outputs