[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
parent
e288df0632
commit
16bc0a098f
@ -238,17 +238,25 @@ class LLM:
|
||||
# Initialize tqdm.
|
||||
if use_tqdm:
|
||||
num_requests = self.llm_engine.get_num_unfinished_requests()
|
||||
pbar = tqdm(total=num_requests,
|
||||
desc="Processed prompts",
|
||||
dynamic_ncols=True)
|
||||
pbar = tqdm(
|
||||
total=num_requests,
|
||||
desc="Processed prompts",
|
||||
dynamic_ncols=True,
|
||||
postfix=f"Generation Speed: {0:.2f} toks/s",
|
||||
)
|
||||
# Run the engine.
|
||||
outputs: List[RequestOutput] = []
|
||||
total_toks = 0
|
||||
while self.llm_engine.has_unfinished_requests():
|
||||
step_outputs = self.llm_engine.step()
|
||||
for output in step_outputs:
|
||||
if output.finished:
|
||||
outputs.append(output)
|
||||
if use_tqdm:
|
||||
total_toks += (sum(
|
||||
len(stp.token_ids) for stp in output.outputs))
|
||||
spd = total_toks / pbar.format_dict["elapsed"]
|
||||
pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
|
||||
pbar.update(1)
|
||||
if use_tqdm:
|
||||
pbar.close()
|
||||
@ -256,4 +264,4 @@ class LLM:
|
||||
# This is necessary because some requests may be finished earlier than
|
||||
# their previous requests.
|
||||
outputs = sorted(outputs, key=lambda x: int(x.request_id))
|
||||
return outputs
|
||||
return outputs
|
||||
|
||||
Loading…
Reference in New Issue
Block a user