From 0370afa2e55c4934e8d38fa1dfe22e7a8e64345a Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 19 Jun 2023 11:12:37 +0800 Subject: [PATCH] Remove benchmark_async_llm_server.py (#155) --- benchmarks/benchmark_async_llm_server.py | 60 ------------------------ 1 file changed, 60 deletions(-) delete mode 100644 benchmarks/benchmark_async_llm_server.py diff --git a/benchmarks/benchmark_async_llm_server.py b/benchmarks/benchmark_async_llm_server.py deleted file mode 100644 index 2878e166..00000000 --- a/benchmarks/benchmark_async_llm_server.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import json -import threading -import time - -import requests - - -def main(args: argparse.Namespace): - prompts = [f"Tell me a story with more than {''.join([str(i+1)] * 5)} words" - for i in range(args.n_threads)] - - api_url = f"http://{args.host}:{args.port}/generate" - headers = {"User-Agent": "vLLM Benchmark Client"} - ploads = [{ - "prompt": p, - "max_tokens": args.max_tokens, - "temperature": 0.0, - "ignore_eos": True, - } for p in prompts] - - def send_request(results, i): - response = requests.post(api_url, headers=headers, json=ploads[i], - stream=True) - results[i] = response - - # use args.n_threads to prompt the backend - tik = time.time() - threads = [] - results = [None] * args.n_threads - for i in range(args.n_threads): - t = threading.Thread(target=send_request, args=(results, i)) - t.start() - threads.append(t) - - for t in threads: - t.join() - - print(f"Time (POST): {time.time() - tik} s") - n_words = 0 - - for i, response in enumerate(results): - k = list(response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0")) - response_new_words = json.loads(k[-2].decode("utf-8"))["text"][0] - n_words += len(response_new_words.split(" ")) - len(prompts[i].split(" ")) - - time_seconds = time.time() - tik - print(f"Time (total): {time_seconds:.3f}s to finish, n_threads: {args.n_threads}, " - f"throughput: {n_words / time_seconds} words/s.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--max-tokens", type=int, default=128) - parser.add_argument("--n-threads", type=int, default=128) - args = parser.parse_args() - - main(args)