From 00efdc84baf313cb775ca99a011b0e9a13539bdd Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 19 Jan 2024 20:20:19 -0800
Subject: [PATCH] Add benchmark serving to CI (#2505)

---
 .buildkite/run-benchmarks.sh | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index c4e6b21d..dde28cb5 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -1,10 +1,13 @@
 # This script is run by buildkite to run the benchmarks and upload the results to buildkite
 
 set -ex
+set -o pipefail # report the benchmark's status from "cmd | tee", not tee's
 
 # cd into parent directory of this file
 cd "$(dirname "${BASH_SOURCE[0]}")/.."
 
+(command -v wget && command -v curl) || (apt-get update && apt-get install -y wget curl) # install only when missing
+
 # run benchmarks and upload the result to buildkite
 python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
@@ -12,15 +15,36 @@ bench_latency_exit_code=$?
 python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+server_pid=$!
+# stop the server on every exit path; it may already be gone, so ignore kill errors
+trap 'kill "$server_pid" 2>/dev/null || true' EXIT
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+# wait for server to start, timeout after 600 seconds
+timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+bench_serving_exit_code=0 # overwritten below on failure; a bare failing pipeline would abort under set -e
+python3 benchmarks/benchmark_serving.py \
+    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt || bench_serving_exit_code=$?
+
 # write the results into a markdown file
 echo "### Latency Benchmarks" >> benchmark_results.md
-sed -n '1p' benchmark_latency.txt >> benchmark_results.md
+head -n 1 benchmark_latency.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-sed -n '$p' benchmark_latency.txt >> benchmark_results.md
+tail -n 1 benchmark_latency.txt >> benchmark_results.md # last line
+
 echo "### Throughput Benchmarks" >> benchmark_results.md
-sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
+head -n 1 benchmark_throughput.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-sed -n '$p' benchmark_throughput.txt >> benchmark_results.md
+tail -n 1 benchmark_throughput.txt >> benchmark_results.md # last line
+
+echo "### Serving Benchmarks" >> benchmark_results.md
+head -n 1 benchmark_serving.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
 
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -33,3 +57,7 @@ fi
 if [ $bench_throughput_exit_code -ne 0 ]; then
     exit $bench_throughput_exit_code
 fi
+
+# propagate a serving benchmark failure as the script's exit status;
+# reached only when the throughput check above did not exit
+[ "$bench_serving_exit_code" -eq 0 ] || exit "$bench_serving_exit_code"