[CI] Organizing performance benchmark files (#7616)

Kuntai Du 2024-08-19 22:43:54 -07:00 committed by GitHub
parent f4fc7337bf
commit 3d8a5f063d
5 changed files with 37 additions and 25 deletions


@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
 Nightly benchmark will be triggered when:
-- Every commit for those PRs with `nightly-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 ## Performance benchmark details
 
-See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 #### Latency test
@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
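To make the conversion described above concrete, here is a minimal, self-contained sketch of how a test's `parameters` object could be turned into dashed command-line flags. The helper name `json2args` and the exact jq expression are assumptions for illustration; the real suite script may implement the conversion differently.

```bash
#!/bin/bash
# Sketch only: turn a test's "parameters" JSON object into CLI flags,
# replacing "_" with "-" in the flag names (values are left untouched).
json2args() {
    local json_string=$1
    echo "$json_string" | jq -r \
        'to_entries
         | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
         | join(" ")'
}

params='{"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1,
         "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15}'
json2args "$params"
# prints: --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1
#         --load-format dummy --num-iters-warmup 5 --num-iters 15
```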


@@ -21,7 +21,7 @@ steps:
       containers:
         - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
           command:
-            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
           resources:
             limits:
               nvidia.com/gpu: 8
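For local debugging, the CI step above could in principle be reproduced outside Buildkite along these lines. This is a hedged sketch, not an official workflow: it assumes Docker with the NVIDIA container toolkit, that the CI image is pullable, that `<commit>` is replaced with the commit hash under test, and that an `HF_TOKEN` is exported if the script's `check_hf_token` step needs one.

```bash
# Sketch only: run the benchmark suite in the same CI image locally.
docker run --rm --gpus all \
    -e HF_TOKEN \
    public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:<commit> \
    bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```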


@@ -174,8 +174,8 @@ if __name__ == "__main__":
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,


@@ -37,9 +37,9 @@ check_hf_token() {
 ensure_sharegpt_downloaded() {
     local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
     if [ ! -f "$FILE" ]; then
         wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
     else
         echo "$FILE already exists."
     fi
 }
@@ -68,11 +68,29 @@ wait_for_server() {
     done' && return 0 || return 1
 }
 
-kill_gpu_processes() {
-    # kill all processes on GPU.
-    ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-    ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
+kill_processes_launched_by_current_bash() {
+    # Kill all python processes launched from current bash script
+    current_shell_pid=$$
+    processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+    if [ -n "$processes" ]; then
+        echo "Killing the following processes matching '$1':"
+        echo "$processes"
+        echo "$processes" | xargs kill -9
+    else
+        echo "No processes found matching '$1'."
+    fi
+}
+
+kill_gpu_processes() {
+    ps -aux
+    lsof -t -i:8000 | xargs -r kill -9
+    pkill -f pt_main_thread
+    # this line doesn't work now
+    # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+    pkill -f python3
+    pkill -f /usr/bin/python3
 
     # wait until GPU memory usage smaller than 1GB
     while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
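The hunk above cuts off inside the GPU-memory wait loop. For readability, a plausible completion of that loop is sketched below; the loop body is an assumption (only the condition appears in the diff), on the premise that the script simply sleeps and re-polls `nvidia-smi` until less than 1 GB is in use.

```bash
# Sketch only: wait until the first GPU reports under 1 GB of used memory.
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
done
```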
@@ -82,11 +100,6 @@ kill_gpu_processes() {
 
     # remove vllm config file
     rm -rf ~/.config/vllm
 
-    # Print the GPU memory usage
-    # so that we know if all GPU processes are killed.
-    gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-    # The memory usage should be 0 MB.
-    echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
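For context, the new `kill_processes_launched_by_current_bash` helper introduced above takes a pattern as its first argument and only targets children of the current shell whose command matches that pattern. A hypothetical invocation (not shown anywhere in this diff) might look like:

```bash
# Hypothetical usage: terminate python processes that this script itself started,
# leaving unrelated python processes on the machine alone.
kill_processes_launched_by_current_bash "python"
```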
upload_to_buildkite() { upload_to_buildkite() {
@@ -104,7 +117,7 @@ upload_to_buildkite() {
     fi
 
     # Use the determined command to annotate and upload artifacts
-    $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+    $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
     $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
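The `fi` and the "determined command" comment above refer to logic earlier in `upload_to_buildkite` (outside this hunk) that selects how to invoke the Buildkite agent. A rough sketch of what such a selection could look like is below; the fallback path, variable handling, and early return are assumptions for illustration, not the script's actual code.

```bash
# Sketch only: pick a buildkite-agent binary if one is available, otherwise skip
# the annotation/upload step gracefully (assumes this runs inside the function).
if command -v buildkite-agent >/dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -x /workspace/buildkite-agent ]; then
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
    echo "buildkite-agent not found; skipping annotation and artifact upload."
    return 0
fi
```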
@@ -156,7 +169,7 @@ run_latency_tests() {
             latency_command: $latency,
             gpu_type: $gpu
         }')
-        echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+        echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
         # run the benchmark
         eval "$latency_command"
@@ -166,7 +179,6 @@ run_latency_tests() {
     done
 }
 
-
 run_throughput_tests() {
     # run throughput tests using `benchmark_throughput.py`
     # $1: a json file specifying throughput test cases
@@ -214,7 +226,7 @@ run_throughput_tests() {
             throughput_command: $command,
             gpu_type: $gpu
         }')
-        echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+        echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
         # run the benchmark
         eval "$throughput_command"
@@ -246,7 +258,6 @@ run_serving_tests() {
             continue
         fi
 
-
         # get client and server arguments
         server_params=$(echo "$params" | jq -r '.server_parameters')
         client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -324,7 +335,7 @@ run_serving_tests() {
                 client_command: $client,
                 gpu_type: $gpu
             }')
-            echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
         done
@@ -341,6 +352,7 @@ main() {
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
+    (which lsof) || (apt-get update && apt-get install -y lsof)
 
     # get the current IP address, required by benchmark_serving.py
     export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -359,7 +371,6 @@ main() {
     run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
     run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
-
     # postprocess benchmarking results
     pip install tabulate pandas
     python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py