[CI] Organizing performance benchmark files (#7616)
parent f4fc7337bf · commit 3d8a5f063d
@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
 
 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
 Nightly benchmark will be triggered when:
-- Every commit for those PRs with `nightly-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 
 ## Performance benchmark details
 
-See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 
 #### Latency test
@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:
 
 In this example:
 - The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command-line arguments; `run-benchmarks-suite.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command-line arguments; `run-performance-benchmarks.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
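The underscore-to-dash conversion described in the hunk above can be pictured with a short bash sketch. The `json2args` helper name and the sample `parameters` object are illustrative assumptions for this note; the actual conversion logic lives in `run-performance-benchmarks.sh` and may be implemented differently.

```bash
#!/bin/bash
# Illustrative sketch: turn a JSON "parameters" object into CLI flags,
# replacing underscores in keys with dashes (requires jq).
json2args() {
  local params=$1
  echo "$params" | jq -r '
    to_entries
    | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
    | join(" ")'
}

# Sample parameters object, mirroring the latency-tests.json example above.
params='{"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1,
         "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15}'
echo "python3 benchmark_latency.py $(json2args "$params")"
# Prints: python3 benchmark_latency.py --model meta-llama/Meta-Llama-3-8B
#         --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```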
@@ -21,7 +21,7 @@ steps:
       containers:
         - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
           command:
-            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
           resources:
             limits:
               nvidia.com/gpu: 8
@@ -174,8 +174,8 @@ if __name__ == "__main__":
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
 
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
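For context on the Python hunk above (from `scripts/convert-results-json-to-markdown.py`): `read_markdown` loads the descriptions file, which then serves as a `str.format` template whose placeholders are filled with the generated result tables. The helper below is a minimal sketch under that assumption, not the exact implementation in the script.

```python
from pathlib import Path


def read_markdown(file: str) -> str:
    """Sketch: return the template's text, or a note if the file is missing."""
    path = Path(file)
    return path.read_text() if path.exists() else f"{file} not found.\n"


# Hypothetical template fragment with a placeholder that results.format() would fill.
template = "## Latency tests\n{latency_tests_markdown_table}\n"
print(template.format(latency_tests_markdown_table="| Model | Mean latency (ms) |\n|---|---|"))
```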
@@ -37,9 +37,9 @@ check_hf_token() {
 ensure_sharegpt_downloaded() {
   local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
   if [ ! -f "$FILE" ]; then
     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
   else
     echo "$FILE already exists."
   fi
 }
 
@@ -68,11 +68,29 @@ wait_for_server() {
   done' && return 0 || return 1
 }
 
-kill_gpu_processes() {
-  # kill all processes on GPU.
-
-  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
+kill_processes_launched_by_current_bash() {
+  # Kill all python processes launched from current bash script
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -n "$processes" ]; then
+    echo "Killing the following processes matching '$1':"
+    echo "$processes"
+    echo "$processes" | xargs kill -9
+  else
+    echo "No processes found matching '$1'."
+  fi
+}
+
+kill_gpu_processes() {
+
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pkill -f pt_main_thread
+  # this line doesn't work now
+  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  pkill -f python3
+  pkill -f /usr/bin/python3
 
   # wait until GPU memory usage smaller than 1GB
   while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
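A quick note on the new `kill_processes_launched_by_current_bash` helper added above: it lists direct children of the current shell (`ppid == $$`) and kills those whose command matches the pattern passed as `$1`. A hypothetical invocation (the pattern here is an assumption for illustration, not taken from this commit):

```bash
# Kill any python3 processes that this script launched and that are still running.
kill_processes_launched_by_current_bash "python3"
```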
@@ -82,11 +100,6 @@ kill_gpu_processes() {
   # remove vllm config file
   rm -rf ~/.config/vllm
 
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
 
 upload_to_buildkite() {
@@ -104,7 +117,7 @@ upload_to_buildkite() {
   fi
 
   # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 
@@ -156,7 +169,7 @@ run_latency_tests() {
       latency_command: $latency,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$latency_command"
@@ -166,7 +179,6 @@ run_latency_tests() {
   done
 }
 
-
 run_throughput_tests() {
   # run throughput tests using `benchmark_throughput.py`
   # $1: a json file specifying throughput test cases
@@ -214,7 +226,7 @@ run_throughput_tests() {
       throughput_command: $command,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$throughput_command"
@@ -246,7 +258,6 @@ run_serving_tests() {
       continue
     fi
 
-
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
     client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -324,7 +335,7 @@ run_serving_tests() {
       client_command: $client,
       gpu_type: $gpu
     }')
-    echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
   done
 
@@ -341,6 +352,7 @@ main() {
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
 
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -359,7 +371,6 @@ main() {
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
-
   # postprocess benchmarking results
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py