[ CI/Build ] LM Eval Harness Based CI Testing (#5838)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
This commit is contained in:
parent
99397da534
commit
75aa1442db
@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.892
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.892
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||||
|
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.756
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.752
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.756
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.752
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
||||||
|
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.616
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.632
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
2
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
2
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
2
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
2
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
Meta-Llama-3-8B-Instruct.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-FP8.yaml
|
||||||
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
|
||||||
|
|
||||||
|
# Print the command-line help for this script to stdout.
# Takes no arguments; the caller decides whether to exit afterwards.
usage() {
    # A bare `echo` emits the blank separator line (the original used an
    # empty backtick command substitution, which only worked by accident).
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -b - batch size to run the evaluation at"
    echo " -l - limit number of samples to run"
    echo " -f - number of fewshot samples to use"
    echo
}
|
||||||
|
|
||||||
|
# Read the command-line flags into MODEL, BATCH_SIZE, LIMIT and FEWSHOT.
# Every flag takes a value; an unrecognised flag prints the help and exits 1.
while getopts "m:b:l:f:" OPT
do
    case "${OPT}" in
        m) MODEL="${OPTARG}" ;;
        b) BATCH_SIZE="${OPTARG}" ;;
        l) LIMIT="${OPTARG}" ;;
        f) FEWSHOT="${OPTARG}" ;;
        \?)
            usage
            exit 1
            ;;
    esac
done
|
||||||
|
|
||||||
|
# All four options are required; stop with a clear message instead of handing
# lm_eval an empty or shifted argument list.
: "${MODEL:?missing -m <model>}"
: "${BATCH_SIZE:?missing -b <batch size>}"
: "${LIMIT:?missing -l <limit>}"
: "${FEWSHOT:?missing -f <num fewshot>}"

# Evaluate GSM8k with the HF transformers backend. Expansions are quoted so a
# local model directory containing spaces is passed as a single argument.
lm_eval --model hf \
    --model_args "pretrained=${MODEL},parallelize=True" \
    --tasks gsm8k --num_fewshot "${FEWSHOT}" --limit "${LIMIT}" \
    --batch_size "${BATCH_SIZE}"
|
||||||
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for vllm.
|
||||||
|
# We use this for fp8, which HF does not support.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install lm-eval==0.4.2
|
||||||
|
|
||||||
|
# Print the command-line help for this script to stdout.
# Takes no arguments; the caller decides whether to exit afterwards.
usage() {
    # A bare `echo` emits the blank separator line (the original used an
    # empty backtick command substitution, which only worked by accident).
    echo
    # This script drives the vllm backend, not HF transformers.
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -b - batch size to run the evaluation at"
    echo " -l - limit number of samples to run"
    echo " -f - number of fewshot samples to use"
    echo " -t - tensor parallel size to run at"
    echo
}
|
||||||
|
|
||||||
|
# Read the command-line flags into MODEL, BATCH_SIZE, LIMIT, FEWSHOT and
# TP_SIZE. Every flag takes a value; an unrecognised flag prints the help and
# exits 1.
while getopts "m:b:l:f:t:" OPT
do
    case "${OPT}" in
        m) MODEL="${OPTARG}" ;;
        b) BATCH_SIZE="${OPTARG}" ;;
        l) LIMIT="${OPTARG}" ;;
        f) FEWSHOT="${OPTARG}" ;;
        t) TP_SIZE="${OPTARG}" ;;
        \?)
            usage
            exit 1
            ;;
    esac
done
|
||||||
|
|
||||||
|
# All five options are required; stop with a clear message instead of handing
# lm_eval an empty or shifted argument list.
: "${MODEL:?missing -m <model>}"
: "${BATCH_SIZE:?missing -b <batch size>}"
: "${LIMIT:?missing -l <limit>}"
: "${FEWSHOT:?missing -f <num fewshot>}"
: "${TP_SIZE:?missing -t <tensor parallel size>}"

# Evaluate GSM8k with the vllm backend. Expansions are quoted so a local model
# directory containing spaces is passed as a single argument.
lm_eval --model vllm \
    --model_args "pretrained=${MODEL},tensor_parallel_size=${TP_SIZE}" \
    --tasks gsm8k --num_fewshot "${FEWSHOT}" --limit "${LIMIT}" \
    --batch_size "${BATCH_SIZE}"
|
||||||
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Print the command-line help for this script to stdout.
# Takes no arguments; the caller decides whether to exit afterwards.
usage() {
    # A bare `echo` emits the blank separator line (the original used an
    # empty backtick command substitution, which only worked by accident).
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    # The example names a list file that actually exists under configs/.
    echo " -c - path to the test data config (e.g. configs/models-small.txt)"
    echo " -t - tensor parallel size"
    echo
}
|
||||||
|
|
||||||
|
# Running total of per-model pytest exit codes; 0 means every model passed.
SUCCESS=0

# Read the command-line flags into CONFIG (list of model config files) and
# TP_SIZE. An unrecognised flag prints the help and exits 1.
while getopts "c:t:" OPT
do
    case "${OPT}" in
        c) CONFIG="${OPTARG}" ;;
        t) TP_SIZE="${OPTARG}" ;;
        \?)
            usage
            exit 1
            ;;
    esac
done
|
||||||
|
|
||||||
|
# Refuse to continue without a readable config list. Otherwise the redirect
# below fails, the loop runs zero times, and the script exits 0 without having
# tested a single model.
if [[ -z "${CONFIG}" || ! -r "${CONFIG}" ]]; then
    echo "error: config list '${CONFIG}' is missing or unreadable" >&2
    usage
    exit 1
fi

# Parse list of configs (one YAML file name per line).
# `read -d ''` reports failure on reaching end-of-file even when the array was
# filled, so its exit status is deliberately ignored.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "${CONFIG}"

if [[ ${#MODEL_CONFIGS[@]} -eq 0 ]]; then
    echo "error: no model configs listed in '${CONFIG}'" >&2
    exit 1
fi

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    # The pytest module reads its inputs from these environment variables.
    export LM_EVAL_TEST_DATA_FILE="$PWD/configs/${MODEL_CONFIG}"
    export LM_EVAL_TP_SIZE="$TP_SIZE"
    # Capture the failure code instead of aborting so that the remaining
    # models still run.
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

# Collapse the accumulated exit codes into a plain pass/fail status.
if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
|
||||||
54
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
54
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
"""
|
||||||
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
|
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
* export LM_EVAL_TP_SIZE=4
|
||||||
|
* pytest -s test_lm_eval_correctness.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import lm_eval
|
||||||
|
import numpy
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Relative tolerance passed to numpy.isclose when comparing scores.
RTOL = 0.02

# Path of the YAML config describing the model, tasks and expected metrics.
# Overridable through the LM_EVAL_TEST_DATA_FILE environment variable.
_DEFAULT_TEST_DATA_FILE = (
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TEST_DATA_FILE = os.getenv("LM_EVAL_TEST_DATA_FILE", _DEFAULT_TEST_DATA_FILE)

# Tensor parallel size interpolated into the vLLM model args.
# NOTE: a string when taken from the environment, the int 1 otherwise.
TP_SIZE = os.getenv("LM_EVAL_TP_SIZE", 1)
|
||||||
|
|
||||||
|
|
||||||
|
def launch_lm_eval(eval_config):
    """Run lm-eval through its vLLM backend for one model config.

    Args:
        eval_config: Mapping loaded from the YAML config. The keys read
            here are ``model_name``, ``tasks`` (a list of mappings with a
            ``name``), ``num_fewshot`` and ``limit``.

    Returns:
        Whatever ``lm_eval.simple_evaluate`` returns for the run.
    """
    # lm-eval takes model arguments as one comma-separated key=value string.
    model_args = ",".join((
        f"pretrained={eval_config['model_name']}",
        f"tensor_parallel_size={TP_SIZE}",
    ))

    task_names = []
    for task in eval_config["tasks"]:
        task_names.append(task["name"])

    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=task_names,
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_lm_eval_correctness():
    """Check measured lm-eval scores against the values stored in the config.

    Loads the YAML file named by ``TEST_DATA_FILE``, runs the evaluation via
    ``launch_lm_eval`` and asserts that every listed metric is within
    ``RTOL`` (relative tolerance) of its recorded value.
    """
    config_text = Path(TEST_DATA_FILE).read_text(encoding="utf-8")
    eval_config = yaml.safe_load(config_text)

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Flatten the task -> metric nesting into (task, metric) pairs, keeping
    # the order in which the config lists them.
    expected_pairs = ((task, metric) for task in eval_config["tasks"]
                      for metric in task["metrics"])

    # Confirm scores match ground truth.
    for task, metric in expected_pairs:
        ground_truth = metric["value"]
        measured_value = results["results"][task["name"]][metric["name"]]
        print(f'{task["name"]} | {metric["name"]}: '
              f'ground_truth={ground_truth} | measured={measured_value}')
        assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||||
@ -197,6 +197,22 @@ steps:
|
|||||||
- pip install aiohttp
|
- pip install aiohttp
|
||||||
- bash run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
|
- label: LM Eval Small Models
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
|
- label: LM Eval Large Models
|
||||||
|
gpu: a100
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||||
|
|
||||||
- label: Documentation Build
|
- label: Documentation Build
|
||||||
working_dir: "/vllm-workspace/test_docs/docs"
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user