# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See the different options below for examples.

# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
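
# For reference, a step under `steps:` has roughly the following shape; the field
# names are taken from the steps below, and the values here are illustrative only:
#
# - label: My New Test                    # name shown in the Buildkite UI
#   working_dir: "/vllm-workspace/tests"  # optional working directory
#   num_gpus: 2                           # optional number of GPUs for the step
#   fast_check: true                      # optional, include the step in the fast-check run
#   mirror_hardwares: [amd]               # optional, mirror the step on the listed hardware
#   commands:
#   - pytest -v -s my_new_test.py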
steps:
- label: Async Engine, Inputs, Utils, Worker Test
  fast_check: true
  fast_check_only: true
  commands:
  - pytest -v -s async_engine # Async Engine
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Metrics, Tracing Test
  fast_check: true
  fast_check_only: true
  commands:
  - pytest -v -s metrics # Metrics
  - "pip install \
      opentelemetry-sdk \
      opentelemetry-api \
      opentelemetry-exporter-otlp \
      opentelemetry-semantic-conventions-ai" # Tracing
  - pytest -v -s tracing

- label: Regression Test
  mirror_hardwares: [amd]
  fast_check: true
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  #mirror_hardwares: [amd]
  command: pytest -v -s async_engine

- label: Basic Correctness Test
  mirror_hardwares: [amd]
  fast_check: true
  commands:
  # This flashinfer installation will fail on AMD ROCm, so it is set as optional.
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
  mirror_hardwares: [amd]
  fast_check: true
  commands:
  - pytest -v -s core
  - pytest -v -s distributed/test_parallel_state.py

- label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
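
# Note on the multi-node step below: `commands` is a list of per-node command lists;
# the first nested list is run on the first node and the second on the second node
# (the dispatch of each list to its node is presumably handled by the pipeline template).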
- label: 2 Node Tests (4 GPUs in total)
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

- label: Distributed Tests (2 GPUs)
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (4 GPUs)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  commands:
  - pytest -v -s distributed/test_pynccl.py
  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Pipeline Parallelism Test
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  commands:
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: Engine Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: Entrypoints Test
  fast_check: true
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s entrypoints/llm
  - pytest -v -s entrypoints/openai

- label: Examples Test
  working_dir: "/vllm-workspace/examples"
  mirror_hardwares: [amd]
  commands:
  # install tensorizer for tensorize_vllm_model.py
  - pip install awscli tensorizer
  - python3 offline_inference.py
  - python3 cpu_offload.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 offline_inference_vision_language.py
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
  #mirror_hardwares: [amd]
  commands:
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
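
# The kernel tests below are sharded: `parallelism: 4` starts four parallel Buildkite jobs,
# and each job runs its slice via $$BUILDKITE_PARALLEL_JOB / $$BUILDKITE_PARALLEL_JOB_COUNT
# (the %N in the label is presumably expanded to the shard index by the pipeline template).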
- label: Kernels Test %N
  #mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Models Test
  #mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s models -m vlm

- label: Prefix Caching Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test
  #mirror_hardwares: [amd]
  command: pytest -v -s samplers

- label: LogitsProcessor Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_logits_processor.py

- label: Utils Test
  commands:
  - pytest -v -s test_utils.py
  - pytest -v -s test_embedded_commit.py

- label: Worker Test
  mirror_hardwares: [amd]
  command: pytest -v -s worker

- label: Speculative decoding tests
  #mirror_hardwares: [amd]
  commands:
  # See https://github.com/vllm-project/vllm/issues/5152
  - export VLLM_ATTENTION_BACKEND=XFORMERS
  - pytest -v -s spec_decode

- label: LoRA Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: LoRA Long Context (Distributed)
  #mirror_hardwares: [amd]
  num_gpus: 4
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  commands:
  # FIXIT: find out which code initializes CUDA before running the test;
  # until that is fixed, we need to use spawn to run it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
  #mirror_hardwares: [amd]
  fast_check: true
  commands:
  - apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s tensorizer_loader

- label: Metrics Test
  mirror_hardwares: [amd]
  command: pytest -v -s metrics

- label: Quantization Test
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

- label: Tracing Test
  commands:
  - "pip install \
      opentelemetry-sdk \
      opentelemetry-api \
      opentelemetry-exporter-otlp \
      opentelemetry-semantic-conventions-ai"
  - pytest -v -s tracing

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: LM Eval Small Models
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1
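
# The step below targets A100 agents (`gpu: a100`) and uses 4 GPUs; the -t flag passed to
# run-tests.sh presumably sets the tensor-parallel size to match `num_gpus`.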
- label: LM Eval Large Models
  gpu: a100
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4

- label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html

- label: Distributed Tests (A100)
  gpu: a100
  num_gpus: 4
  commands:
  # NOTE: don't test the llama model here; the HF implementation seems to be buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s -x lora/test_mixtral.py