# In this file, you can add more tests to run, either by adding a new step or
# by adding a new command to an existing step. See the steps below for
# examples of the different options, and the commented-out template step at
# the end of this file.

# This script will be fed into the Jinja template `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline YAML file.

steps:
- label: Async Engine, Inputs, Utils, Worker Test
  fast_check: true
  fast_check_only: true
  commands:
  - pytest -v -s async_engine # Async Engine
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Metrics, Tracing Test
  fast_check: true
  fast_check_only: true
  commands:
  - pytest -v -s metrics # Metrics
  - "pip install \
      opentelemetry-sdk \
      opentelemetry-api \
      opentelemetry-exporter-otlp \
      opentelemetry-semantic-conventions-ai" # Tracing
  - pytest -v -s tracing

- label: Regression Test
  mirror_hardwares: [amd]
  fast_check: true
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  #mirror_hardwares: [amd]
  command: pytest -v -s async_engine

- label: Basic Correctness Test
  mirror_hardwares: [amd]
  fast_check: true
  commands:
  # This flashinfer installation will fail on AMD ROCm, so it is made optional via `|| true`.
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
  mirror_hardwares: [amd]
  fast_check: true
  commands:
  - pytest -v -s core
  - pytest -v -s distributed/test_parallel_state.py

- label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total)
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  commands:
  - # The following commands are for the first node, with IP 192.168.10.10 (Ray environment already set up).
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # The following commands are for the second node, with IP 192.168.10.11 (Ray environment already set up).
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

- label: Distributed Tests (2 GPUs)
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  # The TEST_DIST_MODEL x DISTRIBUTED_EXECUTOR_BACKEND matrix below runs one
  # command per combination; see the note after this step for extending it.
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
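# Note on the step above: TEST_DIST_MODEL and DISTRIBUTED_EXECUTOR_BACKEND
# parametrize a single pytest file across models and executor backends, so
# covering another combination only needs one more command in the list.
# A hypothetical sketch (the model ID is a placeholder, not a model this
# pipeline currently tests):
# - TEST_DIST_MODEL=<hf-org/model-name> DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py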
- label: Distributed Tests (4 GPUs)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  commands:
  - pytest -v -s distributed/test_pynccl.py
  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
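# To reproduce a single command from these steps outside CI, run it from the
# step's working_dir with the same environment variables. A hypothetical local
# invocation (assumes a vLLM checkout with the tests/ directory and two idle
# GPUs; in CI the working_dir is "/vllm-workspace/tests"):
# cd tests && TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py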
- label: Pipeline Parallelism Test
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  commands:
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: Engine Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # This OOMs in the CI unless we run it separately.
  - pytest -v -s tokenization

- label: Entrypoints Test
  fast_check: true
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s entrypoints/llm
  - pytest -v -s entrypoints/openai

- label: Examples Test
  working_dir: "/vllm-workspace/examples"
  mirror_hardwares: [amd]
  commands:
  # Install tensorizer for tensorize_vllm_model.py.
  - pip install awscli tensorizer
  - python3 offline_inference.py
  - python3 cpu_offload.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 offline_inference_vision_language.py
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
  #mirror_hardwares: [amd]
  commands:
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal

# - label: Kernels Test %N
#   #mirror_hardwares: [amd]
#   commands:
#   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
#   - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
#   parallelism: 4

- label: Models Test
  #mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s models -m vlm

- label: Prefix Caching Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test
  #mirror_hardwares: [amd]
  command: pytest -v -s samplers

- label: LogitsProcessor Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_logits_processor.py

- label: Utils Test
  commands:
  - pytest -v -s test_utils.py
  - pytest -v -s test_embedded_commit.py

- label: Worker Test
  mirror_hardwares: [amd]
  command: pytest -v -s worker

- label: Speculative decoding tests
  #mirror_hardwares: [amd]
  commands:
  # See https://github.com/vllm-project/vllm/issues/5152
  - export VLLM_ATTENTION_BACKEND=XFORMERS
  - pytest -v -s spec_decode

# - label: LoRA Test %N
#   #mirror_hardwares: [amd]
#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
#   parallelism: 4

# - label: LoRA Long Context (Distributed)
#   #mirror_hardwares: [amd]
#   num_gpus: 4
#   # This test runs Llama 13B, so it needs to run on 4 GPUs.
#   commands:
#   # FIXIT: find out which code initializes CUDA before this test runs;
#   # until that is fixed, we need to use spawn to run it.
#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
#   - pytest -v -s -x lora/test_long_context.py
- label: Tensorizer Test
  #mirror_hardwares: [amd]
  fast_check: true
  commands:
  - apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s tensorizer_loader

- label: Metrics Test
  mirror_hardwares: [amd]
  command: pytest -v -s metrics

- label: Quantization Test
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

- label: Tracing Test
  commands:
  - "pip install \
      opentelemetry-sdk \
      opentelemetry-api \
      opentelemetry-exporter-otlp \
      opentelemetry-semantic-conventions-ai"
  - pytest -v -s tracing

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: LM Eval Small Models
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: LM Eval Large Models
  gpu: a100
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4

- label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html

- label: Distributed Tests (A100)
  gpu: a100
  num_gpus: 4
  commands:
  # NOTE: don't test the llama model here; the HF implementation seems to be buggy.
  # See https://github.com/vllm-project/vllm/pull/5689 for details.
  - pytest -v -s distributed/test_custom_all_reduce.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s -x lora/test_mixtral.py
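# Template for adding a new step, using the keys that appear in the steps
# above (commented out so it never runs; all values are placeholders, and the
# exact semantics of each key are defined by the Jinja template linked at the
# top of this file):
# - label: My New Test
#   working_dir: "/vllm-workspace/tests" # optional
#   mirror_hardwares: [amd]              # optional: also run on AMD hardware
#   fast_check: true                     # optional: include in the fast-check run
#   num_gpus: 2                          # optional: GPUs required by the step
#   commands:
#   - pytest -v -s my_new_test           # placeholder test target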