# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See the different options below for examples.

# This script is fed into the Jinja template `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this test on each commit in the fastcheck pipeline.
# fast_check_only(bool): run this test in the fastcheck pipeline only.
# nightly(bool): run this test in the nightly pipeline only.
# optional(bool): never run this test by default (i.e. it needs to be unblocked manually).
# command(str): the single command to run for the test. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to also run the test on. currently only supports [amd].
# gpu(str): override the GPU selection for the test. defaults to L4 GPUs. currently only supports a100.
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 and 4.
# num_nodes(int): simulate a multi-node setup by launching multiple containers on one host.
#   in this case, commands must be specified: the first command runs on the first host, the
#   second command runs on the second host.
# working_dir(str): the directory in which the commands execute. defaults to /vllm-workspace/tests.
# source_file_dependencies(list): the list of path prefixes that opt the test in. if empty, the test always runs.

# When adding a test
# - If the test belongs to an existing group, add it there.
# - If the test is short, add it to any existing step.
# - If the test takes more than 10min, it is okay to create a new step.
#   Note that all steps execute in parallel.
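# The commented-out step below is an illustrative sketch only (it is never run): it shows how the
# options documented above combine. The label, source paths, and commands are placeholders, not
# real tests in this repository.
#
# - label: Example Feature Test # 12min
#   fast_check: true                       # also run on each commit in the fastcheck pipeline
#   num_gpus: 2                            # request 2 GPUs instead of the default 1
#   working_dir: "/vllm-workspace/tests"   # the default, shown explicitly
#   source_file_dependencies:              # only run when files under these prefixes change
#   - vllm/example_feature
#   - tests/example_feature
#   commands:                              # use `command:` instead for a single command
#   - pip install example-extra            # placeholder setup command
#   - pytest -v -s example_feature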
steps:
##### fast check tests #####

- label: Documentation Build # 2min
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/dev/sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  commands:
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_preemption
  commands:
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
  mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/core
  - vllm/distributed
  - tests/core
  commands:
  - pytest -v -s core

- label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile
  commands:
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Metrics, Tracing Test # 10min
  num_gpus: 2
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing

##### fast check tests #####
##### 1 GPU test #####

- label: Regression Test # 5min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: Examples Test # 15min
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
  - pip install awscli tensorizer # for llava example and tensorizer test
  - python3 offline_inference.py
  - python3 cpu_offload.py
  - python3 offline_inference_chat.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 offline_inference_vision_language.py
  - python3 offline_inference_vision_language_multi_image.py
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference_encoder_decoder.py
  - python3 offline_profile.py --model facebook/opt-125m

- label: Prefix Caching Test # 9min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test # 36min
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/model_executor/layers
  - tests/test_logits_processor
  command: pytest -v -s test_logits_processor.py

- label: Speculative decoding tests # 30min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  commands:
  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py

- label: "PyTorch Fullgraph Test" # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
  - tests/kernels
  commands:
  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Tensorizer Test # 11min
  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s tensorizer_loader

- label: Benchmarks # 9min
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash run-benchmarks.sh

- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: Encoder Decoder tests # 5min
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
  commands:
  - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
  mirror_hardwares: [ amd ]
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s tool_use

##### models test #####

- label: Basic Models Test # 10min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

- label: Decoder-only Language Models Test (Standard) # 18min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  commands:
  - pytest -v -s models/decoder_only/language -m core_model
  - pytest -v -s models/decoder_only/language -m quant_model

- label: Decoder-only Language Models Test (Extended) # 46min
  nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  commands:
  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'

- label: Decoder-only Multi-Modal Models Test (Standard) # 22min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  commands:
  - pytest -v -s models/decoder_only/audio_language -m core_model
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
  # No tests under this group for now
  # - pytest -v -s models/decoder_only/audio_language -m quant_model
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model

- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m
  nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  commands:
  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
  # HACK - run phi3v tests separately to sidestep this transformers bug
  # https://github.com/huggingface/transformers/issues/34307
  - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'

- label: Other Models Test # 20min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/embedding/language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pytest -v -s models/embedding/language
  - pytest -v -s models/embedding/vision_language
  - pytest -v -s models/encoder_decoder/language
  - pytest -v -s models/encoder_decoder/vision_language

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

##### 1 GPU test #####
##### multi gpus test #####

- label: Distributed Comm Ops Test # 7min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 40min
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
  commands:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py

- label: Multi-step Tests (4 GPUs) # 36min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/model_executor/layers/sampler.py
  - vllm/sequence.py
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/multi_step_worker.py
  - vllm/worker/model_runner_base.py
  - vllm/worker/model_runner.py
  - vllm/worker/multi_step_model_runner.py
  - vllm/engine
  - tests/multi_step
  commands:
  - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 45min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA Long Context (Distributed) # 11min
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  num_gpus: 4
  soft_fail: true
  source_file_dependencies:
  - vllm/lora
  - tests/lora/test_long_context
  commands:
  # FIXIT: find out which code initialize cuda before running the test
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s -x lora/test_long_context.py

- label: Weight Loading Multiple GPU Test # 33min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

##### multi gpus test #####
##### A100 test #####

- label: Distributed Tests (A100) # optional
  gpu: a100
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  gpu: a100
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4