# In this file, you can add more tests to run either by adding a new step or
# by adding a new command to an existing step. See the steps below for
# examples of the available options.
# This file is fed into the Jinja template in `test-template.j2` to generate
# the final pipeline YAML file.
#
# Keys used by the steps in this file. Their exact semantics are defined by
# `test-template.j2`, which is not part of this file -- confirm there:
#   label             display name of the step
#   command/commands  a single command string, or a list of command strings
#   working_dir       optional; presumably the directory commands run in
#   num_gpus          presumably the number of GPUs the step needs
#   mirror_hardwares  presumably extra hardware targets to also run on
#   parallelism       used together with `%N` in the label and the
#                     BUILDKITE_PARALLEL_JOB* variables to shard a suite
#   no_gpu            presumably marks a step that needs no GPU
#
# NOTE: keep every command as a plain (unquoted) scalar exactly as written;
# sequences such as `\"` and `$$` are passed through literally to the
# template / Buildkite and must not be "cleaned up".

steps:
  - label: Regression Test
    command: pytest -v -s test_regression.py
    working_dir: "/vllm-workspace/tests"  # optional

  - label: AsyncEngine Test
    command: pytest -v -s async_engine

  - label: Basic Correctness Test
    commands:
      - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
      - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
      - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
      - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
      - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

  - label: Core Test
    mirror_hardwares: [amd]
    command: pytest -v -s core

  - label: Distributed Comm Ops Test
    command: pytest -v -s distributed/test_comm_ops.py
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2

  - label: Distributed Tests
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    mirror_hardwares: [amd]
    commands:
      - pytest -v -s distributed/test_pynccl_library.py
      - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
      - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
      - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
      - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
      - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
      - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
      - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
      - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
      - pytest -v -s spec_decode/e2e/test_integration_dist.py

  - label: Distributed Tests (Multiple Groups)
    working_dir: "/vllm-workspace/tests"
    num_gpus: 4
    commands:
      - pytest -v -s distributed/test_pynccl.py

  - label: Engine Test
    # mirror_hardwares: [amd]
    command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

  - label: Entrypoints Test
    commands:
      # These tests have to be separated, because each one will allocate all
      # possible GPU memory.
      - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
      - pytest -v -s entrypoints/test_server_oot_registration.py

  - label: Examples Test
    working_dir: "/vllm-workspace/examples"
    mirror_hardwares: [amd]
    commands:
      # install aws cli for llava_example.py
      # install tensorizer for tensorize_vllm_model.py
      - pip install awscli tensorizer
      - python3 offline_inference.py
      - python3 offline_inference_with_prefix.py
      - python3 llm_engine_example.py
      - python3 llava_example.py
      - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

  # `$$` is kept literally here; presumably it is Buildkite's escape for `$`
  # so the variables are expanded when the job runs -- confirm against the
  # Buildkite docs before changing.
  - label: Kernels Test %N
    command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 4

  - label: Models Test
    # mirror_hardwares: [amd]
    commands:
      - bash ../.buildkite/download-images.sh
      - pytest -v -s models --ignore=models/test_llava.py

  - label: Llava Test
    # mirror_hardwares: [amd]
    commands:
      - bash ../.buildkite/download-images.sh
      - pytest -v -s models/test_llava.py

  - label: Prefix Caching Test
    mirror_hardwares: [amd]
    commands:
      - pytest -v -s prefix_caching

  - label: Samplers Test
    command: pytest -v -s samplers

  - label: LogitsProcessor Test
    mirror_hardwares: [amd]
    command: pytest -v -s test_logits_processor.py

  - label: Worker Test
    mirror_hardwares: [amd]
    command: pytest -v -s worker

  - label: Speculative decoding tests
    # mirror_hardwares: [amd]
    command: pytest -v -s spec_decode

  - label: LoRA Test %N
    command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 4

  - label: Tensorizer Test
    # `-y` keeps apt-get non-interactive: without it the install aborts in CI
    # whenever apt needs confirmation (e.g. extra dependencies are pulled in).
    command: apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader

  - label: Metrics Test
    command: pytest -v -s metrics

  - label: Quantization Test
    command: pytest -v -s quantization

  - label: Benchmarks
    working_dir: "/vllm-workspace/.buildkite"
    mirror_hardwares: [amd]
    commands:
      - pip install aiohttp
      - bash run-benchmarks.sh

  - label: Documentation Build
    working_dir: "/vllm-workspace/test_docs/docs"
    no_gpu: true
    commands:
      - pip install -r requirements-docs.txt
      # The backslash-escaped quotes are part of the command text on purpose;
      # do not quote or unescape this scalar.
      - SPHINXOPTS=\"-W\" make html