{# Buildkite CI pipeline template (Jinja). Iterates over a `steps` list supplied at render time. #}
{# Image tag that the build step pushes and that the Kubernetes / Docker plugin steps run. #}
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{# Directory a test step changes into when the step itself defines no working_dir. #}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
# Log in to the public ECR registry, build the `test` target of the image, and
# push it under the tag held in the docker_image template variable.
- label: ":docker: build image"
  agents:
    queue: cpu_queue
  commands:
  - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
  - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
  - "docker push {{ docker_image }}"
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    # Retry up to 5 times per listed exit status.
    # NOTE(review): both codes carry the same "Agent was lost" remark in the
    # original; confirm what -10 denotes on this agent stack.
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 5
    - exit_status: -10  # Agent was lost
      limit: 5
# Barrier: later steps without an explicit depends_on start only after the
# image build above has finished.
- wait
# One mirrored step per entry of `steps` that lists "amd" in mirror_hardwares.
# NOTE(review): if no step matches, the nested `steps:` key renders empty.
- group: "AMD Tests"
  # `~` (null) declares an explicit empty dependency for the group.
  depends_on: ~
  steps:
  {% for step in steps %}
  {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
  - label: "AMD: {{ step.label }}"
    agents:
      queue: amd
    {# The whole `command or commands` expression is parenthesised so that `safe` applies to either operand. #}
    command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ (step.command or (step.commands | join(" ; "))) | safe }}"
    env:
      DOCKER_BUILDKIT: "1"
    priority: 100
    # A failing AMD mirror is reported but does not fail the build.
    soft_fail: true
  {% endif %}
  {% endfor %}
# Runs the Neuron test script on the neuron queue.
- label: "Neuron Test"
  # `~` (null) declares an explicit empty dependency for this step.
  depends_on: ~
  agents:
    queue: neuron
  command: bash .buildkite/run-neuron-test.sh
  soft_fail: false
# Runs the CPU test script on the intel queue.
- label: "Intel Test"
  # `~` (null) declares an explicit empty dependency for this step.
  depends_on: ~
  agents:
    queue: intel
  command: bash .buildkite/run-cpu-test.sh
{# Emit one pipeline step per entry of `steps`: a Kubernetes pod for a100 steps, the Docker plugin otherwise. #}
{% for step in steps %}
{% if step.gpu == "a100" %}
- label: "{{ step.label }}"
  agents:
    queue: a100-queue
  soft_fail: {{ step.soft_fail or false }}
  {% if step.parallelism %}
  parallelism: {{ step.parallelism }}
  {% endif %}
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 5
    - exit_status: -10  # Agent was lost
      limit: 5
  plugins:
  - kubernetes:
      podSpec:
        priorityClassName: ci
        containers:
        - image: "{{ docker_image }}"
          command: ["bash"]
          args:
          - '-c'
          {# The inner single quotes are part of the argument value and are kept exactly as in the original. #}
          - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | safe }}'"
          resources:
            limits:
              nvidia.com/gpu: 8
          volumeMounts:
          - name: devshm
            mountPath: /dev/shm
          env:
          - name: VLLM_USAGE_SOURCE
            value: ci-test
          - name: HF_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-token-secret
                key: token
        nodeSelector:
          nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
        volumes:
        {# Memory-backed emptyDir mounted at /dev/shm inside the container. #}
        - name: devshm
          emptyDir:
            medium: Memory
{% else %}
- label: "{{ step.label }}"
  agents:
    {# Queue choice: docs build, then CPU-only steps, then 2- or 4-GPU steps, then everything else. #}
    {% if step.label == "Documentation Build" %}
    queue: small_cpu_queue
    {% elif step.no_gpu %}
    queue: cpu_queue
    {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
    queue: gpu_4_queue
    {% else %}
    queue: gpu_1_queue
    {% endif %}
  soft_fail: {{ step.soft_fail or false }}
  {% if step.parallelism %}
  parallelism: {{ step.parallelism }}
  {% endif %}
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 5
    - exit_status: -10  # Agent was lost
      limit: 5
  plugins:
  - docker#v5.2.0:
      image: "{{ docker_image }}"
      always-pull: true
      propagate-environment: true
      {% if not step.no_gpu %}
      gpus: all
      {% endif %}
      {% if step.label == "Benchmarks" %}
      mount-buildkite-agent: true
      {% endif %}
      command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | safe }}"]
      environment:
      - VLLM_USAGE_SOURCE=ci-test
      - HF_TOKEN
      {% if step.label == "Speculative decoding tests" %}
      - VLLM_ATTENTION_BACKEND=XFORMERS
      {% endif %}
      volumes:
      - /dev/shm:/dev/shm
{% endif %}
{% endfor %}