From c85b80c2b64d0f420aaca59679e5f38f71a8a53e Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Fri, 8 Dec 2023 09:53:47 -0800
Subject: [PATCH] [Docker] Add cuda arch list as build option (#1950)

---
 Dockerfile                                    | 6 +++++-
 docs/source/serving/deploying_with_docker.rst | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index b1be5fb9..13c2e609 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,11 +30,15 @@ COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py
 
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
-ENV MAX_JOBS=$max_jobs
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+
 RUN python3 setup.py build_ext --inplace
 
 # image to run unit testing suite
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 3afefecc..7ec76963 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -31,6 +31,14 @@ You can build and run vLLM from source via the provided dockerfile. To build vLL
 
     $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
 
+
+.. note::
+
+    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+    current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
+    for vLLM to find the current GPU type and build for that.
+
+
 To run vLLM:
 
 .. code-block:: console
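
Example usage of the new build argument (a sketch; the ``8.6`` arch value below is only illustrative, substitute the compute capability of your target GPU):

    # build only for one compute capability instead of the full default list
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
        --build-arg torch_cuda_arch_list="8.6" \
        --build-arg max_jobs=8 --build-arg nvcc_threads=2

    # or pass an empty list so the build detects the GPU on the build machine
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
        --build-arg torch_cuda_arch_list=""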