From c85b80c2b64d0f420aaca59679e5f38f71a8a53e Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Fri, 8 Dec 2023 09:53:47 -0800
Subject: [PATCH] [Docker] Add cuda arch list as build option (#1950)

---
 Dockerfile                                    | 6 +++++-
 docs/source/serving/deploying_with_docker.rst | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index b1be5fb9..13c2e609 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,11 +30,15 @@ COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py
 
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
-ENV MAX_JOBS=$max_jobs
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+
 RUN python3 setup.py build_ext --inplace
 
 # image to run unit testing suite
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 3afefecc..7ec76963 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -31,6 +31,14 @@ You can build and run vLLM from source via the provided dockerfile. To build vLL
 
     $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
 
+
+.. note::
+
+    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+    current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
+    for vLLM to find the current GPU type and build for that.
+
+
 To run vLLM:
 
 .. code-block:: console
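
Example usage of the new build argument (a sketch; the ``8.6`` arch value below is only illustrative, substitute the compute capability of your target GPU):

    # build only for one compute capability instead of the full default list
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
        --build-arg torch_cuda_arch_list="8.6" \
        --build-arg max_jobs=8 --build-arg nvcc_threads=2

    # or pass an empty list so the build detects the GPU on the build machine
    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai \
        --build-arg torch_cuda_arch_list=""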