From d03d64fd2e22f1a48e7b78c66d7644e6b6230fb7 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 4 Apr 2024 21:53:16 -0700
Subject: [PATCH] [CI/Build] refactor dockerfile & fix pip cache

[CI/Build] fix pip cache with vllm_nccl & refactor dockerfile to build
wheels (#3859)
---
 .buildkite/test-pipeline.yaml |  2 +-
 Dockerfile                    | 89 ++++++++++++++++++++---------------
 docs/source/conf.py           |  3 --
 3 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ee384c27..7ad3386f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -90,7 +90,7 @@ steps:
   - bash run-benchmarks.sh
 
 - label: Documentation Build
-  working_dir: "/vllm-workspace/docs"
+  working_dir: "/vllm-workspace/test_docs/docs"
   no_gpu: True
   commands:
   - pip install -r requirements-docs.txt
diff --git a/Dockerfile b/Dockerfile
index f2f5e513..71c0646b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@
 # to run the OpenAI compatible server.
 
 #################### BASE BUILD IMAGE ####################
+# prepare basic build environment
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
@@ -34,7 +35,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
 
 
-#################### EXTENSION BUILD IMAGE ####################
+#################### WHEEL BUILD IMAGE ####################
 FROM dev AS build
 
 # install build dependencies
@@ -45,14 +46,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
 
-# copy input files
+# files and directories related to building wheels
 COPY csrc csrc
 COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm
 
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -65,7 +66,15 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    python3 setup.py build_ext --inplace
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist
+
+# the `vllm_nccl` package must be installed from the source distribution;
+# pip is smart enough to cache the wheel it builds, and other CI jobs
+# would then install that cached wheel directly, which is not what we want,
+# so we need to remove it from the cache manually
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip cache remove vllm_nccl*
 #################### EXTENSION Build IMAGE ####################
 
 #################### FLASH_ATTENTION Build IMAGE ####################
@@ -85,57 +94,59 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
 #################### FLASH_ATTENTION Build IMAGE ####################
 
 
+#################### vLLM installation IMAGE ####################
+# image with vLLM installed
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+WORKDIR /vllm-workspace
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git vim
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.1/compat/
+
+# install the vllm wheel first, so that torch etc. will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install dist/*.whl --verbose
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+#################### vLLM installation IMAGE ####################
+
+
 #################### TEST IMAGE ####################
 # image to run unit testing suite
-FROM dev AS test
+# note that this uses vllm installed by `pip`
+FROM vllm-base AS test
 
-# copy pytorch extensions separately to avoid having to rebuild
-# when python code changes
-WORKDIR /vllm-workspace
-# ADD is used to preserve directory structure
 ADD . /vllm-workspace/
-COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
 
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 
-# ignore build dependencies installation because we are using pre-complied extensions
-RUN rm pyproject.toml
-RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
-#################### TEST IMAGE ####################
-
-#################### RUNTIME BASE IMAGE ####################
-# We used base cuda image because pytorch installs its own cuda libraries.
-# However pynccl depends on cuda libraries so we had to switch to the runtime image
-# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
-
-# libnccl required for ray
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
-
-WORKDIR /workspace
-COPY requirements.txt requirements.txt
+# install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-dev.txt
 
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-
-#################### RUNTIME BASE IMAGE ####################
+# the doc build requires the source code
+# we hide it inside `test_docs/`, so that this source code
+# will not be imported by other tests
+RUN mkdir test_docs
+RUN mv docs test_docs/
+RUN mv vllm test_docs/
+#################### TEST IMAGE ####################
 
 
 #################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai
+
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install accelerate hf_transfer modelscope
 
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
-
 ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 5619ea21..44cda7c9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -11,13 +11,10 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
 import logging
-import os
 import sys
 
 from sphinx.ext import autodoc
 
-sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
-
 logger = logging.getLogger(__name__)
 
 # -- Project information -----------------------------------------------------
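
Note on the refactored Dockerfile: the `build` stage now produces a wheel, the slim `vllm-base` stage installs it, and both the `test` and `vllm-openai` stages derive from `vllm-base`. A rough sketch of how the stages might be exercised locally (the target names come from the Dockerfile above; the image tags and the max_jobs value are illustrative assumptions, not part of this patch):

    # wheel-producing stage only (ccache and pip caches are BuildKit cache mounts)
    DOCKER_BUILDKIT=1 docker build --target build --build-arg max_jobs=8 -t vllm-wheel .

    # test image: installs vLLM from the wheel, then moves docs/ and vllm/ into test_docs/
    DOCKER_BUILDKIT=1 docker build --target test -t vllm-test .

    # runtime image with the OpenAI-compatible server entrypoint
    DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm-openai .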
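
Note on the pip cache fix: because the pip cache is a BuildKit cache mount shared across builds, pip keeps the wheel it builds for vllm_nccl, and later CI jobs would install that cached wheel instead of going through the source distribution, so the wheel build stage now drops the entry explicitly. A minimal sketch of the same check done by hand (illustrative commands, assuming a pip new enough to have the `pip cache` subcommands; not part of this patch):

    # list any wheels pip has cached for vllm_nccl
    pip cache list 'vllm_nccl*'

    # remove them so the next install builds from the source distribution again
    pip cache remove 'vllm_nccl*'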