# syntax=docker/dockerfile:1
# Stage "dev": CUDA devel base with runtime + development Python dependencies.
# Used directly for local development and as the parent of "build" and "test".
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# install build and runtime dependencies
# (copied alone first so this layer is cached until requirements.txt changes)
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

# image to build pytorch extensions
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# copy only the inputs setup.py needs, so source-only changes elsewhere
# don't invalidate the (expensive) extension-build layer
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# CUDA architectures to compile for; +PTX gives forward compatibility
# with GPUs newer than the listed compute capabilities
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=${nvcc_threads}
RUN python3 setup.py build_ext --inplace

# Build the megablocks library as wheel because it doesn't publish pre-built wheels.
# https://github.com/stanford-futuredata/megablocks/commit/5897cd6f254b7b3edf7a708a3a3314ecb54b6f78
# NOTE: apt-get update must run in the SAME layer as install — the cached lists
# from the dev stage's earlier layer may be stale (hadolint DL3009).
RUN apt-get update -y \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/* \
    && git clone https://github.com/stanford-futuredata/megablocks.git \
    && cd megablocks \
    && git checkout 5897cd6f254b7b3edf7a708a3a3314ecb54b6f78 \
    && MAX_JOBS=8 NVCC_THREADS=8 python3 setup.py bdist_wheel

# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "pytest", "tests"]

# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base

# libnccl required for ray
RUN apt-get update -y \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# default api server image
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

# EXPOSE is documentation only; the port must still be published at run time
EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate

COPY vllm vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY --from=build /workspace/megablocks/dist/*.whl /tmp/
# Install via glob rather than a hardcoded wheel filename
# (megablocks-0.5.0-cp310-cp310-...), so bumping the pinned commit, version,
# or Python ABI tag in the build stage cannot silently break this layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install /tmp/megablocks-*.whl \
    && rm /tmp/megablocks-*.whl

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]