diff --git a/Dockerfile b/Dockerfile
index 364345d6..3db86adf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,8 +4,21 @@
 #################### BASE BUILD IMAGE ####################
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
 
+# Set DEBIAN_FRONTEND to noninteractive to avoid interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Preconfigure tzdata for US Central Time (the build runs in us-central-1, but the zone itself doesn't matter)
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Chicago' | debconf-set-selections
+
+# Install an older Python (3.8) here to verify that vLLM still works with older Python versions.
+# The actual OpenAI-compatible server image will use the latest Python.
 RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt-get update -y \
+    && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip git \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 8c9a7ad3..191142d2 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,7 +1,7 @@
 import asyncio
 import time
 from fastapi import Request
-from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional
+from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple
 from vllm.logger import init_logger
 from vllm.utils import random_uuid
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -19,8 +19,8 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing
 
 logger = init_logger(__name__)
 
-TypeTokenIDs = list[int]
-TypeTopLogProbs = List[Optional[dict[int, float]]]
+TypeTokenIDs = List[int]
+TypeTopLogProbs = List[Optional[Dict[int, float]]]
 TypeCreateLogProbsFn = Callable[
     [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs]
 
@@ -29,7 +29,7 @@ async def completion_stream_generator(
     request: CompletionRequest,
     raw_request: Request,
     on_abort,
-    result_generator: AsyncIterator[tuple[int, RequestOutput]],
+    result_generator: AsyncIterator[Tuple[int, RequestOutput]],
     create_logprobs_fn: TypeCreateLogProbsFn,
     request_id: str,
     created_time: int,
@@ -126,7 +126,7 @@ async def completion_stream_generator(
     yield "data: [DONE]\n\n"
 
 
-def parse_prompt_format(prompt) -> tuple[bool, list]:
+def parse_prompt_format(prompt) -> Tuple[bool, list]:
     # get the prompt, openai supports the following
     # "a string, array of strings, array of tokens, or array of token arrays."
     prompt_is_tokens = False
@@ -151,7 +151,7 @@ def parse_prompt_format(prompt) -> tuple[bool, list]:
 
 
 def request_output_to_completion_response(
-    final_res_batch: list[RequestOutput],
+    final_res_batch: List[RequestOutput],
     request: CompletionRequest,
     create_logprobs_fn: TypeCreateLogProbsFn,
     request_id: str,
@@ -302,7 +302,7 @@ class OpenAIServingCompletion(OpenAIServing):
         except ValueError as e:
             return self.create_error_response(str(e))
 
-        result_generator: AsyncIterator[tuple[
+        result_generator: AsyncIterator[Tuple[
            int, RequestOutput]] = merge_async_iterators(*generators)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
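
Context for the typing changes above: PEP 585 built-in generics such as list[int], dict[int, float], and tuple[int, RequestOutput] are only subscriptable at runtime on Python 3.9+, so importing serving_completion.py under the Python 3.8 interpreter installed by the Dockerfile change raises a TypeError; the typing.List / typing.Dict / typing.Tuple aliases work on both. A minimal standalone sketch (illustrative alias names only, not vLLM code):

    # Illustration only: why list[int]-style annotations break on Python 3.8.
    import sys
    from typing import Dict, List, Optional, Tuple

    if sys.version_info < (3, 9):
        try:
            bad_alias = list[int]  # PEP 585 generic: fine on 3.9+, TypeError on 3.8
        except TypeError as exc:
            print(f"built-in generics unavailable on {sys.version_info[:2]}: {exc}")

    # The typing aliases evaluate the same way on 3.8 and 3.9+, which is what the
    # diff switches the module-level aliases and annotations to.
    TokenIDs = List[int]
    TopLogProbs = List[Optional[Dict[int, float]]]
    StreamItem = Tuple[int, str]
    print(TokenIDs, TopLogProbs, StreamItem)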