fix python 3.8 syntax (#2716)
parent 923797fea4
commit b9e96b17de

Dockerfile (15 changes)
Dockerfile
@@ -4,8 +4,21 @@
 #################### BASE BUILD IMAGE ####################
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
 
+# Set the DEBIAN_FRONTEND variable to noninteractive to avoid interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Preconfigure tzdata for US Central Time (build running in us-central-1 but this really doesn't matter.)
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Chicago' | debconf-set-selections
+
+# We install an older version of python here for testing to make sure vllm works with older versions of Python.
+# For the actual openai compatible server, we will use the latest version of Python.
 RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt-get update -y \
+    && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip git \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
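The Dockerfile change installs Python 3.8 from the deadsnakes PPA and points the default python3 at it via update-alternatives, so the dev stage exercises vLLM on an older interpreter. A small script along the following lines could be run inside the built image to confirm the switch took effect; it is an illustrative sketch, not part of this commit.

# sanity_check_python.py -- hypothetical helper, not part of this commit.
# Confirms that the interpreter reachable as `python3` reports version 3.8,
# i.e. that the update-alternatives switch in the Dockerfile took effect.
import subprocess
import sys


def default_python_version():
    # `python3 --version` prints e.g. "Python 3.8.18"; keep major.minor only.
    out = subprocess.run(["python3", "--version"],
                         capture_output=True, text=True, check=True)
    return tuple(int(part) for part in out.stdout.split()[1].split(".")[:2])


if __name__ == "__main__":
    version = default_python_version()
    print(f"default python3 reports {version[0]}.{version[1]}")
    sys.exit(0 if version == (3, 8) else 1)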
@@ -1,7 +1,7 @@
 import asyncio
 import time
 from fastapi import Request
-from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional
+from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple
 from vllm.logger import init_logger
 from vllm.utils import random_uuid
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -19,8 +19,8 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing
 
 logger = init_logger(__name__)
 
-TypeTokenIDs = list[int]
-TypeTopLogProbs = List[Optional[dict[int, float]]]
+TypeTokenIDs = List[int]
+TypeTopLogProbs = List[Optional[Dict[int, float]]]
 TypeCreateLogProbsFn = Callable[
     [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs]
 
@@ -29,7 +29,7 @@ async def completion_stream_generator(
         request: CompletionRequest,
         raw_request: Request,
         on_abort,
-        result_generator: AsyncIterator[tuple[int, RequestOutput]],
+        result_generator: AsyncIterator[Tuple[int, RequestOutput]],
         create_logprobs_fn: TypeCreateLogProbsFn,
         request_id: str,
         created_time: int,
@@ -126,7 +126,7 @@ async def completion_stream_generator(
     yield "data: [DONE]\n\n"
 
 
-def parse_prompt_format(prompt) -> tuple[bool, list]:
+def parse_prompt_format(prompt) -> Tuple[bool, list]:
     # get the prompt, openai supports the following
     # "a string, array of strings, array of tokens, or array of token arrays."
     prompt_is_tokens = False
@@ -151,7 +151,7 @@ def parse_prompt_format(prompt) -> tuple[bool, list]:
 
 
 def request_output_to_completion_response(
-        final_res_batch: list[RequestOutput],
+        final_res_batch: List[RequestOutput],
         request: CompletionRequest,
         create_logprobs_fn: TypeCreateLogProbsFn,
         request_id: str,
@@ -302,7 +302,7 @@ class OpenAIServingCompletion(OpenAIServing):
         except ValueError as e:
             return self.create_error_response(str(e))
 
-        result_generator: AsyncIterator[tuple[
+        result_generator: AsyncIterator[Tuple[
            int, RequestOutput]] = merge_async_iterators(*generators)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
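These typing edits are the actual Python 3.8 fix (judging by the imports and the OpenAIServingCompletion class, the hunks belong to vLLM's OpenAI completion-serving code). Subscripting the builtin containers, as in list[int], dict[int, float], or tuple[...], is a PEP 585 feature that only works at runtime on Python 3.9 and later, so module-level aliases such as TypeTokenIDs = list[int] raise TypeError as soon as the module is imported on 3.8. The typing.List / Dict / Tuple spellings are equivalent and available on 3.8. A standalone sketch (not taken from the changed file) showing the failure mode and the portable form:

# pep585_demo.py -- standalone sketch of the incompatibility this commit fixes.
import sys
from typing import Dict, List, Optional, Tuple

# Portable spellings: work on Python 3.8 and later.
TypeTokenIDs = List[int]
TypeTopLogProbs = List[Optional[Dict[int, float]]]
PromptFormat = Tuple[bool, list]  # illustrative alias, not from the diff

if sys.version_info >= (3, 9):
    # Builtin generics (PEP 585) are only subscriptable from 3.9 onward.
    TokenIDsBuiltin = list[int]
else:
    # On 3.8, `list[int]` would raise:
    #   TypeError: 'type' object is not subscriptable
    TokenIDsBuiltin = List[int]

print(TypeTokenIDs, TypeTopLogProbs, PromptFormat, TokenIDsBuiltin)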