diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c86e41c6..7e179362 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -158,7 +158,7 @@ class OpenAIServingChat(OpenAIServing):
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt)
+                request, prompt=prompt, add_special_tokens=False)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 58a1c2f7..db3fc85d 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
 from typing_extensions import Annotated
@@ -165,13 +165,14 @@ class OpenAIServing:
         raise ValueError(f"The model `{request.model}` does not exist.")
 
     def _validate_prompt_and_tokenize(
-        self,
-        request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
-        prompt: Optional[str] = None,
-        prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> Tuple[List[int], str]:
+            self,
+            request: Union[ChatCompletionRequest, CompletionRequest,
+                           EmbeddingRequest],
+            prompt: Optional[str] = None,
+            prompt_ids: Optional[List[int]] = None,
+            truncate_prompt_tokens: Optional[Annotated[int,
+                                                       Field(ge=1)]] = None,
+            add_special_tokens: bool = True) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
@@ -179,10 +180,19 @@ class OpenAIServing:
                 "Only one of prompt or prompt_ids should be provided.")
 
         if prompt_ids is None:
-            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
-                "truncation": True,
-                "max_length": truncate_prompt_tokens,
+            # When using OpenAIServingChat for chat completions, the
+            # special tokens (e.g., BOS) have already been added by the
+            # chat template. Therefore, we do not need to add them again.
+            # Set add_special_tokens to False to avoid adding the BOS tokens
+            # again.
+            tokenizer_kwargs: Dict[str, Any] = {
+                "add_special_tokens": add_special_tokens
             }
+            if truncate_prompt_tokens is not None:
+                tokenizer_kwargs.update({
+                    "truncation": True,
+                    "max_length": truncate_prompt_tokens,
+                })
             input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
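
For reference, a minimal standalone sketch (not part of the patch) of the duplicated-BOS behavior that passing add_special_tokens=False avoids once a chat template has already rendered the prompt. It uses Hugging Face transformers directly, and the model name is only an illustrative assumption; any tokenizer whose chat template emits a BOS token shows the same effect.

```python
# Sketch only: the model name is an example, not one mandated by this change.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

messages = [{"role": "user", "content": "Hello!"}]
# apply_chat_template renders the prompt string with the BOS token included.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)

with_special = tokenizer(prompt, add_special_tokens=True).input_ids
without_special = tokenizer(prompt, add_special_tokens=False).input_ids

# With add_special_tokens=True the sequence starts with two BOS tokens
# (one from the template, one from the tokenizer); with False, only one.
print(with_special[:3])
print(without_special[:3])
```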