diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 57ad7bdd..06c82d5e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -127,6 +127,16 @@ class OpenAIServingChat(OpenAIServing):
 
         return self.tokenizer.decode(image_token_id)
 
+    # TODO: Let user specify how to insert image tokens into prompt
+    # (similar to chat template)
+    def _get_full_image_text_prompt(self, image_token_str: str,
+                                    text_prompt: str) -> str:
+        """Combine image and text prompts for vision language model"""
+
+        # NOTE: For now we assume all model architectures use the same
+        # image + text prompt format. This may change in the future.
+        return f"{image_token_str}\n{text_prompt}"
+
     def _parse_chat_message_content_parts(
         self,
         role: str,
@@ -146,15 +156,6 @@ class OpenAIServingChat(OpenAIServing):
                         "Multiple 'image_url' input is currently not supported."
                     )
 
-                image_token_str = self.image_token_str
-                if image_token_str is not None:
-                    if any(image_token_str in text for text in texts):
-                        logger.warning(
-                            "Detected image token string in the text prompt. "
-                            "Skipping prompt formatting.")
-                    else:
-                        texts.append(image_token_str)
-
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]
 
@@ -169,6 +170,20 @@ class OpenAIServingChat(OpenAIServing):
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
+
+        if mm_futures:
+            image_token_str = self.image_token_str
+            if image_token_str is not None:
+                if image_token_str in text_prompt:
+                    logger.warning(
+                        "Detected image token string in the text prompt. "
+                        "Skipping prompt formatting.")
+                else:
+                    text_prompt = self._get_full_image_text_prompt(
+                        image_token_str=image_token_str,
+                        text_prompt=text_prompt,
+                    )
+
         messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
@@ -238,7 +253,9 @@ class OpenAIServingChat(OpenAIServing):
         try:
             if len(mm_futures):
                 # since we support only single mm data currently
-                assert len(mm_futures) == 1
+                assert len(
+                    mm_futures
+                ) == 1, "Multiple 'image_url' input is currently not supported."
             mm_data = await mm_futures[0]
         except Exception as e:
             logger.error("Error in loading multi-modal data: %s", e)