[VLM][Frontend] Proper Image Prompt Formatting from OpenAI API (#6091)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Authored by Roger Wang on 2024-07-02 23:41:23 -07:00; committed via GitHub.
parent f666207161
commit 3a86b54fb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -127,6 +127,16 @@ class OpenAIServingChat(OpenAIServing):
return self.tokenizer.decode(image_token_id) return self.tokenizer.decode(image_token_id)
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
def _get_full_image_text_prompt(self, image_token_str: str,
text_prompt: str) -> str:
"""Combine image and text prompts for vision language model"""
# NOTE: For now we assume all model architectures use the same
# image + text prompt format. This may change in the future.
return f"{image_token_str}\n{text_prompt}"
def _parse_chat_message_content_parts( def _parse_chat_message_content_parts(
self, self,
role: str, role: str,
@ -146,15 +156,6 @@ class OpenAIServingChat(OpenAIServing):
"Multiple 'image_url' input is currently not supported." "Multiple 'image_url' input is currently not supported."
) )
image_token_str = self.image_token_str
if image_token_str is not None:
if any(image_token_str in text for text in texts):
logger.warning(
"Detected image token string in the text prompt. "
"Skipping prompt formatting.")
else:
texts.append(image_token_str)
image_url = cast(ChatCompletionContentPartImageParam, image_url = cast(ChatCompletionContentPartImageParam,
part)["image_url"] part)["image_url"]
@ -169,6 +170,20 @@ class OpenAIServingChat(OpenAIServing):
raise NotImplementedError(f"Unknown part type: {part_type}") raise NotImplementedError(f"Unknown part type: {part_type}")
text_prompt = "\n".join(texts) text_prompt = "\n".join(texts)
if mm_futures:
image_token_str = self.image_token_str
if image_token_str is not None:
if image_token_str in text_prompt:
logger.warning(
"Detected image token string in the text prompt. "
"Skipping prompt formatting.")
else:
text_prompt = self._get_full_image_text_prompt(
image_token_str=image_token_str,
text_prompt=text_prompt,
)
messages = [ConversationMessage(role=role, content=text_prompt)] messages = [ConversationMessage(role=role, content=text_prompt)]
return ChatMessageParseResult(messages=messages, mm_futures=mm_futures) return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
@ -238,7 +253,9 @@ class OpenAIServingChat(OpenAIServing):
try: try:
if len(mm_futures): if len(mm_futures):
# since we support only single mm data currently # since we support only single mm data currently
assert len(mm_futures) == 1 assert len(
mm_futures
) == 1, "Multiple 'image_url' input is currently not supported."
mm_data = await mm_futures[0] mm_data = await mm_futures[0]
except Exception as e: except Exception as e:
logger.error("Error in loading multi-modal data: %s", e) logger.error("Error in loading multi-modal data: %s", e)