[VLM][Frontend] Proper Image Prompt Formatting from OpenAI API (#6091)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 3a86b54fb0
parent f666207161
@@ -127,6 +127,16 @@ class OpenAIServingChat(OpenAIServing):
         return self.tokenizer.decode(image_token_id)
 
+    # TODO: Let user specify how to insert image tokens into prompt
+    # (similar to chat template)
+    def _get_full_image_text_prompt(self, image_token_str: str,
+                                    text_prompt: str) -> str:
+        """Combine image and text prompts for vision language model"""
+
+        # NOTE: For now we assume all model architectures use the same
+        # image + text prompt format. This may change in the future.
+        return f"{image_token_str}\n{text_prompt}"
+
     def _parse_chat_message_content_parts(
         self,
         role: str,
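A minimal sketch of what the new helper produces, assuming a LLaVA-style model whose placeholder token is "<image>" (in the actual code the token string is decoded from the tokenizer, as shown above); this is a standalone illustration, not the vLLM method itself:

# Illustration of the format returned by _get_full_image_text_prompt.
# "<image>" is an assumed placeholder token, not taken from any specific model.
def get_full_image_text_prompt(image_token_str: str, text_prompt: str) -> str:
    # Image token first, then the user's text on the following line.
    return f"{image_token_str}\n{text_prompt}"

print(get_full_image_text_prompt("<image>", "What is in this picture?"))
# Output:
# <image>
# What is in this picture?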
@@ -146,15 +156,6 @@ class OpenAIServingChat(OpenAIServing):
                         "Multiple 'image_url' input is currently not supported."
                     )
 
-                image_token_str = self.image_token_str
-                if image_token_str is not None:
-                    if any(image_token_str in text for text in texts):
-                        logger.warning(
-                            "Detected image token string in the text prompt. "
-                            "Skipping prompt formatting.")
-                    else:
-                        texts.append(image_token_str)
-
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]
 
@@ -169,6 +170,20 @@ class OpenAIServingChat(OpenAIServing):
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
+
+        if mm_futures:
+            image_token_str = self.image_token_str
+            if image_token_str is not None:
+                if image_token_str in text_prompt:
+                    logger.warning(
+                        "Detected image token string in the text prompt. "
+                        "Skipping prompt formatting.")
+                else:
+                    text_prompt = self._get_full_image_text_prompt(
+                        image_token_str=image_token_str,
+                        text_prompt=text_prompt,
+                    )
+
         messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
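For context, this is the kind of OpenAI-style chat request the reworked parsing path serves. The sketch below uses the official openai Python client against a locally running vLLM OpenAI-compatible server; the server URL, model name, and image URL are placeholders, not part of this change:

# Hypothetical client call; base_url, model, and image URL are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/cat.png"},
            },
        ],
    }],
)
print(response.choices[0].message.content)

With this change the server first joins the text parts, then prepends the model's image token only when a multi-modal future was registered, instead of appending the token while iterating over the content parts.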
@@ -238,7 +253,9 @@ class OpenAIServingChat(OpenAIServing):
         try:
             if len(mm_futures):
                 # since we support only single mm data currently
-                assert len(mm_futures) == 1
+                assert len(
+                    mm_futures
+                ) == 1, "Multiple 'image_url' input is currently not supported."
                 mm_data = await mm_futures[0]
         except Exception as e:
             logger.error("Error in loading multi-modal data: %s", e)
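The reworded assertion can be exercised in isolation. The sketch below mirrors the message from the diff but is otherwise a standalone illustration with made-up names, not the vLLM code path:

# Standalone sketch of the single-image guard; only the assertion message
# comes from the diff, everything else is an assumption for illustration.
import asyncio

async def load_single_mm_data(mm_futures):
    assert len(
        mm_futures
    ) == 1, "Multiple 'image_url' input is currently not supported."
    return await mm_futures[0]

async def fake_image():
    return {"image": "<decoded image placeholder>"}

# One pending future succeeds; passing two would raise AssertionError
# with the message above.
print(asyncio.run(load_single_mm_data([fake_image()])))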