From 59230ef32b0b9132ea9a6ea39d8e823574657a87 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 16 Oct 2024 04:20:51 -0700
Subject: [PATCH] [Misc] Consolidate example usage of OpenAI client for
 multimodal models (#9412)

Co-authored-by: DarkLight1337
---
 docs/source/models/vlm.rst                   |   2 +-
 examples/openai_api_client_for_multimodal.py | 236 +++++++++++++++++++
 examples/openai_audio_api_client.py          |  90 -------
 examples/openai_vision_api_client.py         | 126 ----------
 4 files changed, 237 insertions(+), 217 deletions(-)
 create mode 100644 examples/openai_api_client_for_multimodal.py
 delete mode 100644 examples/openai_audio_api_client.py
 delete mode 100644 examples/openai_vision_api_client.py

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index a3ee5da0..7dd42ec1 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -241,7 +241,7 @@ To consume the server, you can use the OpenAI client like in the example below:
 
     print("Chat completion output:", chat_response.choices[0].message.content)
 
-A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
+A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.
 
 .. note::
 
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_api_client_for_multimodal.py
new file mode 100644
index 00000000..704236be
--- /dev/null
+++ b/examples/openai_api_client_for_multimodal.py
@@ -0,0 +1,236 @@
+"""An example showing how to use vLLM to serve multimodal models
+and run online inference with the OpenAI client.
+
+Launch the vLLM server with the following command:
+
+(single image inference with Llava)
+vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+
+(multi-image inference with Phi-3.5-vision-instruct)
+vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
+    --trust-remote-code --limit-mm-per-prompt image=2
+
+(audio inference with Ultravox)
+vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
+"""
+import base64
+
+import requests
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+from vllm.utils import FlexibleArgumentParser
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+
+def encode_base64_content_from_url(content_url: str) -> str:
+    """Encode content retrieved from a remote URL into base64 format."""
+
+    with requests.get(content_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+
+# Text-only inference
+def run_text_only() -> None:
+    chat_completion = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": "What's the capital of France?"
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion.choices[0].message.content
+    print("Chat completion output:", result)
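+
+
+# A minimal sketch of the same text-only request with streaming enabled.
+# `stream=True` and the delta-chunk format are standard OpenAI client
+# behavior; the function name here is illustrative and is not wired into
+# the example map below.
+def run_text_only_streaming() -> None:
+    stream = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": "What's the capital of France?"
+        }],
+        model=model,
+        max_tokens=64,
+        stream=True,
+    )
+
+    # Print tokens as they arrive; each chunk carries an incremental delta.
+    for chunk in stream:
+        if chunk.choices and chunk.choices[0].delta.content is not None:
+            print(chunk.choices[0].delta.content, end="", flush=True)
+    print()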
+
+
+# Single-image input inference
+def run_single_image() -> None:
+
+    ## Use image url in the payload
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from image url:", result)
+
+    ## Use base64 encoded image in the payload
+    image_base64 = encode_base64_content_from_url(image_url)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{image_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded image:", result)
+
+
+# Multi-image input inference
+def run_multi_image() -> None:
+    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
+    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What are the animals in these images?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_duck
+                    },
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_lion
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output:", result)
+
+
+# Audio input inference
+def run_audio() -> None:
+    # Any format supported by librosa is supported
+    audio_url = AudioAsset("winning_call").url
+
+    # Use audio url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
+
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        # Any format supported by librosa is supported
+                        "url": f"data:audio/ogg;base64,{audio_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded audio:", result)
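+
+
+# A minimal sketch of encoding a *local* file instead of fetching a remote
+# URL, mirroring the base64 request above. The helper name and the file path
+# in the usage note are hypothetical, not part of vLLM's API.
+def encode_base64_content_from_file(path: str) -> str:
+    """Encode a local file into base64 format."""
+    with open(path, "rb") as f:
+        return base64.b64encode(f.read()).decode('utf-8')
+
+
+# Usage sketch (assuming 'my_recording.ogg' exists next to this script):
+#   audio_base64 = encode_base64_content_from_file("my_recording.ogg")
+# then pass f"data:audio/ogg;base64,{audio_base64}" as the audio_url, as in
+# run_audio above.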
+
+
+example_function_map = {
+    "text-only": run_text_only,
+    "single-image": run_single_image,
+    "multi-image": run_multi_image,
+    "audio": run_audio,
+}
+
+
+def main(args) -> None:
+    chat_type = args.chat_type
+    example_function_map[chat_type]()
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using the OpenAI client for online inference '
+        'with multimodal language models served with vLLM.')
+    parser.add_argument(
+        '--chat-type',
+        '-c',
+        type=str,
+        default="single-image",
+        choices=["text-only", "single-image", "multi-image", "audio"],
+        help='Conversation type with multimodal data.')
+    args = parser.parse_args()
+    main(args)
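+
+# Example invocation (the matching server from the module docstring must
+# already be running; the audio case assumes the Ultravox launch command):
+#
+#   python openai_api_client_for_multimodal.py --chat-type audio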
diff --git a/examples/openai_audio_api_client.py b/examples/openai_audio_api_client.py
deleted file mode 100644
index 80a97268..00000000
--- a/examples/openai_audio_api_client.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""An example showing how to use vLLM to serve VLMs.
-
-Launch the vLLM server with the following command:
-vllm serve fixie-ai/ultravox-v0_3
-"""
-import base64
-
-import requests
-from openai import OpenAI
-
-from vllm.assets.audio import AudioAsset
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-# Any format supported by librosa is supported
-audio_url = AudioAsset("winning_call").url
-
-# Use audio url in the payload
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-
-result = chat_completion_from_url.choices[0].message.content
-print(f"Chat completion output:{result}")
-
-
-# Use base64 encoded audio in the payload
-def encode_audio_base64_from_url(audio_url: str) -> str:
-    """Encode an audio retrieved from a remote url to base64 format."""
-
-    with requests.get(audio_url) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode('utf-8')
-
-    return result
-
-
-audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
-chat_completion_from_base64 = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    # Any format supported by librosa is supported
-                    "url": f"data:audio/ogg;base64,{audio_base64}"
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-
-result = chat_completion_from_base64.choices[0].message.content
-print(f"Chat completion output:{result}")
diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py
deleted file mode 100644
index 71ae03e4..00000000
--- a/examples/openai_vision_api_client.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""An example showing how to use vLLM to serve VLMs.
-
-Launch the vLLM server with the following command:
-
-(single image inference with Llava)
-vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
-
-(multi-image inference with Phi-3.5-vision-instruct)
-vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
-    --trust-remote-code --limit-mm-per-prompt image=2
-"""
-import base64
-
-import requests
-from openai import OpenAI
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-# Single-image input inference
-image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-
-## Use image url in the payload
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-
-result = chat_completion_from_url.choices[0].message.content
-print("Chat completion output:", result)
-
-
-## Use base64 encoded image in the payload
-def encode_image_base64_from_url(image_url: str) -> str:
-    """Encode an image retrieved from a remote url to base64 format."""
-
-    with requests.get(image_url) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode('utf-8')
-
-    return result
-
-
-image_base64 = encode_image_base64_from_url(image_url=image_url)
-chat_completion_from_base64 = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": f"data:image/jpeg;base64,{image_base64}"
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-
-result = chat_completion_from_base64.choices[0].message.content
-print(f"Chat completion output:{result}")
-
-# Multi-image input inference
-image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What are the animals in these images?"
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url_duck
-                },
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url_lion
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_tokens=64,
-)
-
-result = chat_completion_from_url.choices[0].message.content
-print("Chat completion output:", result)