[Doc] Fix VLM prompt placeholder sample bug (#9170)
parent c8627cd41b
commit dc4aea677a
@@ -25,7 +25,7 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`:
 
 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
 
 .. code-block:: python
 
@@ -34,7 +34,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
 
     # Load the image using PIL.Image
     image = PIL.Image.open(...)
 
     # Single prompt inference
     outputs = llm.generate({
         "prompt": prompt,
@@ -68,7 +68,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
         "prompt": prompt,
         "multi_modal_data": mm_data,
     })
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
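
Taken together, the three hunks above touch one continuous single-image example. For readers following along, here is a self-contained sketch of that flow; the LLaVA model name and the image path are illustrative assumptions, not part of this diff.

.. code-block:: python

    from PIL import Image

    from vllm import LLM

    # Model choice is an assumption for this sketch; any single-image VLM works.
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")

    # The placeholder must follow the prompt format documented on HuggingFace
    # for the chosen model (LLaVA-style shown here).
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

    # "image.jpg" is a stand-in path; substitute your own file.
    image = Image.open("image.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    })

    for o in outputs:
        print(o.outputs[0].text)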
@@ -116,7 +116,7 @@ Instead of passing in a single image, you can pass in a list of images.
 .. code-block:: python
 
     # Refer to the HuggingFace repo for the correct format to use
-    prompt = "<|user|>\n<image_1>\n<image_2>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
+    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
 
     # Load the images using PIL.Image
     image1 = PIL.Image.open(...)
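
The corrected ``<|image_1|>``/``<|image_2|>`` placeholders above follow a Phi-3-style prompt format. A runnable sketch of the surrounding multi-image flow is below, under the assumption that the example targets ``microsoft/Phi-3.5-vision-instruct``; the model name, flags, and image paths are illustrative, not confirmed by this diff.

.. code-block:: python

    from PIL import Image

    from vllm import LLM

    # limit_mm_per_prompt lifts the default cap of one image per prompt.
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",  # assumed model for this sketch
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
    )

    # Note the corrected <|image_1|> / <|image_2|> placeholders from this fix.
    prompt = ("<|user|>\n<|image_1|>\n<|image_2|>\n"
              "What is the content of each image?<|end|>\n<|assistant|>\n")

    # Stand-in paths; pass images in the same order as the placeholders.
    image1 = Image.open("image_1.jpg")
    image2 = Image.open("image_2.jpg")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": [image1, image2]},
    })

    for o in outputs:
        print(o.outputs[0].text)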
@@ -135,11 +135,11 @@ Instead of passing in a single image, you can pass in a list of images.
 
 A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
 
 Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
 
 .. code-block:: python
 
     # Specify the maximum number of frames per video to be 4. This can be changed.
     llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
 
     # Create the request payload.
@@ -157,7 +157,7 @@ Multi-image input can be extended to perform video captioning. We show this with
 
     # Perform inference and log output.
     outputs = llm.chat([message])
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
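
Between the two hunks above, the example builds a chat ``message`` whose content mixes one text part with a base64 ``image_url`` part per sampled frame. A minimal sketch of that payload step, assuming pre-sampled JPEG frames and a hypothetical ``encode_image`` helper:

.. code-block:: python

    import base64
    from io import BytesIO

    from PIL import Image

    def encode_image(frame: Image.Image) -> str:
        """Hypothetical helper: serialize a PIL frame to a base64 JPEG string."""
        buf = BytesIO()
        frame.convert("RGB").save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    # Stand-in paths; keep at most 4 frames to match limit_mm_per_prompt above.
    video_frames = [Image.open(f"frame_{i}.jpg") for i in range(4)]

    message = {
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Describe this set of frames. Consider the frames to be "
                    "a part of the same video.",
        }],
    }
    for frame in video_frames:
        message["content"].append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(frame)}"},
        })

    # llm.chat([message]) then runs inference, as shown in the hunk above.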