From 9db642138b54ef3df81873eac9fe7e15fc2da584 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 27 Aug 2024 23:28:30 +0800
Subject: [PATCH] [CI/Build][VLM] Cleanup multiple images inputs model test
 (#7897)

---
 tests/models/test_minicpmv.py | 136 ++++++------------------
 tests/models/test_phi3v.py    | 141 ++++++++++------------------
 2 files changed, 74 insertions(+), 203 deletions(-)

diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py
index bf72dad0..99e49c14 100644
--- a/tests/models/test_minicpmv.py
+++ b/tests/models/test_minicpmv.py
@@ -1,14 +1,15 @@
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 import pytest
 import torch
 import torch.types
+from PIL import Image
 from transformers import BatchEncoding
 
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
 from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
@@ -24,6 +25,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
     "<|start_header_id|>assistant<|end_header_id|>\n\n",
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = \
+    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
+    "(<image>./</image>)\n(<image>./</image>)\n" \
+    "Describe these images.<|eot_id|>" \
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
 
 models = ["openbmb/MiniCPM-Llama3-V-2_5"]
 
@@ -46,13 +52,14 @@ target_dtype = "half"
 def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], Union[List[Image.Image],
+                                        List[List[Image.Image]]]]],
     model: str,
     *,
-    size_factors: List[float],
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    mm_limit: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -65,12 +72,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
@@ -82,6 +83,7 @@ def run_test(
                      max_model_len=4096,
                      max_num_seqs=1,
                      dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True) as vllm_model:
@@ -93,7 +95,7 @@ def run_test(
                                                 num_logprobs=num_logprobs,
                                                 images=images,
                                                 stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
     hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
@@ -104,7 +106,7 @@ def run_test(
                                                     num_logprobs=num_logprobs,
                                                     images=images,
                                                     tokenizer=tokenizer)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -138,104 +140,26 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                 dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
     run_test(
         hf_runner,
         vllm_runner,
-        image_assets,
+        inputs_per_image,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
        	num_logprobs=num_logprobs,
+        mm_limit=1,
         tensor_parallel_size=1,
     )
 
 
-HF_MULTIIMAGE_IMAGE_PROMPT = \
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
-    "(<image>./</image>)\n(<image>./</image>)\n" \
-    "Describe these images.<|eot_id|>" \
-    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-
-def run_multi_image_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
-    ]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     max_model_len=4096,
-                     max_num_seqs=1,
-                     limit_mm_per_prompt={"image": len(images)},
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
-        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_case
-        ]
-
-    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
-    with hf_model, torch.no_grad():
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    tokenizer=tokenizer)
-            for prompts, images in inputs_per_case
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=[
-                trunc_hf_output(hf_output) for hf_output in hf_outputs
-            ],
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -256,14 +180,22 @@ def run_multi_image_test(
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                              size_factors, dtype: str, max_tokens: int,
                              num_logprobs: int) -> None:
-    run_multi_image_test(
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    run_test(
         hf_runner,
         vllm_runner,
-        image_assets,
+        inputs_per_case,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
+        mm_limit=2,
         tensor_parallel_size=1,
     )
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 259cbe51..e416a85b 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,6 +1,6 @@
 import os
 import re
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 import pytest
 from PIL import Image
@@ -60,13 +60,14 @@ if is_hip():
 def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
-    images: List[Image.Image],
+    inputs: List[Tuple[List[str], Union[List[Image.Image],
+                                        List[List[Image.Image]]]]],
     model: str,
     *,
-    size_factors: List[float],
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    mm_limit: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -79,13 +80,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [
-            rescale_image_size(image, factor, transpose=idx)
-            for idx, factor in enumerate(size_factors)
-        ],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
@@ -97,15 +91,16 @@ def run_test(
                      max_model_len=4096,
                      max_num_seqs=1,
                      dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True) as vllm_model:
-        vllm_outputs_per_image = [
+        vllm_outputs_per_case = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
@@ -113,17 +108,17 @@ def run_test(
     with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
         eos_token_id = hf_model.processor.tokenizer.eos_token_id
-        hf_outputs_per_image = [
+        hf_outputs_per_case = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=images,
                                                     eos_token_id=eos_token_id)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
@@ -156,15 +151,22 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                 dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
     run_test(
         hf_runner,
         vllm_runner,
-        [asset.pil_image for asset in image_assets],
+        inputs_per_image,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
+        mm_limit=1,
         tensor_parallel_size=1,
     )
 
@@ -173,97 +175,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("dtype", [target_dtype])
 def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
                          dtype) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_regresion_7840 = [
+        ([prompt], [image]) for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]
+
     # Regression test for #7840.
     run_test(
         hf_runner,
         vllm_runner,
-        [image_assets[0].pil_image.resize((465, 226))],
+        inputs_regresion_7840,
         model,
-        size_factors=[1.0],
         dtype=dtype,
         max_tokens=128,
         num_logprobs=10,
+        mm_limit=1,
         tensor_parallel_size=1,
     )
 
 
-def run_multi_image_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    images: List[Image.Image],
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-
-    inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
-    ]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     max_model_len=4096,
-                     max_num_seqs=1,
-                     limit_mm_per_prompt={"image": len(images)},
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in inputs_per_case
-        ]
-
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        eos_token_id = hf_model.processor.tokenizer.eos_token_id
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    eos_token_id=eos_token_id)
-            for prompts, images in inputs_per_case
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, model)
-                for vllm_output in vllm_outputs
-            ],
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -280,18 +211,26 @@ def run_multi_image_test(
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_logprobs", [10])
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                              size_factors, dtype: str, max_tokens: int,
                              num_logprobs: int) -> None:
-    run_multi_image_test(
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    run_test(
         hf_runner,
         vllm_runner,
-        [asset.pil_image for asset in image_assets],
+        inputs_per_case,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
+        mm_limit=2,
         tensor_parallel_size=1,
     )
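Note on the consolidated helper (not part of the patch): after this change, `run_test` in both test files takes a prepared list of `(prompts, images)` cases plus an `mm_limit`, so the single-image and multi-image tests differ only in how they build that list and in the limit they pass (1 vs. 2), which `run_test` forwards to `limit_mm_per_prompt={"image": mm_limit}`. The sketch below illustrates the two input shapes under stated assumptions: `PROMPT`, the `_rescale` helper, and the `Image.new` dummies are illustrative stand-ins for the real `HF_IMAGE_PROMPTS`/`HF_MULTIIMAGE_IMAGE_PROMPT`, `vllm.multimodal.utils.rescale_image_size`, and the `IMAGE_ASSETS` fixtures used in the tests.

from typing import List, Tuple

from PIL import Image

SIZE_FACTORS = [0.25, 0.5, 1.0]
PROMPT = "Describe these images."  # stand-in for the model-specific prompt templates


def _rescale(image: Image.Image, factor: float) -> Image.Image:
    # Stand-in for vllm.multimodal.utils.rescale_image_size.
    return image.resize((int(image.width * factor), int(image.height * factor)))


# Stand-ins for the PIL images normally pulled from IMAGE_ASSETS.
images = [Image.new("RGB", (448, 448)), Image.new("RGB", (640, 480))]

# Single-image shape: one (prompts, images) tuple per asset, one image per prompt;
# paired with mm_limit=1 when passed to run_test.
inputs_per_image: List[Tuple[List[str], List[Image.Image]]] = [(
    [PROMPT for _ in SIZE_FACTORS],
    [_rescale(image, factor) for factor in SIZE_FACTORS],
) for image in images]

# Multi-image shape: each prompt gets the full list of assets at one scale;
# paired with mm_limit=2 (one slot per image referenced in the prompt).
inputs_per_case: List[Tuple[List[str], List[List[Image.Image]]]] = [
    ([PROMPT for _ in SIZE_FACTORS],
     [[_rescale(image, factor) for image in images]
      for factor in SIZE_FACTORS]),
]

if __name__ == "__main__":
    for prompts, imgs in inputs_per_image:
        assert len(prompts) == len(imgs)  # one image per prompt
    for prompts, img_lists in inputs_per_case:
        assert all(len(lst) == len(images) for lst in img_lists)  # all assets per prompt
    print("input shapes check out")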