[BugFix] Fix test breakages from transformers 4.45 upgrade (#8829)

Nick Hill 2024-09-27 00:46:43 +01:00 committed by GitHub
parent 71d21c73ab
commit 4b377d6feb
13 changed files with 62 additions and 49 deletions

View File

@@ -83,7 +83,6 @@ steps:
 - label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
-  soft_fail: true
   fast_check: true
   mirror_hardwares: [amd]
   source_file_dependencies:
@@ -96,7 +95,8 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -178,7 +178,6 @@ steps:
   - pytest -v -s prefix_caching

 - label: Samplers Test # 18min
-  soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -206,7 +205,6 @@ steps:
 - label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
-  soft_fail: true
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -311,7 +309,6 @@ steps:
   - pytest -v -s models/decoder_only/language

 - label: Decoder-only Multi-Modal Models Test # 56min
-  soft_fail: true
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -463,7 +460,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional

View File

@@ -699,7 +699,6 @@ class VllmRunner:
         if videos is not None:
             for i, video in enumerate(videos):
                 inputs[i]["multi_modal_data"] = {"video": video}

-        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)

View File

@@ -8,8 +8,6 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 import os

 import pytest
-from packaging import version
-from transformers import __version__ as transformers_version

 from vllm.logger import init_logger
@@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
         pytest.skip("Skipping multi-node pipeline parallel test for "
                     "multiprocessing distributed backend")

-    # Skip tests that require transformers>=4.45.0
-    if "Qwen2-VL" in MODEL_NAME and version.parse(
-            transformers_version) < version.parse("4.45.0.dev0"):
-        pytest.skip("This test requires transformers>=4.45.0")
-
     pp_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",

View File

@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):


 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor(model, tmpdir):
+def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
     try:
         assert not os.path.exists(".marker")

@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):


 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor_async(model, tmpdir):
+def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
     try:
         assert not os.path.exists(".marker")
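(Side note on the fixture swap above: pytest's tmp_path fixture yields a pathlib.Path, whereas the legacy tmpdir fixture yields a py.path.local object. A minimal sketch of the newer fixture in use; the test and file names are illustrative, not taken from the diff.)

    # Sketch: tmp_path is a per-test pathlib.Path created by pytest.
    from pathlib import Path

    def test_writes_marker(tmp_path: Path) -> None:
        marker = tmp_path / ".marker"      # illustrative file name
        marker.write_text("done")
        assert marker.exists()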

View File

@@ -15,6 +15,11 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]


+@dataclass
+class MockHFConfig:
+    model_type: str = "any"
+
+
 @dataclass
 class MockModelConfig:
     tokenizer = MODEL_NAME
@@ -24,6 +29,7 @@ class MockModelConfig:
     tokenizer_revision = None
     embedding_mode = False
     multimodal_config = MultiModalConfig()
+    hf_config = MockHFConfig()


 @dataclass
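(Reviewer note: the mock config gains an hf_config attribute, presumably because the chat-serving path now inspects something like model_config.hf_config.model_type; that access pattern is an assumption on my part, not shown in this diff. A minimal sketch of the same stubbing pattern with illustrative names:)

    # Sketch: dataclass stubs for nested config objects in unit tests.
    from dataclasses import dataclass, field

    @dataclass
    class StubHFConfig:
        model_type: str = "any"

    @dataclass
    class StubModelConfig:
        tokenizer: str = "dummy-model"   # illustrative value
        hf_config: StubHFConfig = field(default_factory=StubHFConfig)

    cfg = StubModelConfig()
    assert cfg.hf_config.model_type == "any"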

View File

@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
                                          lora_request)


-def test_get_lora_tokenizer(sql_lora_files, tmpdir):
+def test_get_lora_tokenizer(sql_lora_files, tmp_path):
     lora_request = None
     tokenizer = get_lora_tokenizer(lora_request)
     assert not tokenizer
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
     tokenizer = get_lora_tokenizer(lora_request)
     assert tokenizer.get_added_vocab()

-    lora_request = LoRARequest("1", 1, str(tmpdir))
+    lora_request = LoRARequest("1", 1, str(tmp_path))
     tokenizer = get_lora_tokenizer(lora_request)
     assert not tokenizer

View File

@@ -3,7 +3,6 @@
 Run `pytest tests/models/test_granite.py`.
 """

 import pytest
-import transformers

 from ...utils import check_logprobs_close
@@ -12,9 +11,6 @@ MODELS = [
 ]


-# GraniteForCausalLM will be in transformers >= 4.45
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="granite model test requires transformers >= 4.45")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])

View File

@@ -1,7 +1,6 @@
 from typing import List, Optional, Tuple, Type, overload

 import pytest
-import transformers
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer

 from vllm.multimodal.utils import (rescale_video_size, resize_video,
@@ -158,8 +157,6 @@ def run_test(
     )


-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
     )


-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "sizes",

View File

@@ -1,7 +1,6 @@
 from typing import List, Optional, Tuple, Type, overload

 import pytest
-import transformers
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                           BatchEncoding)

@@ -166,8 +165,6 @@ def run_video_test(
     )


-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
     )


-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "sizes",
@@ -259,7 +254,9 @@ def run_image_test(
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
                      dtype=dtype,
-                     max_model_len=32768,
+                     max_num_seqs=1,
+                     max_model_len=16384,
+                     gpu_memory_utilization=0.98,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
@@ -305,8 +302,8 @@ def run_image_test(
     )


-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
+# FIXME: Swap to a smaller model for this architecture
+@pytest.mark.skip(reason="Model OOMing on CI")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])

View File

@@ -1,15 +1,9 @@
 import pytest
-import transformers

 from vllm.model_executor.models import _MODELS, ModelRegistry


 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
-    if (model_cls in ("LlavaOnevisionForConditionalGeneration",
-                      "Qwen2VLForConditionalGeneration")
-            and transformers.__version__ < "4.45"):
-        pytest.skip("Waiting for next transformers release")
-
     # Ensure all model classes can be imported successfully
     ModelRegistry.resolve_model_cls([model_cls])

View File

@@ -1,5 +1,6 @@
 import itertools
 import random
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 from unittest.mock import Mock, patch

@@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     generation_config = GenerationConfig(top_k=top_k,
                                          top_p=top_p,
                                          do_sample=True)
-    warpers = generation_model._get_logits_warper(generation_config, device)
-    assert len(warpers) == 2  # top_p and top_k
+    @dataclass
+    class MockConfig:
+        is_encoder_decoder: bool = False
+
+    generation_model.config = MockConfig()  # needed by the following method
+    generation_model._prepare_special_tokens(generation_config, device=device)
+
+    processors = generation_model._get_logits_processor(generation_config,
+                                                         None,
+                                                         None,
+                                                         None, [],
+                                                         device=device)
+    assert len(processors) == 2  # top_p and top_k

     seq_group_metadata_list: List[SequenceGroupMetadata] = []
     seq_lens: List[int] = []
@@ -639,7 +651,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):

     assert sample_probs is not None

-    hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
+    hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone())
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
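(Reviewer note on the hunk above: the test moves off the private _get_logits_warper helper and builds its HF reference via _get_logits_processor, which here also requires _prepare_special_tokens and a config exposing is_encoder_decoder. A minimal sketch of getting a comparable top-k/top-p reference from transformers' public processor classes instead of the private helper; values and tensor shapes are illustrative assumptions.)

    # Sketch: top-k/top-p reference built from transformers' public
    # logits-processor classes rather than GenerationMixin internals.
    import torch
    from transformers import (LogitsProcessorList, TopKLogitsWarper,
                              TopPLogitsWarper)

    top_k, top_p = 10, 0.9                 # illustrative values
    fake_logits = torch.randn(2, 32000)    # (batch, vocab) - illustrative shape

    processors = LogitsProcessorList(
        [TopKLogitsWarper(top_k), TopPLogitsWarper(top_p)])
    # A LogitsProcessorList is callable as (input_ids, scores) -> scores.
    filtered = processors(torch.zeros(2, 1, dtype=torch.long),
                          fake_logits.clone())
    probs = torch.softmax(filtered, dim=-1)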

View File

@@ -152,13 +152,13 @@ class OpenAIServingChat(OpenAIServing):
                 **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
-            logger.error("Error in applying chat template from request: %s", e)
+            logger.exception("Error in applying chat template from request")
             return self.create_error_response(str(e))

         try:
             mm_data = await mm_data_future
         except Exception as e:
-            logger.error("Error in loading multi-modal data: %s", e)
+            logger.exception("Error in loading multi-modal data")
             return self.create_error_response(str(e))

         # validation for OpenAI tools
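(Reviewer note: switching from logger.error to logger.exception keeps the failure detail without interpolating the exception into the message, since logging.Logger.exception logs at ERROR level and appends the active traceback when called from an except block. A minimal sketch; the logger name and error are illustrative.)

    import logging

    logger = logging.getLogger("example")          # illustrative name

    try:
        raise ValueError("bad chat template")      # illustrative failure
    except ValueError:
        # Logs at ERROR level and includes the full traceback.
        logger.exception("Error in applying chat template from request")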

View File

@@ -1,6 +1,7 @@
 import os
 import warnings
 from pathlib import Path
+from types import MethodType
 from typing import Optional, Union

 import huggingface_hub
@@ -152,6 +153,29 @@ def get_tokenizer(
         else:
             raise e

+    # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
+    if type(tokenizer).__name__ in ("ChatGLMTokenizer",
+                                    "ChatGLM4Tokenizer"):
+        assert isinstance(tokenizer, PreTrainedTokenizer)
+        orig_pad = tokenizer._pad
+
+        # Patch _pad method to accept `padding_side`
+        def _pad(
+            self: PreTrainedTokenizer,
+            *args,
+            padding_side: Optional[str] = None,
+            **kwargs,
+        ):
+            if (padding_side is not None
+                    and padding_side != self.padding_side):
+                msg = ("`padding_side` argument is not supported by "
+                       "ChatGLMTokenizer and will be ignored.")
+                warnings.warn(msg, stacklevel=2)
+
+            return orig_pad(*args, **kwargs)
+
+        tokenizer._pad = MethodType(_pad, tokenizer)
+
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
         logger.warning(
             "Using a slow tokenizer. This might cause a significant "
@@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
         return None
     try:
         tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
-    except OSError as e:
+    except Exception as e:
         # No tokenizer was found in the LoRA folder,
         # use base model tokenizer
         logger.warning(
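(Reviewer note on the _pad patch above: types.MethodType binds a plain function to a single object, so the override applies to that tokenizer instance only and the class itself stays untouched. A minimal standalone sketch of the pattern; the class and names below are illustrative, not from the diff.)

    # Sketch: per-instance method patching with types.MethodType.
    from types import MethodType


    class Greeter:
        def greet(self) -> str:
            return "hello"


    g = Greeter()
    orig_greet = g.greet                   # bound original, reused below


    def greet(self: Greeter) -> str:
        # Wrap rather than replace the original behaviour.
        return orig_greet().upper()


    g.greet = MethodType(greet, g)         # only this instance is patched
    assert g.greet() == "HELLO"
    assert Greeter().greet() == "hello"    # other instances unaffected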