[Bugfix] Fix offline mode when using mistral_common (#9457)
commit 337ed76671
parent 0c9a5258f9
@@ -1,50 +1,56 @@
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
-import weakref
 
 import pytest
 
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 
-MODEL_NAME = "facebook/opt-125m"
+MODEL_CONFIGS = [
+    {
+        "model": "facebook/opt-125m",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.20,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+    },
+    {
+        "model": "mistralai/Mistral-7B-Instruct-v0.1",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.95,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+        "tokenizer_mode": "mistral",
+    },
+]
 
 
 @pytest.fixture(scope="module")
-def llm():
-    # pytest caches the fixture so we use weakref.proxy to
-    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.10,
-              enforce_eager=True)
-
-    with llm.deprecate_legacy_api():
-        yield weakref.proxy(llm)
-
-        del llm
-
-    cleanup_dist_env_and_memory()
+def cache_models():
+    # Cache model files first
+    for model_config in MODEL_CONFIGS:
+        LLM(**model_config)
+        cleanup_dist_env_and_memory()
+
+    yield
 
 
 @pytest.mark.skip_global_cleanup
-def test_offline_mode(llm: LLM, monkeypatch):
-    # we use the llm fixture to ensure the model files are in-cache
-    del llm
-
+@pytest.mark.usefixtures("cache_models")
+def test_offline_mode(monkeypatch):
     # Set HF to offline mode and ensure we can still construct an LLM
     try:
         monkeypatch.setenv("HF_HUB_OFFLINE", "1")
         # Need to re-import huggingface_hub and friends to setup offline mode
         _re_import_modules()
         # Cached model files should be used in offline mode
-        LLM(model=MODEL_NAME,
-            max_num_batched_tokens=4096,
-            tensor_parallel_size=1,
-            gpu_memory_utilization=0.20,
-            enforce_eager=True)
+        for model_config in MODEL_CONFIGS:
+            LLM(**model_config)
     finally:
         # Reset the environment after the test
        # NB: Assuming tests are run in online mode
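This first hunk rewrites the HF_HUB_OFFLINE test so that it caches and then re-constructs both an OPT model and a Mistral model (with tokenizer_mode="mistral"), which is the path the bugfix covers. The test relies on a _re_import_modules() helper defined elsewhere in the same file: huggingface_hub (and parts of transformers) evaluate HF_HUB_OFFLINE at import time, so setting the variable with monkeypatch alone is not enough. A minimal sketch of what such a reload can look like (illustrative only; the helper's actual body is not part of this hunk):

import importlib
import sys


def _re_import_modules():
    # Reload every already-imported huggingface_hub / transformers module so
    # that the freshly-set HF_HUB_OFFLINE value is re-read.
    # Sketch only; the real helper in the test file may differ.
    for name in [n for n in list(sys.modules)
                 if n.startswith(("huggingface_hub", "transformers"))]:
        try:
            importlib.reload(sys.modules[name])
        except Exception:
            # Some submodules do not tolerate reloading; skip them.
            pass

The remaining hunks patch the Mistral tokenizer module so that it can resolve tokenizer files from the local cache when the Hub is unreachable.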
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
 
+import huggingface_hub
 from huggingface_hub import HfApi, hf_hub_download
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 # yapf: disable
@@ -24,6 +25,26 @@ class Encoding:
     input_ids: List[int]
 
 
+def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]:
+    repo_cache = os.path.join(
+        huggingface_hub.constants.HF_HUB_CACHE,
+        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+            ["models", *repo_id.split("/")]))
+
+    if revision is None:
+        revision_file = os.path.join(repo_cache, "refs", "main")
+        if os.path.isfile(revision_file):
+            with open(revision_file) as file:
+                revision = file.read()
+
+    if revision:
+        revision_dir = os.path.join(repo_cache, "snapshots", revision)
+        if os.path.isdir(revision_dir):
+            return os.listdir(revision_dir)
+
+    return []
+
+
 def find_tokenizer_file(files: List[str]):
     file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
 
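The new list_local_repo_files helper walks huggingface_hub's on-disk cache instead of calling the Hub API: a cached repo lives under <HF_HUB_CACHE>/models--<org>--<name> (REPO_ID_SEPARATOR is "--" and HF_HUB_CACHE defaults to ~/.cache/huggingface/hub), refs/main stores the commit hash of the most recently downloaded revision, and snapshots/<commit-hash>/ contains the cached files. A short sketch of how the path is resolved for an already-cached model; the import path of the helper is an assumption here:

import os

import huggingface_hub

# Assumed import path for the new helper; adjust to wherever the Mistral
# tokenizer module lives in your checkout.
from vllm.transformers_utils.tokenizers.mistral import list_local_repo_files

repo_id = "mistralai/Mistral-7B-Instruct-v0.1"

# e.g. ~/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1
repo_cache = os.path.join(
    huggingface_hub.constants.HF_HUB_CACHE,
    huggingface_hub.constants.REPO_ID_SEPARATOR.join(
        ["models", *repo_id.split("/")]))
print(repo_cache)

# With revision=None the helper reads refs/main to find the snapshot hash and
# then lists snapshots/<hash>/, e.g. ['config.json', 'tokenizer.model.v3', ...]
print(list_local_repo_files(repo_id, revision=None))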
@@ -90,9 +111,16 @@ class MistralTokenizer:
     @staticmethod
     def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                             revision: Optional[str]) -> str:
-        api = HfApi()
-        repo_info = api.model_info(tokenizer_name)
-        files = [s.rfilename for s in repo_info.siblings]
+        try:
+            hf_api = HfApi()
+            files = hf_api.list_repo_files(repo_id=tokenizer_name,
+                                           revision=revision)
+        except ConnectionError as exc:
+            files = list_local_repo_files(repo_id=tokenizer_name,
+                                          revision=revision)
+
+            if len(files) == 0:
+                raise exc
+
         filename = find_tokenizer_file(files)
 
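With this change, _download_mistral_tokenizer_from_hf first asks the Hub for the repo's file list and falls back to the local cache via list_local_repo_files when that fails, re-raising the original exception only if nothing is cached. When HF_HUB_OFFLINE=1 is set, huggingface_hub rejects network calls with its OfflineModeIsEnabled error, which (in recent releases) is a ConnectionError subclass, so the same except branch covers offline mode. A sketch of the end-to-end behaviour the updated test exercises, assuming the model files were downloaded on an earlier, online run:

# Offline usage mirroring the updated test: the environment variable must be
# in effect before huggingface_hub is first imported (the test reloads the
# modules instead of relying on import order).
import os

os.environ["HF_HUB_OFFLINE"] = "1"

from vllm import LLM

# The Mistral tokenizer file is now resolved from the local snapshot cache
# rather than via a Hub API call.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1",
          tokenizer_mode="mistral",
          enforce_eager=True,
          max_model_len=64)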