[Bugfix] Fix offline mode when using mistral_common (#9457)
commit 337ed76671
parent 0c9a5258f9
@@ -1,50 +1,56 @@
 """Tests for HF_HUB_OFFLINE mode"""
 import importlib
 import sys
-import weakref
 
 import pytest
 
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 
-MODEL_NAME = "facebook/opt-125m"
+MODEL_CONFIGS = [
+    {
+        "model": "facebook/opt-125m",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.20,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+    },
+    {
+        "model": "mistralai/Mistral-7B-Instruct-v0.1",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.95,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+        "tokenizer_mode": "mistral",
+    },
+]
 
 
 @pytest.fixture(scope="module")
-def llm():
-    # pytest caches the fixture so we use weakref.proxy to
-    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.10,
-              enforce_eager=True)
-
-    with llm.deprecate_legacy_api():
-        yield weakref.proxy(llm)
-
-        del llm
-
-    cleanup_dist_env_and_memory()
+def cache_models():
+    # Cache model files first
+    for model_config in MODEL_CONFIGS:
+        LLM(**model_config)
+        cleanup_dist_env_and_memory()
+
+    yield
 
 
 @pytest.mark.skip_global_cleanup
-def test_offline_mode(llm: LLM, monkeypatch):
-    # we use the llm fixture to ensure the model files are in-cache
-    del llm
-
+@pytest.mark.usefixtures("cache_models")
+def test_offline_mode(monkeypatch):
     # Set HF to offline mode and ensure we can still construct an LLM
     try:
         monkeypatch.setenv("HF_HUB_OFFLINE", "1")
         # Need to re-import huggingface_hub and friends to setup offline mode
         _re_import_modules()
         # Cached model files should be used in offline mode
-        LLM(model=MODEL_NAME,
-            max_num_batched_tokens=4096,
-            tensor_parallel_size=1,
-            gpu_memory_utilization=0.20,
-            enforce_eager=True)
+        for model_config in MODEL_CONFIGS:
+            LLM(**model_config)
     finally:
         # Reset the environment after the test
        # NB: Assuming tests are run in online mode
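This first hunk rewrites the HF_HUB_OFFLINE test so that it caches and then re-constructs both an OPT model and a Mistral model (with tokenizer_mode="mistral"), which is the path the bugfix covers. The test relies on a _re_import_modules() helper defined elsewhere in the same file: huggingface_hub (and parts of transformers) evaluate HF_HUB_OFFLINE at import time, so setting the variable with monkeypatch alone is not enough. A minimal sketch of what such a reload can look like (illustrative only; the helper's actual body is not part of this hunk):

import importlib
import sys


def _re_import_modules():
    # Reload every already-imported huggingface_hub / transformers module so
    # that the freshly-set HF_HUB_OFFLINE value is re-read.
    # Sketch only; the real helper in the test file may differ.
    for name in [n for n in list(sys.modules)
                 if n.startswith(("huggingface_hub", "transformers"))]:
        try:
            importlib.reload(sys.modules[name])
        except Exception:
            # Some submodules do not tolerate reloading; skip them.
            pass

The remaining hunks patch the Mistral tokenizer module so that it can resolve tokenizer files from the local cache when the Hub is unreachable.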
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast
 
+import huggingface_hub
 from huggingface_hub import HfApi, hf_hub_download
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 # yapf: disable
@@ -24,6 +25,26 @@ class Encoding:
     input_ids: List[int]
 
 
+def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]:
+    repo_cache = os.path.join(
+        huggingface_hub.constants.HF_HUB_CACHE,
+        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+            ["models", *repo_id.split("/")]))
+
+    if revision is None:
+        revision_file = os.path.join(repo_cache, "refs", "main")
+        if os.path.isfile(revision_file):
+            with open(revision_file) as file:
+                revision = file.read()
+
+    if revision:
+        revision_dir = os.path.join(repo_cache, "snapshots", revision)
+        if os.path.isdir(revision_dir):
+            return os.listdir(revision_dir)
+
+    return []
+
+
 def find_tokenizer_file(files: List[str]):
     file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
 
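The new list_local_repo_files helper walks huggingface_hub's on-disk cache instead of calling the Hub API: a cached repo lives under <HF_HUB_CACHE>/models--<org>--<name> (REPO_ID_SEPARATOR is "--" and HF_HUB_CACHE defaults to ~/.cache/huggingface/hub), refs/main stores the commit hash of the most recently downloaded revision, and snapshots/<commit-hash>/ contains the cached files. A short sketch of how the path is resolved for an already-cached model; the import path of the helper is an assumption here:

import os

import huggingface_hub

# Assumed import path for the new helper; adjust to wherever the Mistral
# tokenizer module lives in your checkout.
from vllm.transformers_utils.tokenizers.mistral import list_local_repo_files

repo_id = "mistralai/Mistral-7B-Instruct-v0.1"

# e.g. ~/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1
repo_cache = os.path.join(
    huggingface_hub.constants.HF_HUB_CACHE,
    huggingface_hub.constants.REPO_ID_SEPARATOR.join(
        ["models", *repo_id.split("/")]))
print(repo_cache)

# With revision=None the helper reads refs/main to find the snapshot hash and
# then lists snapshots/<hash>/, e.g. ['config.json', 'tokenizer.model.v3', ...]
print(list_local_repo_files(repo_id, revision=None))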
@@ -90,9 +111,16 @@ class MistralTokenizer:
     @staticmethod
     def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                             revision: Optional[str]) -> str:
-        api = HfApi()
-        repo_info = api.model_info(tokenizer_name)
-        files = [s.rfilename for s in repo_info.siblings]
+        try:
+            hf_api = HfApi()
+            files = hf_api.list_repo_files(repo_id=tokenizer_name,
+                                           revision=revision)
+        except ConnectionError as exc:
+            files = list_local_repo_files(repo_id=tokenizer_name,
+                                          revision=revision)
+
+            if len(files) == 0:
+                raise exc
+
         filename = find_tokenizer_file(files)
 
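With this change, _download_mistral_tokenizer_from_hf first asks the Hub for the repo's file list and falls back to the local cache via list_local_repo_files when that fails, re-raising the original exception only if nothing is cached. When HF_HUB_OFFLINE=1 is set, huggingface_hub rejects network calls with its OfflineModeIsEnabled error, which (in recent releases) is a ConnectionError subclass, so the same except branch covers offline mode. A sketch of the end-to-end behaviour the updated test exercises, assuming the model files were downloaded on an earlier, online run:

# Offline usage mirroring the updated test: the environment variable must be
# in effect before huggingface_hub is first imported (the test reloads the
# modules instead of relying on import order).
import os

os.environ["HF_HUB_OFFLINE"] = "1"

from vllm import LLM

# The Mistral tokenizer file is now resolved from the local snapshot cache
# rather than via a Hub API call.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1",
          tokenizer_mode="mistral",
          enforce_eager=True,
          max_model_len=64)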