From d6e520e1700f78de2d5efdb8607a76cbab61182e Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Sat, 27 Apr 2024 09:59:55 -0700 Subject: [PATCH] [Core] Support offline use of local cache for models (#4374) Signed-off-by: Prashant Gupta Co-authored-by: Travis Johnson --- tests/model_executor/weight_utils.py | 30 +++++++++- vllm/model_executor/model_loader/loader.py | 5 +- .../model_loader/weight_utils.py | 57 +++++++++++-------- vllm/transformers_utils/tokenizer.py | 2 + 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index b0086dd7..c8b9bed6 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -1,9 +1,12 @@ import os +import tempfile import huggingface_hub.constants import pytest +from huggingface_hub.utils import LocalEntryNotFoundError -from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf, enable_hf_transfer) def test_hf_transfer_auto_activation(): @@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation(): HF_TRANFER_ACTIVE) +def test_download_weights_from_hf(): + with tempfile.TemporaryDirectory() as tmpdir: + # assert LocalEntryNotFoundError error is thrown + # if offline is set and model is not cached + huggingface_hub.constants.HF_HUB_OFFLINE = True + with pytest.raises(LocalEntryNotFoundError): + download_weights_from_hf("facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir) + + # download the model + huggingface_hub.constants.HF_HUB_OFFLINE = False + download_weights_from_hf("facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir) + + # now it should work offline + huggingface_hub.constants.HF_HUB_OFFLINE = True + assert download_weights_from_hf( + "facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir) is not None + + if 
__name__ == "__main__": test_hf_transfer_auto_activation() + test_download_weights_from_hf() diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ad802430..70e64167 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -5,6 +5,7 @@ import os from abc import ABC, abstractmethod from typing import Any, Dict, Generator, List, Optional, Tuple, Type +import huggingface_hub import torch from torch import nn @@ -131,7 +132,9 @@ class DefaultModelLoader(BaseModelLoader): model_path = snapshot_download( model_id=model, cache_dir=self.load_config.download_dir, - revision=revision) + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + revision=revision, + ) else: model_path = model return model_path diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c0905b90..c1abde9a 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -127,11 +127,14 @@ def get_quant_config(model_config: ModelConfig, if not is_local: # Download the config files. 
with get_lock(model_name_or_path, load_config.download_dir): - hf_folder = snapshot_download(model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=load_config.download_dir, - tqdm_class=DisabledTqdm) + hf_folder = snapshot_download( + model_name_or_path, + revision=model_config.revision, + allow_patterns="*.json", + cache_dir=load_config.download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + tqdm_class=DisabledTqdm, + ) else: hf_folder = model_name_or_path @@ -161,12 +164,14 @@ def get_quant_config(model_config: ModelConfig, return quant_cls.from_config(config) -def download_weights_from_hf(model_name_or_path: str, - cache_dir: Optional[str], - allow_patterns: List[str], - revision: Optional[str] = None) -> str: +def download_weights_from_hf( + model_name_or_path: str, + cache_dir: Optional[str], + allow_patterns: List[str], + revision: Optional[str] = None, +) -> str: """Download model weights from Hugging Face Hub. - + Args: model_name_or_path (str): The model name or path. cache_dir (Optional[str]): The cache directory to store the model @@ -179,26 +184,30 @@ def download_weights_from_hf(model_name_or_path: str, Returns: str: The path to the downloaded model weights. 
""" - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) + if not huggingface_hub.constants.HF_HUB_OFFLINE: + # Before we download we look at what is available: + fs = HfFileSystem() + file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break + # depending on what is available we download different things + for pattern in allow_patterns: + matching = fnmatch.filter(file_list, pattern) + if len(matching) > 0: + allow_patterns = [pattern] + break logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=DisabledTqdm, - revision=revision) + hf_folder = snapshot_download( + model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + tqdm_class=DisabledTqdm, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ) return hf_folder diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2fcddc3b..fa4693cb 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,6 +1,7 @@ import os from typing import Optional, Union +import huggingface_hub from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) @@ -76,6 +77,7 @@ def get_tokenizer( model_id=tokenizer_name, cache_dir=download_dir, revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, # Ignore weights - we only need the tokenizer. 
ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"]) tokenizer_name = tokenizer_path