[LoRA] Add support for pinning lora adapters in the LRU cache (#5603)

rohithkrn 2024-06-21 15:42:46 -07:00 committed by GitHub
parent 7187507301
commit f5dda63eb5
13 changed files with 171 additions and 5 deletions

View File

@@ -209,6 +209,34 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model):
     assert manager.activate_lora(3)
     assert manager.lora_index_to_id[0] == 2
     assert manager.lora_index_to_id[1] == 3
+    assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 3
+    assert manager.activate_lora(1)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.deactivate_lora(2)
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.activate_lora(3)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.pin_lora(3)
+    assert manager.pin_lora(1)
+    with pytest.raises(RuntimeError):
+        assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 3
+    assert manager.lora_index_to_id[1] == 1
+    with pytest.raises(RuntimeError):
+        assert manager.activate_lora(2)
+
+    assert manager.deactivate_lora(3)
+    assert manager.pin_lora(2)
+    assert manager.lora_index_to_id[0] == 2
+    assert manager.lora_index_to_id[1] == 1
+    assert manager.remove_lora(3)
+    with pytest.raises(ValueError):
+        assert manager.pin_lora(3)


 def test_lru_lora_model_manager(dist_init, dummy_model):
@@ -288,6 +316,42 @@ def test_lru_lora_model_manager(dist_init, dummy_model):
     assert set(manager.list_loras()) == set()
     assert all(x is None for x in manager.lora_index_to_id)
+
+    # pinning
+    assert manager.add_lora(model_lora3)
+    assert manager.activate_lora(3)
+    assert manager.add_lora(model_lora4)
+    assert manager.activate_lora(4)
+    assert set(manager.list_loras()) == {3, 4}
+    with pytest.raises(ValueError):
+        assert manager.pin_lora(1)
+    assert manager.pin_lora(3)
+    # Remove manually
+    assert manager.remove_lora(3)
+    assert not manager.remove_lora(3)
+
+    assert set(manager.list_loras()) == {4}
+    assert manager.lora_index_to_id[0] is None
+    assert manager.lora_index_to_id[1] == 4
+
+    assert manager.add_lora(model_lora1)
+    assert manager.pin_lora(1)
+    assert manager.add_lora(model_lora2)
+    assert manager.activate_lora(2)
+
+    assert set(manager.list_loras()) == {1, 2}
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] == 2
+
+    assert manager.remove_oldest_lora()
+    assert set(manager.list_loras()) == {1}
+    assert manager.lora_index_to_id[0] == 1
+    assert manager.lora_index_to_id[1] is None
+
+    with pytest.raises(RuntimeError):
+        assert manager.remove_oldest_lora()
+
+    assert set(manager.list_loras()) == {1}


 def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
                                        sql_lora_files):

View File

@@ -1009,6 +1009,9 @@ class LLMEngine:
     def list_loras(self) -> Set[int]:
         return self.model_executor.list_loras()

+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_executor.pin_lora(lora_id)
+
     def check_health(self) -> None:
         self.model_executor.check_health()
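
For orientation, a minimal usage sketch of the new engine-level hook (illustrative only: the model name, adapter path, and adapter ID below are placeholders, running it needs real model and adapter weights on disk, and only pin_lora itself comes from this commit):

# Hypothetical usage sketch, not part of the diff above.
from vllm import EngineArgs, LLMEngine
from vllm.lora.request import LoRARequest

engine = LLMEngine.from_engine_args(
    EngineArgs(model="meta-llama/Llama-2-7b-hf",
               enable_lora=True,
               max_loras=2,
               max_cpu_loras=4))

# Register an adapter, then pin it so neither the registered (CPU) nor the
# active (GPU) LRU cache will evict it.
engine.add_lora(LoRARequest("sql_adapter", 1, "/path/to/sql_lora"))
assert engine.pin_lora(1)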

View File

@@ -84,6 +84,9 @@ class CPUExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()

View File

@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
             lora_id=lora_id,
         )

+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "pin_lora",
+            lora_id=lora_id,
+        )
+
     def list_loras(self) -> Set[int]:
         return self._run_workers("list_loras")

View File

@@ -86,6 +86,10 @@ class ExecutorBase(ABC):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError

+    @abstractmethod
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError  # type: ignore
+
     @abstractmethod
     def list_loras(self) -> Set[int]:
         raise NotImplementedError

View File

@@ -99,6 +99,10 @@ class GPUExecutor(ExecutorBase):
         assert lora_id > 0, "lora_id must be greater than 0."
         return self.driver_worker.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()

View File

@@ -65,6 +65,9 @@ class NeuronExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.driver_worker.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.driver_worker.list_loras()

View File

@@ -525,6 +525,12 @@ class LoRAModelManager:
             self.long_lora_context.offsets_by_lora_id.pop(lora_id, None)
         return bool(self._registered_loras.pop(lora_id, None))

+    def pin_lora(self, lora_id: int) -> bool:
+        """Pin a LoRAModel in the manager cache."""
+        raise NotImplementedError(
+            "Pinning is not supported in LoRAModelManager. "
+            "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
+
     # TODO see if this can be vectorized
     def _set_lora_mapping(self, mapping: LoRAMapping) -> None:
         (base_indices, sampler_indices, sampler_indices_padded,
@@ -777,6 +783,26 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
             return True
         return False

+    def pin_lora(self, lora_id: int) -> bool:
+        """Pin a LoRAModel in the manager cache."""
+        self._pin_lora_in_cpu_cache(lora_id)
+        self._pin_lora_in_gpu_cache(lora_id)
+        return True
+
+    def _pin_lora_in_cpu_cache(self, lora_id: int):
+        try:
+            self._registered_loras.pin(lora_id)
+        except ValueError as err:
+            raise ValueError("Pinning failed. "
+                             f"LoRA {lora_id} is not registered.") from err
+
+    def _pin_lora_in_gpu_cache(self, lora_id: int):
+        if lora_id not in self._active_loras:
+            # move lora to gpu if not already active
+            self.activate_lora(lora_id)
+
+        self._active_loras.pin(lora_id)
+

 def create_lora_manager(
         model: nn.Module,
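
The manager splits pinning across its two LRU caches: the adapter is pinned in _registered_loras (CPU) first, then activated on demand and pinned in _active_loras (GPU). A rough standalone sketch of that flow, using plain LRUCache objects as stand-ins rather than vLLM's real manager internals (capacities and adapter IDs are made up):

# Standalone sketch only; `registered` and `active` stand in for
# _registered_loras and _active_loras.
from vllm.utils import LRUCache

registered = LRUCache(capacity=4)  # adapters loaded on the CPU side
active = LRUCache(capacity=2)      # adapters activated on the GPU side

for lora_id in (1, 2, 3):
    registered.put(lora_id, f"adapter-{lora_id}")

def pin(lora_id: int) -> bool:
    try:
        registered.pin(lora_id)            # pin in the CPU cache
    except ValueError as err:
        raise ValueError("Pinning failed. "
                         f"LoRA {lora_id} is not registered.") from err
    if lora_id not in active:              # "activate" on demand
        if len(active) >= active.capacity:
            active.remove_oldest()         # RuntimeError if every slot is pinned
        active.put(lora_id, registered.get(lora_id))
    active.pin(lora_id)                    # pin in the GPU cache
    return True

assert pin(1) and pin(2)  # both GPU slots now hold pinned adapters
try:
    pin(3)                # no unpinned GPU slot is left to evict
except RuntimeError:
    pass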

View File

@@ -221,6 +221,9 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
     def remove_lora(self, lora_id: int) -> bool:
         return self._lora_manager.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        return self._lora_manager.pin_lora(lora_id)
+
     def remove_all_loras(self):
         self._lora_manager.remove_all_loras()

View File

@@ -15,7 +15,7 @@ from collections import defaultdict
 from functools import lru_cache, partial, wraps
 from platform import uname
 from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
-                    Hashable, List, Optional, OrderedDict, Tuple, TypeVar,
+                    Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar,
                     Union)

 import numpy as np
@@ -44,6 +44,13 @@ K = TypeVar("K")
 T = TypeVar("T")


+class _Sentinel:
+    ...
+
+
+ALL_PINNED_SENTINEL = _Sentinel()
+
+
 class Device(enum.Enum):
     GPU = enum.auto()
     CPU = enum.auto()
@@ -67,6 +74,7 @@ class LRUCache(Generic[T]):
     def __init__(self, capacity: int):
         self.cache: OrderedDict[Hashable, T] = OrderedDict()
+        self.pinned_items: Set[Hashable] = set()
         self.capacity = capacity

     def __contains__(self, key: Hashable) -> bool:
@@ -102,14 +110,36 @@ class LRUCache(Generic[T]):
         self.cache.move_to_end(key)
         self._remove_old_if_needed()

+    def pin(self, key: Hashable) -> None:
+        """
+        Pins a key in the cache preventing it from being
+        evicted in the LRU order.
+        """
+        if key not in self.cache:
+            raise ValueError(f"Cannot pin key: {key} not in cache.")
+        self.pinned_items.add(key)
+
+    def _unpin(self, key: Hashable) -> None:
+        self.pinned_items.remove(key)
+
     def _on_remove(self, key: Hashable, value: Optional[T]):
         pass

-    def remove_oldest(self):
+    def remove_oldest(self, remove_pinned=False):
         if not self.cache:
             return
-        key, value = self.cache.popitem(last=False)
-        self._on_remove(key, value)
+
+        if not remove_pinned:
+            # pop the oldest item in the cache that is not pinned
+            lru_key = next(
+                (key for key in self.cache if key not in self.pinned_items),
+                ALL_PINNED_SENTINEL)
+            if lru_key is ALL_PINNED_SENTINEL:
+                raise RuntimeError("All items are pinned, "
+                                   "cannot remove oldest from the cache.")
+        else:
+            lru_key = next(iter(self.cache))
+        self.pop(lru_key)

     def _remove_old_if_needed(self) -> None:
         while len(self.cache) > self.capacity:
@@ -120,13 +150,16 @@ class LRUCache(Generic[T]):
             default_value: Optional[T] = None) -> Optional[T]:
         run_on_remove = key in self.cache
         value: Optional[T] = self.cache.pop(key, default_value)
+        # remove from pinned items
+        if key in self.pinned_items:
+            self._unpin(key)
         if run_on_remove:
             self._on_remove(key, value)
         return value

     def clear(self):
         while len(self.cache) > 0:
-            self.remove_oldest()
+            self.remove_oldest(remove_pinned=True)
         self.cache.clear()
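
A short demo of the pinning contract these LRUCache changes provide (assuming a vLLM build that includes this commit is importable):

from vllm.utils import LRUCache

cache = LRUCache(capacity=2)
cache.put("a", 1)
cache.put("b", 2)

cache.pin("a")             # "a" is now exempt from LRU eviction
cache.put("c", 3)          # over capacity: evicts "b", the oldest unpinned key
assert "a" in cache and "b" not in cache and "c" in cache

cache.pin("c")
try:
    cache.remove_oldest()  # every remaining key is pinned
except RuntimeError:
    pass

cache.pop("a")             # pop() also unpins, so "a" really goes away
assert "a" not in cache

cache.clear()              # clear() evicts everything, pinned or not
assert len(cache) == 0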

View File

@@ -878,6 +878,11 @@ class ModelRunner:
             raise RuntimeError("LoRA is not enabled.")
         return self.lora_manager.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         if not self.lora_manager:
             raise RuntimeError("LoRA is not enabled.")

View File

@@ -333,6 +333,9 @@ class Worker(WorkerBase):
     def remove_lora(self, lora_id: int) -> bool:
         return self.model_runner.remove_lora(lora_id)

+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
     def list_loras(self) -> Set[int]:
         return self.model_runner.list_loras()

View File

@@ -70,6 +70,10 @@ class WorkerBase(ABC):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError

+    @abstractmethod
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
     @abstractmethod
     def list_loras(self) -> Set[int]:
         raise NotImplementedError
@@ -86,6 +90,10 @@ class LoraNotSupportedWorkerBase(WorkerBase):
     def remove_lora(self, lora_id: int) -> bool:
         raise ValueError(f"{type(self)} does not support LoRA")

+    def pin_lora(self, lora_id: int) -> bool:
+        raise ValueError(f"{type(self)} does not support LoRA")
+
     def list_loras(self) -> Set[int]:
         raise ValueError(f"{type(self)} does not support LoRA")