[Bugfix] More type hint fixes for py 3.8 (#4039)
This commit is contained in:
parent
546e721168
commit
5c2e66e487
@ -39,7 +39,7 @@ class ExecutorBase(ABC):
|
||||
ExecutorBase may require modification of the result, e.g. to ensure the
|
||||
selected cache sizes are compatible with all workers.
|
||||
|
||||
Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||
Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||
are blocks that are "active" on the device and can be appended to.
|
||||
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
||||
appended to.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""A CPU worker class."""
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
@ -157,7 +157,7 @@ class CPUWorker(LoraNotSupportedWorkerBase):
|
||||
def load_model(self):
|
||||
self.model_runner.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of blocks available for the KV cache.
|
||||
|
||||
This determines how many KV blocks can fit into the configured CPU
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""A Neuron worker class."""
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
@ -40,7 +40,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase):
|
||||
def load_model(self):
|
||||
self.model_runner.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of available KV blocks.
|
||||
|
||||
Swapping is not yet supported, so always return num_cpu_blocks=0.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||
@ -18,14 +18,14 @@ class WorkerBase(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def determine_num_available_blocks(self) -> tuple[int, int]:
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of available blocks for the GPU KV cache and
|
||||
swappable CPU KV cache.
|
||||
|
||||
The implementation may run profiling or other heuristics to determine
|
||||
the size of caches.
|
||||
|
||||
Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||
Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
|
||||
are blocks that are "active" on the device and can be appended to.
|
||||
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
|
||||
appended to.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user