[Misc] Small perf improvements (#6520)
parent 51f8aa90ad
commit 9ed82e7074

@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
     # Expect consumed blocks to be new blocks required to support the new slots.
-    expected_consumed_blocks = len(
-        chunk_list(
-            list(
-                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+    expected_consumed_blocks = len(
+        list(
+            chunk_list(
+                list(
+                    range(prompt_len + num_slots_to_append +
+                          num_lookahead_slots)),
+                block_size))) - len(
+                    list(chunk_list(list(range(prompt_len)), block_size)))
     assert num_consumed_blocks == expected_consumed_blocks

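Note: chunk_list becomes a generator in this commit (see the final hunk of this
diff), so len() can no longer be applied to its result directly; this hunk and
the next adapt the tests by materializing with list(...). A minimal standalone
sketch of the failure mode the wrapping avoids, using toy data rather than the
vLLM test fixtures:

    def chunk_list(lst, chunk_size):
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    chunks = chunk_list(list(range(10)), 4)
    # len(chunks)  # would raise TypeError: object of type 'generator' has no len()
    assert len(list(chunks)) == 3  # materialize first, then count
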
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
     unique_token_ids = list(
         range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
-                               block_size)
-    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
-                               block_size)
+    gpu_token_ids = list(
+        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
+    cpu_token_ids = list(
+        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))

     assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

@@ -1,3 +1,4 @@
+import math
 from typing import List, Optional

 from vllm.core.block.common import BlockList

@@ -337,10 +338,17 @@ class BlockTable:
         This is required for the scheduler to determine whether a sequence can
         continue generation, or if it must be preempted.
         """
-        all_token_ids = token_ids + [-1] * num_lookahead_slots
-        token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
-        return len(token_blocks)
+        # Math below is equivalent to:
+        # all_token_ids = token_ids + [-1] * num_lookahead_slots
+        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
+        # return len(token_blocks)
+
+        num_token_ids = len(token_ids) + num_lookahead_slots
+        first_chunk_size = self._block_size - (self._num_full_slots %
+                                               self._block_size)
+        num_token_blocks = (1 + math.ceil(
+            (num_token_ids - first_chunk_size) / self._block_size))
+        return num_token_blocks

     def _chunk_token_blocks_for_append(
             self, token_ids: List[int]) -> List[List[int]]:

@@ -351,6 +359,7 @@ class BlockTable:
         """
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        token_blocks = [token_ids[:first_chunk_size]] + chunk_list(
-            token_ids[first_chunk_size:], self._block_size)
+        token_blocks = [token_ids[:first_chunk_size]]
+        token_blocks.extend(
+            chunk_list(token_ids[first_chunk_size:], self._block_size))
         return token_blocks

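Note: the replacement arithmetic counts touched blocks without building any
token lists: the current tail block can absorb first_chunk_size more tokens,
and every further block_size tokens touch one more block. Below is a
standalone cross-check of the equivalence claimed in the comment above; it is
a sketch with plain ints standing in for the BlockTable attributes, an
arbitrary block size of 16, and hypothetical helper names via_chunking and
via_math:

    import math

    def via_chunking(num_full_slots: int, block_size: int,
                     num_token_ids: int) -> int:
        # Reference count: chunk the appended tokens the way
        # _chunk_token_blocks_for_append does, then count the chunks.
        first_chunk_size = block_size - (num_full_slots % block_size)
        token_ids = list(range(num_token_ids))
        chunks = [token_ids[:first_chunk_size]]
        rest = token_ids[first_chunk_size:]
        chunks.extend(rest[i:i + block_size]
                      for i in range(0, len(rest), block_size))
        return len(chunks)

    def via_math(num_full_slots: int, block_size: int,
                 num_token_ids: int) -> int:
        # Closed form from the hunk above.
        first_chunk_size = block_size - (num_full_slots % block_size)
        return 1 + math.ceil((num_token_ids - first_chunk_size) / block_size)

    for num_full_slots in range(64):
        for num_token_ids in range(1, 64):  # at least one slot is appended
            assert (via_chunking(num_full_slots, 16, num_token_ids)
                    == via_math(num_full_slots, 16, num_token_ids))
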
@@ -552,9 +552,12 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         # runner.

         # It returns a list of int although type annotation says list of string.
+        if len(computed_seq_block_ids) == 1:
+            return computed_seq_block_ids[0]
+
         return commonprefix([
             ids for ids in computed_seq_block_ids  # type: ignore
-            if ids != []
+            if ids
         ])

     def get_num_blocks_touched(self,
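Note: commonprefix here is os.path.commonprefix, which compares sequences
element-wise and therefore also works on lists of int block ids (the source of
the "type annotation says list of string" comment). The "if ids" filter
behaves the same as "ids != []" for lists while avoiding an empty-list
allocation per element; the larger win is the new single-sequence branch,
which skips the prefix computation entirely. A small mirror of the patched
logic with made-up block ids (common_computed_ids is a hypothetical name):

    from os.path import commonprefix

    def common_computed_ids(computed_seq_block_ids):
        if len(computed_seq_block_ids) == 1:
            return computed_seq_block_ids[0]  # fast path: nothing to intersect
        return commonprefix([ids for ids in computed_seq_block_ids if ids])

    assert common_computed_ids([[0, 1, 2, 3]]) == [0, 1, 2, 3]
    assert common_computed_ids([[0, 1, 2, 3], [0, 1, 2], [0, 1, 5]]) == [0, 1]
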
@@ -1,3 +1,4 @@
+import functools
import importlib
 from typing import Dict, List, Optional, Type

@@ -98,6 +99,14 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {

 class ModelRegistry:

+    @staticmethod
+    @functools.lru_cache(maxsize=128)
+    def _get_model(model_arch: str):
+        module_name, model_cls_name = _MODELS[model_arch]
+        module = importlib.import_module(
+            f"vllm.model_executor.models.{module_name}")
+        return getattr(module, model_cls_name, None)
+
     @staticmethod
     def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
         if model_arch in _OOT_MODELS:

@@ -114,10 +123,7 @@ class ModelRegistry:
                 "Model architecture %s is partially supported by ROCm: %s",
                 model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

-        module_name, model_cls_name = _MODELS[model_arch]
-        module = importlib.import_module(
-            f"vllm.model_executor.models.{module_name}")
-        return getattr(module, model_cls_name, None)
+        return ModelRegistry._get_model(model_arch)

     @staticmethod
     def get_supported_archs() -> List[str]:
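Note: with the lookup routed through _get_model, functools.lru_cache turns
every repeat load_model_cls call into a cache hit. importlib.import_module
does consult sys.modules on repeat imports, but the registry lookup, import
call, and getattr still ran on every call before this change. A
self-contained sketch of the pattern, using a toy registry that points at the
standard library instead of vLLM's model modules:

    import functools
    import importlib

    _MODELS = {"JSONDecoder": ("json", "JSONDecoder")}  # toy registry

    @functools.lru_cache(maxsize=128)
    def _get_model(model_arch: str):
        module_name, model_cls_name = _MODELS[model_arch]
        module = importlib.import_module(module_name)
        return getattr(module, model_cls_name, None)

    # First call misses and runs the import machinery; second call is a
    # dict hit inside lru_cache and returns the same class object.
    assert _get_model("JSONDecoder") is _get_model("JSONDecoder")
    print(_get_model.cache_info())  # hits=1, misses=1, currsize=1
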
@@ -457,24 +457,25 @@ class SequenceGroup:
         self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
+        self._first_seq = next(iter(self.seqs_dict.values()))

     @property
     def prompt(self) -> Optional[str]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt
+        return self._first_seq.prompt

     @property
     def prompt_token_ids(self) -> List[int]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt_token_ids
+        return self._first_seq.prompt_token_ids

     @property
     def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).multi_modal_data
+        return self._first_seq.multi_modal_data

     @property
     def lora_int_id(self) -> int:
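Note: these properties are hit constantly in the scheduling loop, and each
access previously built a fresh values-view iterator and called next() on it.
Because every sequence in a group shares the same prompt, resolving an
arbitrary "first" sequence once in __init__ is safe. A toy reduction of the
pattern; Seq and Group are stand-ins, not the vLLM classes:

    from dataclasses import dataclass
    from typing import Dict, Optional

    @dataclass
    class Seq:
        prompt: str

    class Group:
        def __init__(self, seqs_dict: Dict[int, Seq]) -> None:
            self.seqs_dict = seqs_dict
            # Resolve once; each property read is now a plain attribute load.
            self._first_seq = next(iter(seqs_dict.values()))

        @property
        def prompt(self) -> Optional[str]:
            # was: next(iter(self.seqs_dict.values())).prompt
            return self._first_seq.prompt

    group = Group({0: Seq("hello"), 1: Seq("hello")})
    assert group.prompt == "hello"
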
@@ -415,9 +415,10 @@ def init_kmp_env():
     os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"


-def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
+def chunk_list(lst: List[T], chunk_size: int):
     """Yield successive chunk_size chunks from lst."""
-    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i:i + chunk_size]


 def cdiv(a: int, b: int) -> int:
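Note: making chunk_list a generator produces chunks on demand, so a caller
that stops early, or streams the chunks into another loop, never pays for the
full list of lists; callers that do need a list wrap the call in list(...), as
the test hunks above show. A sketch of the lazy win under an assumed large
input:

    from itertools import islice
    from typing import Iterator, List, TypeVar

    T = TypeVar("T")

    def chunk_list(lst: List[T], chunk_size: int) -> Iterator[List[T]]:
        """Yield successive chunk_size chunks from lst."""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    tokens = list(range(1_000_000))
    # Only three chunk lists are ever allocated here; the previous
    # list-returning version built all 62,500 chunks up front.
    first_three = list(islice(chunk_list(tokens, 16), 3))
    assert first_three[-1] == list(range(32, 48))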