From 64e0e383148a613c327d4bf9e866b7a185df8277 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 29 Mar 2023 16:38:48 -0700 Subject: [PATCH] Add cache watermark to avoid frequent cache eviction (#11) --- cacheflow/master/block_manager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cacheflow/master/block_manager.py b/cacheflow/master/block_manager.py index 571ee247..a4e346ec 100644 --- a/cacheflow/master/block_manager.py +++ b/cacheflow/master/block_manager.py @@ -60,11 +60,15 @@ class BlockSpaceManager: block_size: int, num_gpu_blocks: int, num_cpu_blocks: int, + watermark: float = 0.01, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks self.num_total_cpu_blocks = num_cpu_blocks + self.watermark = watermark + assert watermark >= 0.0 + self.watermark_blocks = int(watermark * num_gpu_blocks) self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks) self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) @@ -76,7 +80,8 @@ class BlockSpaceManager: seq = seq_group.seqs[0] num_required_blocks = len(seq.logical_token_blocks) num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() - return num_required_blocks <= num_free_gpu_blocks + # Use watermark to avoid frequent cache eviction. + return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks def allocate(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same prompt. @@ -154,7 +159,8 @@ class BlockSpaceManager: # NOTE: Conservatively, we assume that every sequence will allocate # at least one free block right after the swap-in. # NOTE: This should match the logic in can_append(). - return len(blocks) + num_swapped_seqs <= num_free_blocks + num_required_blocks = len(blocks) + num_swapped_seqs + return num_free_blocks - num_required_blocks >= self.watermark_blocks def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block.