From 49a3c8662ba745503890ab8b3c502aad7e1a0a19 Mon Sep 17 00:00:00 2001
From: Breno Faria
Date: Wed, 13 Mar 2024 01:30:08 +0100
Subject: [PATCH] Fixes #1556 double free (#3347)

---
 tests/core/test_block_manager.py | 87 ++++++++++++++++++++++++++++++++
 vllm/core/block_manager.py       | 17 ++++++-
 2 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index b280fd1d..44ac05a1 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -274,3 +274,90 @@ def test_reset():
     # Resetting block manager frees all allocated blocks.
     block_manager.reset()
     assert block_manager.get_num_free_gpu_blocks() == original_blocks
+
+
+def test_sliding_window_multi_seq():
+    """
+    Tests that memory allocation and deallocation are handled
+    correctly with multiple sequences that exceed the sliding
+    window's capacity.
+    """
+    block_size = 1
+    num_cpu_blocks = 8
+    num_gpu_blocks = 8
+    sliding_window = 2
+    block_manager = BlockSpaceManager(block_size,
+                                      num_cpu_blocks,
+                                      num_gpu_blocks,
+                                      sliding_window=sliding_window,
+                                      watermark=0)
+
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
+
+    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
+    seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
+                              None)
+    block_manager.allocate(seq_group)
+
+    # assert the number of blocks allocated is correct
+    # the parent seq has len 3, but since sliding_window is 2,
+    # we will use at most 2 blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # Fork prompt and copy block tables.
+    child = parent.fork(2)
+    block_manager.fork(parent, child)
+
+    # assert the number of blocks allocated is correct
+    # forking does not increase memory consumption
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # assert both parent and child share all blocks
+    assert block_manager.get_block_table(
+        parent) == block_manager.get_block_table(child)
+
+    token_id = 4
+    # Append token to child. Block is shared so copy-on-write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(child)
+
+    # assert the number of blocks allocated is correct
+    # we will now use one more block. Each seq will use 2 blocks,
+    # but only one can be shared
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    token_id = 5
+    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(parent)
+
+    # assert the number of blocks allocated is correct
+    # no change, because both sequences are still just sharing one block
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    block_table_parent = block_manager.get_block_table(parent)
+    block_table_child = block_manager.get_block_table(child)
+
+    assert block_table_parent != block_table_child
+
+    # assert both block tables share the second-last block
+    assert block_table_parent[-2] == block_table_child[-2]
+
+    # now let's clean up...
+    block_manager.free(parent)
+
+    # assert the number of blocks allocated is correct
+    # We have freed one seq, reducing the ref count of two blocks by one.
+    # One of the two was only used by the parent seq, so it is now free.
+    # The child seq still consumes sliding_window blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # free all blocks
+    block_manager.free(child)
+
+    # assert all blocks are free now
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 8bfc1499..8b089a56 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -312,7 +312,12 @@ class BlockSpaceManager:
         # Thus, it is always safe from OOM.
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.copy()
-        for block in src_block_table:
+        # When using a sliding window, blocks will eventually be reused.
+        # In this case the block tables will contain repeated blocks.
+        # When forking, we must make sure that each block's `ref_count`
+        # is only incremented by one, so we deduplicate them by wrapping
+        # them in a set.
+        for block in set(src_block_table):
             block.ref_count += 1
 
     def _get_physical_blocks(
@@ -393,7 +398,15 @@ class BlockSpaceManager:
         return block_number_mapping
 
     def _free_block_table(self, block_table: BlockTable) -> None:
-        for block in set(block_table):
+        # When using a sliding window, each seq will only use up
+        # to `self.block_sliding_window` blocks. When freeing
+        # the block table, we must make sure not to free blocks more
+        # than once. If no sliding window is used, there is no block
+        # reuse in the block table, so we must free all blocks.
+        blocks_to_free = (block_table[-self.block_sliding_window:]
+                          if self.block_sliding_window is not None else
+                          block_table)
+        for block in set(blocks_to_free):
             if block.device == Device.GPU:
                 self.gpu_allocator.free(block)
             else:
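For reference, below is a minimal, self-contained sketch of how the pre-patch accounting can go wrong. ToyBlock and free_block_table are hypothetical stand-ins for vLLM's physical blocks and BlockSpaceManager._free_block_table, and the copy-on-write step is a simplified reconstruction of the scenario the test exercises, not the library's actual code path.

class ToyBlock:
    """Toy physical KV-cache block with a reference count."""

    def __init__(self, number: int) -> None:
        self.number = number
        self.ref_count = 1


def free_block_table(block_table, block_sliding_window=None):
    # Mirrors the patched logic: only the last `block_sliding_window`
    # entries are blocks the sequence still owns, and set() collapses
    # repeated entries so each block is freed at most once per sequence.
    blocks_to_free = (block_table[-block_sliding_window:]
                      if block_sliding_window is not None else block_table)
    for block in set(blocks_to_free):
        block.ref_count -= 1


# Sliding window of 2 blocks with block_size=1: a 3-token prompt fills
# the block table as a ring, so b0 appears twice.
b0, b1 = ToyBlock(0), ToyBlock(1)
parent_table = [b0, b1, b0]
child_table = parent_table.copy()

# Fork (patched): deduplicate so each shared block is incremented
# exactly once; without set(), b0.ref_count would jump by 2.
for block in set(parent_table):
    block.ref_count += 1
assert (b0.ref_count, b1.ref_count) == (2, 2)

# Copy-on-write on the child's last slot: the child releases b0 and
# gets a fresh b2, but the stale entry at index 0 still names b0.
b2 = ToyBlock(2)
b0.ref_count -= 1
child_table[-1] = b2  # child_table == [b0, b1, b2]

# Patched free: the [-2:] slice skips the stale b0 entry, so b0 is
# decremented only by the parent's free. Freeing set(child_table)
# instead would decrement b0 twice overall, which is the kind of
# double free reported in #1556.
free_block_table(child_table, block_sliding_window=2)
free_block_table(parent_table, block_sliding_window=2)
assert (b0.ref_count, b1.ref_count, b2.ref_count) == (0, 0, 0)

In this sketch, the slice and the set guard against two different failure modes: the set handles a block appearing several times inside the live window, while the slice keeps a sequence from touching blocks it handed back during copy-on-write.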