Fixes #1556 double free (#3347)

2024-03-13 01:30:08 +01:00 · 2024-03-13 01:30:08 +01:00 · 49a3c8662b
commit 49a3c8662b
parent b0925b3878
2 changed files with 102 additions and 2 deletions
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@ -274,3 +274,90 @@ def test_reset():
    # Resetting block manager frees all allocated blocks.
    block_manager.reset()
    assert block_manager.get_num_free_gpu_blocks() == original_blocks
 def test_sliding_window_multi_seq():
    """
    Checks GPU block accounting when a forked pair of sequences
    outgrows the sliding window: allocation, fork, one append on
    each sequence, and finally freeing each sequence in turn.
    """
    block_size = 1
    sliding_window = 2
    num_cpu_blocks = num_gpu_blocks = 8
    manager = BlockSpaceManager(block_size,
                                num_cpu_blocks,
                                num_gpu_blocks,
                                sliding_window=sliding_window,
                                watermark=0)

    def blocks_in_use():
        # GPU blocks currently held, derived from the free-block counter.
        return num_gpu_blocks - manager.get_num_free_gpu_blocks()

    # A fresh manager holds no GPU blocks.
    assert blocks_in_use() == 0

    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
    group = SequenceGroup("1", [parent], SamplingParams(), time.time(), None)
    manager.allocate(group)
    # The prompt has 3 tokens, but with a sliding window of 2 the
    # sequence occupies at most 2 blocks.
    assert blocks_in_use() == sliding_window

    # Fork the prompt; the child gets a copy of the parent's block table.
    child = parent.fork(2)
    manager.fork(parent, child)
    # Forking must not consume additional memory ...
    assert blocks_in_use() == sliding_window
    # ... and both sequences must reference exactly the same blocks.
    assert manager.get_block_table(parent) == manager.get_block_table(child)

    # Append one token to the child first, then one to the parent.
    # The child's append hits a shared block, so copy on write takes one
    # extra block: each sequence uses 2 blocks, only one of them shared.
    # The parent's append afterwards leaves the count unchanged, because
    # the two sequences are still sharing just one block.
    for seq, token_id in ((child, 4), (parent, 5)):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
        manager.append_slot(seq)
        assert blocks_in_use() == sliding_window + 1

    parent_table = manager.get_block_table(parent)
    child_table = manager.get_block_table(child)
    # The tables have diverged ...
    assert parent_table != child_table
    # ... but both sequences still share the second-last block.
    assert parent_table[-2] == child_table[-2]

    # Clean up, starting with the parent. This lowers the ref count of two
    # blocks by one; the block used only by the parent becomes free, while
    # the child keeps holding `sliding_window` blocks.
    manager.free(parent)
    assert blocks_in_use() == sliding_window

    # Releasing the child returns every remaining block.
    manager.free(child)
    assert blocks_in_use() == 0
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@ -312,7 +312,12 @@ class BlockSpaceManager:
        # Thus, it is always safe from OOM.
        src_block_table = self.block_tables[parent_seq.seq_id]
        self.block_tables[child_seq.seq_id] = src_block_table.copy()
-        for block in src_block_table:
+        # When using a sliding window, blocks will be eventually reused.
        # In this case the block tables will contain repeated blocks.
        # When forking, we must make sure that each block's `ref_count`
        # is only incremented by one, so we deduplicate them by wrapping
        # them in a set.
        for block in set(src_block_table):
            block.ref_count += 1
    def _get_physical_blocks(
@ -393,7 +398,15 @@ class BlockSpaceManager:
        return block_number_mapping
    def _free_block_table(self, block_table: BlockTable) -> None:
-        for block in set(block_table):
+        # when using a sliding window, each seq will only use up
        # to `self.block_sliding_window` blocks. When freeing
        # the block table, we must make sure to not free blocks more
        # than once. If no sliding window is used, there is no block
        # reuse in the block table, so we must free all blocks.
        blocks_to_free = (block_table[-self.block_sliding_window:]
                          if self.block_sliding_window is not None else
                          block_table)
        for block in set(blocks_to_free):
            if block.device == Device.GPU:
                self.gpu_allocator.free(block)
            else: