From 49a3c8662ba745503890ab8b3c502aad7e1a0a19 Mon Sep 17 00:00:00 2001
From: Breno Faria
Date: Wed, 13 Mar 2024 01:30:08 +0100
Subject: [PATCH] Fixes #1556 double free (#3347)

---
 tests/core/test_block_manager.py | 87 ++++++++++++++++++++++++++++++++
 vllm/core/block_manager.py       | 17 ++++++-
 2 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index b280fd1d..44ac05a1 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -274,3 +274,90 @@ def test_reset():
     # Resetting block manager frees all allocated blocks.
     block_manager.reset()
     assert block_manager.get_num_free_gpu_blocks() == original_blocks
+
+
+def test_sliding_window_multi_seq():
+    """
+    Tests that memory allocation and deallocation are handled
+    correctly with multiple sequences that exceed the sliding
+    window's capacity.
+    """
+    block_size = 1
+    num_cpu_blocks = 8
+    num_gpu_blocks = 8
+    sliding_window = 2
+    block_manager = BlockSpaceManager(block_size,
+                                      num_cpu_blocks,
+                                      num_gpu_blocks,
+                                      sliding_window=sliding_window,
+                                      watermark=0)
+
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
+
+    parent = Sequence(1, "one two three", [0, 1, 2], block_size)
+    seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
+                              None)
+    block_manager.allocate(seq_group)
+
+    # assert the number of blocks allocated is correct
+    # the parent seq has len 3, but since sliding_window is 2,
+    # we will use at most 2 blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # Fork prompt and copy block tables.
+    child = parent.fork(2)
+    block_manager.fork(parent, child)
+
+    # assert the number of blocks allocated is correct
+    # forking does not increase memory consumption
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # assert both parent and child share all blocks
+    assert block_manager.get_block_table(
+        parent) == block_manager.get_block_table(child)
+
+    token_id = 4
+    # Append token to child. Block is shared so copy-on-write occurs.
+    child.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(child)
+
+    # assert the number of blocks allocated is correct
+    # we will now use one more block. Each seq will use 2 blocks,
+    # but only one can be shared
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    token_id = 5
+    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
+    block_manager.append_slot(parent)
+
+    # assert the number of blocks allocated is correct
+    # no change, because both sequences are still just sharing one block
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window - 1
+
+    block_table_parent = block_manager.get_block_table(parent)
+    block_table_child = block_manager.get_block_table(child)
+
+    assert block_table_parent != block_table_child
+
+    # assert both block tables share the second-last block
+    assert block_table_parent[-2] == block_table_child[-2]
+
+    # now let's clean up...
+    block_manager.free(parent)
+
+    # assert the number of blocks allocated is correct
+    # We have freed one seq, reducing the ref count of two blocks by one.
+    # One of the two was only used by the parent seq, so it is now free.
+    # The child seq still consumes sliding_window blocks
+    assert block_manager.get_num_free_gpu_blocks(
+    ) == num_gpu_blocks - sliding_window
+
+    # free all blocks
+    block_manager.free(child)
+
+    # assert all blocks are free now
+    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 8bfc1499..8b089a56 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -312,7 +312,12 @@ class BlockSpaceManager:
         # Thus, it is always safe from OOM.
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.copy()
-        for block in src_block_table:
+        # When using a sliding window, blocks will eventually be reused.
+        # In this case the block tables will contain repeated blocks.
+        # When forking, we must make sure that each block's `ref_count`
+        # is only incremented by one, so we deduplicate them by wrapping
+        # them in a set.
+        for block in set(src_block_table):
             block.ref_count += 1
 
     def _get_physical_blocks(
@@ -393,7 +398,15 @@ class BlockSpaceManager:
         return block_number_mapping
 
     def _free_block_table(self, block_table: BlockTable) -> None:
-        for block in set(block_table):
+        # When using a sliding window, each seq will only use up
+        # to `self.block_sliding_window` blocks. When freeing
+        # the block table, we must make sure not to free blocks more
+        # than once. If no sliding window is used, there is no block
+        # reuse in the block table, so we must free all blocks.
+        blocks_to_free = (block_table[-self.block_sliding_window:]
+                          if self.block_sliding_window is not None else
+                          block_table)
+        for block in set(blocks_to_free):
             if block.device == Device.GPU:
                 self.gpu_allocator.free(block)
             else:
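For reference, below is a minimal, self-contained sketch of how the pre-patch accounting can go wrong. ToyBlock and free_block_table are hypothetical stand-ins for vLLM's physical blocks and BlockSpaceManager._free_block_table, and the copy-on-write step is a simplified reconstruction of the scenario the test exercises, not the library's actual code path.

class ToyBlock:
    """Toy physical KV-cache block with a reference count."""

    def __init__(self, number: int) -> None:
        self.number = number
        self.ref_count = 1


def free_block_table(block_table, block_sliding_window=None):
    # Mirrors the patched logic: only the last `block_sliding_window`
    # entries are blocks the sequence still owns, and set() collapses
    # repeated entries so each block is freed at most once per sequence.
    blocks_to_free = (block_table[-block_sliding_window:]
                      if block_sliding_window is not None else block_table)
    for block in set(blocks_to_free):
        block.ref_count -= 1


# Sliding window of 2 blocks with block_size=1: a 3-token prompt fills
# the block table as a ring, so b0 appears twice.
b0, b1 = ToyBlock(0), ToyBlock(1)
parent_table = [b0, b1, b0]
child_table = parent_table.copy()

# Fork (patched): deduplicate so each shared block is incremented
# exactly once; without set(), b0.ref_count would jump by 2.
for block in set(parent_table):
    block.ref_count += 1
assert (b0.ref_count, b1.ref_count) == (2, 2)

# Copy-on-write on the child's last slot: the child releases b0 and
# gets a fresh b2, but the stale entry at index 0 still names b0.
b2 = ToyBlock(2)
b0.ref_count -= 1
child_table[-1] = b2  # child_table == [b0, b1, b2]

# Patched free: the [-2:] slice skips the stale b0 entry, so b0 is
# decremented only by the parent's free. Freeing set(child_table)
# instead would decrement b0 twice overall, which is the kind of
# double free reported in #1556.
free_block_table(child_table, block_sliding_window=2)
free_block_table(parent_table, block_sliding_window=2)
assert (b0.ref_count, b1.ref_count, b2.ref_count) == (0, 0, 0)

In this sketch, the slice and the set guard against two different failure modes: the set handles a block appearing several times inside the live window, while the slice keeps a sequence from touching blocks it handed back during copy-on-write.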