parent
b0925b3878
commit
49a3c8662b
@ -274,3 +274,90 @@ def test_reset():
|
|||||||
# Resetting block manager frees all allocated blocks.
|
# Resetting block manager frees all allocated blocks.
|
||||||
block_manager.reset()
|
block_manager.reset()
|
||||||
assert block_manager.get_num_free_gpu_blocks() == original_blocks
|
assert block_manager.get_num_free_gpu_blocks() == original_blocks
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_multi_seq():
|
||||||
|
"""
|
||||||
|
Tests that memory allocation and deallocation are handled
|
||||||
|
correctly with multiple sequences that exceed the sliding
|
||||||
|
window's capacity.
|
||||||
|
"""
|
||||||
|
block_size = 1
|
||||||
|
num_cpu_blocks = 8
|
||||||
|
num_gpu_blocks = 8
|
||||||
|
sliding_window = 2
|
||||||
|
block_manager = BlockSpaceManager(block_size,
|
||||||
|
num_cpu_blocks,
|
||||||
|
num_gpu_blocks,
|
||||||
|
sliding_window=sliding_window,
|
||||||
|
watermark=0)
|
||||||
|
|
||||||
|
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
|
||||||
|
|
||||||
|
parent = Sequence(1, "one two three", [0, 1, 2], block_size)
|
||||||
|
seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(),
|
||||||
|
None)
|
||||||
|
block_manager.allocate(seq_group)
|
||||||
|
|
||||||
|
# assert the number of blocks allocated is correct
|
||||||
|
# the parent seq has len 3, but since sliding_window is 2,
|
||||||
|
# we will use at most 2 blocks
|
||||||
|
assert block_manager.get_num_free_gpu_blocks(
|
||||||
|
) == num_gpu_blocks - sliding_window
|
||||||
|
|
||||||
|
# Fork prompt and copy block tables.
|
||||||
|
child = parent.fork(2)
|
||||||
|
block_manager.fork(parent, child)
|
||||||
|
|
||||||
|
# assert the number of blocks allocated is correct
|
||||||
|
# forking does not increase memory consumption
|
||||||
|
assert block_manager.get_num_free_gpu_blocks(
|
||||||
|
) == num_gpu_blocks - sliding_window
|
||||||
|
|
||||||
|
# assert both parent and child share all blocks
|
||||||
|
assert block_manager.get_block_table(
|
||||||
|
parent) == block_manager.get_block_table(child)
|
||||||
|
|
||||||
|
token_id = 4
|
||||||
|
# Append token to child. Block is shared so copy on write occurs.
|
||||||
|
child.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
block_manager.append_slot(child)
|
||||||
|
|
||||||
|
# assert the number of blocks allocated is correct
|
||||||
|
# we now use one more block. Each seq uses 2 blocks,
|
||||||
|
# but only one can be shared
|
||||||
|
assert block_manager.get_num_free_gpu_blocks(
|
||||||
|
) == num_gpu_blocks - sliding_window - 1
|
||||||
|
|
||||||
|
token_id = 5
|
||||||
|
parent.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
block_manager.append_slot(parent)
|
||||||
|
|
||||||
|
# assert the number of blocks allocated is correct
|
||||||
|
# no change, because both sequences are still just sharing one block
|
||||||
|
assert block_manager.get_num_free_gpu_blocks(
|
||||||
|
) == num_gpu_blocks - sliding_window - 1
|
||||||
|
|
||||||
|
block_table_parent = block_manager.get_block_table(parent)
|
||||||
|
block_table_child = block_manager.get_block_table(child)
|
||||||
|
|
||||||
|
assert block_table_parent != block_table_child
|
||||||
|
|
||||||
|
# assert both block tables share the second-to-last block
|
||||||
|
assert block_table_parent[-2] == block_table_child[-2]
|
||||||
|
|
||||||
|
# now let's clean up...
|
||||||
|
block_manager.free(parent)
|
||||||
|
|
||||||
|
# assert the number of blocks allocated is correct
|
||||||
|
# We have freed one seq, reducing the ref count of two blocks by one.
|
||||||
|
# One of the two was only used by the parent seq, so this is now free.
|
||||||
|
# The child seq still consumes sliding_window blocks
|
||||||
|
assert block_manager.get_num_free_gpu_blocks(
|
||||||
|
) == num_gpu_blocks - sliding_window
|
||||||
|
|
||||||
|
# free all blocks
|
||||||
|
block_manager.free(child)
|
||||||
|
|
||||||
|
# assert all blocks are free now
|
||||||
|
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
|
||||||
|
|||||||
@ -312,7 +312,12 @@ class BlockSpaceManager:
|
|||||||
# Thus, it is always safe from OOM.
|
# Thus, it is always safe from OOM.
|
||||||
src_block_table = self.block_tables[parent_seq.seq_id]
|
src_block_table = self.block_tables[parent_seq.seq_id]
|
||||||
self.block_tables[child_seq.seq_id] = src_block_table.copy()
|
self.block_tables[child_seq.seq_id] = src_block_table.copy()
|
||||||
for block in src_block_table:
|
# When using a sliding window, blocks will eventually be reused.
|
||||||
|
# In this case the block tables will contain repeated blocks.
|
||||||
|
# When forking, we must make sure that each block's `ref_count`
|
||||||
|
# is only incremented by one, so we deduplicate them by wrapping
|
||||||
|
# them in a set.
|
||||||
|
for block in set(src_block_table):
|
||||||
block.ref_count += 1
|
block.ref_count += 1
|
||||||
|
|
||||||
def _get_physical_blocks(
|
def _get_physical_blocks(
|
||||||
@ -393,7 +398,15 @@ class BlockSpaceManager:
|
|||||||
return block_number_mapping
|
return block_number_mapping
|
||||||
|
|
||||||
def _free_block_table(self, block_table: BlockTable) -> None:
|
def _free_block_table(self, block_table: BlockTable) -> None:
|
||||||
for block in set(block_table):
|
# When using a sliding window, each seq will only use up
|
||||||
|
# to `self.block_sliding_window` blocks. When freeing
|
||||||
|
# the block table, we must make sure to not free blocks more
|
||||||
|
# than once. If no sliding window is used, there is no block
|
||||||
|
# reuse in the block table, so we must free all blocks.
|
||||||
|
blocks_to_free = (block_table[-self.block_sliding_window:]
|
||||||
|
if self.block_sliding_window is not None else
|
||||||
|
block_table)
|
||||||
|
for block in set(blocks_to_free):
|
||||||
if block.device == Device.GPU:
|
if block.device == Device.GPU:
|
||||||
self.gpu_allocator.free(block)
|
self.gpu_allocator.free(block)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user