[Core] Fix edge case in chunked prefill + block manager v2 (#7380)

Author: Cade Daniel, 2024-08-09 16:48:49 -07:00; committed by GitHub
parent 999ef0b917
commit baa240252e
2 changed files with 21 additions and 3 deletions


@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
         # skip cuda graph creation for fast test.
         "enforce_eager": True,
         "enable_chunked_prefill": True,
-        "max_num_batched_tokens": 2,
-        "max_num_seqs": 2,
     },
 ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
     {
         "use_v2_block_manager": False,
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
         "The capital of France is",
         "The future of AI is",
     ]
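
The new per_test_common_llm_kwargs cases pair block_size=8 with per-step token budgets of 2, 3, and 256 (max_num_batched_tokens), so prefill of the longer arithmetic prompt is forced into chunks far smaller than one block. A minimal sketch of how a token budget carves a prompt into per-step prefill chunks; the helper name and shape are illustrative only, not vLLM's scheduler API:

# Hypothetical helper illustrating chunked prefill scheduling: each step
# processes at most `max_num_batched_tokens` prompt tokens.
def prefill_chunks(prompt_token_ids, max_num_batched_tokens):
    return [
        prompt_token_ids[i:i + max_num_batched_tokens]
        for i in range(0, len(prompt_token_ids), max_num_batched_tokens)
    ]

# With a budget of 2 tokens per step, a 7-token prompt takes 4 steps, and
# the final step contributes a single token -- the kind of tiny tail chunk
# these parametrizations push up against block boundaries.
assert prefill_chunks(list(range(7)), 2) == [[0, 1], [2, 3], [4, 5], [6]]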


@@ -356,7 +356,13 @@ class BlockTable:
         appended to blocks. The first such "token block" may have less token ids
         than the block size, since the last allocated block may be partially
         full.
+
+        If no token ids are provided, then no chunks are returned.
         """
+
+        if not token_ids:
+            return []
+
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         token_blocks = [token_ids[:first_chunk_size]]
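
The guard added above is the substance of the fix: the chunking helper in BlockTable assumed token_ids was non-empty, so an empty input produced [token_ids[:first_chunk_size]] == [[]], a single empty "token block", rather than no chunks. A self-contained sketch of the logic under that reading (the function name and the trailing-chunk loop are reconstructions for illustration, not vLLM's exact code):

# Sketch of block-aligned chunking as described in the docstring above.
# `num_full_slots` counts tokens already stored, so the first chunk only
# fills the remainder of the last, partially full block.
def chunk_for_append(token_ids, block_size, num_full_slots):
    if not token_ids:
        return []  # the fix: no new tokens -> no chunks (previously [[]])

    first_chunk_size = block_size - (num_full_slots % block_size)
    token_blocks = [token_ids[:first_chunk_size]]
    token_blocks.extend(
        token_ids[i:i + block_size]
        for i in range(first_chunk_size, len(token_ids), block_size))
    return token_blocks

# 3 of 8 slots used: the first chunk takes 5 tokens, later chunks fill whole blocks.
assert chunk_for_append(list(range(12)), block_size=8, num_full_slots=3) == \
    [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11]]
assert chunk_for_append([], block_size=8, num_full_slots=3) == []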