[Core] Fix edge case in chunked prefill + block manager v2 (#7380)
This commit is contained in:
parent
999ef0b917
commit
baa240252e
@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
|
|||||||
# skip cuda graph creation for fast test.
|
# skip cuda graph creation for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
"enable_chunked_prefill": True,
|
"enable_chunked_prefill": True,
|
||||||
"max_num_batched_tokens": 2,
|
|
||||||
"max_num_seqs": 2,
|
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
"block_size": 8,
|
||||||
|
"max_num_batched_tokens": 2,
|
||||||
|
"max_num_seqs": 2,
|
||||||
|
}, {
|
||||||
|
"block_size": 8,
|
||||||
|
"max_num_batched_tokens": 3,
|
||||||
|
"max_num_seqs": 2,
|
||||||
|
}, {
|
||||||
|
"block_size": 8,
|
||||||
|
"max_num_batched_tokens": 256,
|
||||||
|
"max_num_seqs": 10,
|
||||||
|
}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [
|
@pytest.mark.parametrize("baseline_llm_kwargs", [
|
||||||
{
|
{
|
||||||
"use_v2_block_manager": False,
|
"use_v2_block_manager": False,
|
||||||
@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
|
|||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
"The president of the United States is",
|
"The president of the United States is",
|
||||||
|
("1 + " * 50) + " 1 = ", # Longer prompt.
|
||||||
"The capital of France is",
|
"The capital of France is",
|
||||||
"The future of AI is",
|
"The future of AI is",
|
||||||
]
|
]
|
||||||
|
|||||||
@ -356,7 +356,13 @@ class BlockTable:
|
|||||||
appended to blocks. The first such "token block" may have fewer token ids
|
appended to blocks. The first such "token block" may have fewer token ids
|
||||||
than the block size, since the last allocated block may be partially
|
than the block size, since the last allocated block may be partially
|
||||||
full.
|
full.
|
||||||
|
|
||||||
|
If no token ids are provided, then no chunks are returned.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not token_ids:
|
||||||
|
return []
|
||||||
|
|
||||||
first_chunk_size = self._block_size - (self._num_full_slots %
|
first_chunk_size = self._block_size - (self._num_full_slots %
|
||||||
self._block_size)
|
self._block_size)
|
||||||
token_blocks = [token_ids[:first_chunk_size]]
|
token_blocks = [token_ids[:first_chunk_size]]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user