[Core] Fix edge case in chunked prefill + block manager v2 (#7380)

Author: Cade Daniel, 2024-08-09 16:48:49 -07:00; committed by GitHub
parent 999ef0b917
commit baa240252e
2 changed files with 21 additions and 3 deletions


@@ -261,11 +261,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
         # skip cuda graph creation for fast test.
         "enforce_eager": True,
         "enable_chunked_prefill": True,
-        "max_num_batched_tokens": 2,
-        "max_num_seqs": 2,
     },
 ])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs",
+                         [{
+                             "block_size": 8,
+                             "max_num_batched_tokens": 2,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 3,
+                             "max_num_seqs": 2,
+                         }, {
+                             "block_size": 8,
+                             "max_num_batched_tokens": 256,
+                             "max_num_seqs": 10,
+                         }])
 @pytest.mark.parametrize("baseline_llm_kwargs", [
     {
         "use_v2_block_manager": False,
@@ -294,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
+        ("1 + " * 50) + " 1 = ",  # Longer prompt.
         "The capital of France is",
         "The future of AI is",
     ]
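
The new per_test_common_llm_kwargs cases pair block_size=8 with per-step token budgets of 2, 3, and 256 (max_num_batched_tokens), so prefill of the longer arithmetic prompt is forced into chunks far smaller than one block. A minimal sketch of how a token budget carves a prompt into per-step prefill chunks; the helper name and shape are illustrative only, not vLLM's scheduler API:

# Hypothetical helper illustrating chunked prefill scheduling: each step
# processes at most `max_num_batched_tokens` prompt tokens.
def prefill_chunks(prompt_token_ids, max_num_batched_tokens):
    return [
        prompt_token_ids[i:i + max_num_batched_tokens]
        for i in range(0, len(prompt_token_ids), max_num_batched_tokens)
    ]

# With a budget of 2 tokens per step, a 7-token prompt takes 4 steps, and
# the final step contributes a single token -- the kind of tiny tail chunk
# these parametrizations push up against block boundaries.
assert prefill_chunks(list(range(7)), 2) == [[0, 1], [2, 3], [4, 5], [6]]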


@@ -356,7 +356,13 @@ class BlockTable:
         appended to blocks. The first such "token block" may have less token ids
         than the block size, since the last allocated block may be partially
         full.
+
+        If no token ids are provided, then no chunks are returned.
         """
+
+        if not token_ids:
+            return []
+
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
         token_blocks = [token_ids[:first_chunk_size]]
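
The guard added above is the substance of the fix: the chunking helper in BlockTable assumed token_ids was non-empty, so an empty input produced [token_ids[:first_chunk_size]] == [[]], a single empty "token block", rather than no chunks. A self-contained sketch of the logic under that reading (the function name and the trailing-chunk loop are reconstructions for illustration, not vLLM's exact code):

# Sketch of block-aligned chunking as described in the docstring above.
# `num_full_slots` counts tokens already stored, so the first chunk only
# fills the remainder of the last, partially full block.
def chunk_for_append(token_ids, block_size, num_full_slots):
    if not token_ids:
        return []  # the fix: no new tokens -> no chunks (previously [[]])

    first_chunk_size = block_size - (num_full_slots % block_size)
    token_blocks = [token_ids[:first_chunk_size]]
    token_blocks.extend(
        token_ids[i:i + block_size]
        for i in range(first_chunk_size, len(token_ids), block_size))
    return token_blocks

# 3 of 8 slots used: the first chunk takes 5 tokens, later chunks fill whole blocks.
assert chunk_for_append(list(range(12)), block_size=8, num_full_slots=3) == \
    [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11]]
assert chunk_for_append([], block_size=8, num_full_slots=3) == []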