From bda41c70ddb124134935a90a0d51304d2ac035e8 Mon Sep 17 00:00:00 2001
From: Song <44120206+Oliver-ss@users.noreply.github.com>
Date: Wed, 19 Jul 2023 02:31:48 +0800
Subject: [PATCH] hotfix attn alibi wo head mapping (#496)

Co-authored-by: oliveryuan
---
 tests/kernels/test_attention.py         | 2 ++
 vllm/model_executor/layers/attention.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index bf3147bb..4c02f33c 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -199,6 +199,7 @@ def run_single_query_cached_kv_attention(
         ]
         block_tables.append(block_table)
     block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
+    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")
 
     scale = float(1.0 / (head_size**0.5))
     output = torch.empty(num_tokens,
@@ -211,6 +212,7 @@ def run_single_query_cached_kv_attention(
         query,
         key_cache,
         value_cache,
+        head_mapping,
         scale,
         block_tables,
         context_lens,
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index d94649cf..375f9f59 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -408,6 +408,7 @@ class PagedAttentionWithALiBi(PagedAttention):
             query,
             key_cache,
             value_cache,
+            self.head_mapping,
             self.scale,
             input_metadata.block_tables,
             input_metadata.context_lens,
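
Note (not part of the patch): the fix passes the existing self.head_mapping through to the kernel call in the ALiBi path, and the test builds its own mapping with torch.arange. As a minimal sketch of what that argument carries: with plain multi-head attention every query head reads its own KV head, while multi-query / grouped-query models map several query heads onto one KV head by repeating each KV-head index. The dimensions below are illustrative assumptions, not values taken from this patch.

import torch

num_query_heads = 32   # hypothetical model dimensions, not from the patch
num_kv_heads = 8
num_queries_per_kv = num_query_heads // num_kv_heads

# Plain MHA: each query head maps to its own KV head (what the test constructs).
mha_head_mapping = torch.arange(num_query_heads, dtype=torch.int32, device="cuda")

# Grouped-query / multi-query: repeat each KV-head index, e.g.
# [0, 0, 0, 0, 1, 1, 1, 1, ..., 7, 7, 7, 7].
gqa_head_mapping = torch.repeat_interleave(
    torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
    num_queries_per_kv,
)
assert gqa_head_mapping.shape == (num_query_heads,)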