[Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501)
parent e2fbaee725
commit c8a7d51c49
@@ -20,6 +20,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
+from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.sequence import SequenceGroupMetadata
 from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad
@@ -61,14 +62,14 @@ class FlashInferBackend(AttentionBackend):
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        raise NotImplementedError
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        raise NotImplementedError
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
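For context, here is a minimal sketch of how the two patched FlashInferBackend methods read after this commit, assuming only the class and import layout visible in the diff context above (the real class derives from AttentionBackend and contains additional methods not shown). Instead of raising NotImplementedError, the backend now forwards KV-cache block swaps and copies to the PagedAttention ops, which is the behavior the commit title describes as fixing the Gemma2 OpenAI server crash.

    # Sketch only: abbreviated FlashInferBackend after this patch.
    from typing import List

    import torch

    from vllm.attention.ops.paged_attn import PagedAttention


    class FlashInferBackend:  # abbreviated; the real class derives from AttentionBackend

        @staticmethod
        def swap_blocks(
            src_kv_cache: torch.Tensor,
            dst_kv_cache: torch.Tensor,
            src_to_dst: torch.Tensor,
        ) -> None:
            # Delegate block swapping to the PagedAttention implementation
            # instead of raising NotImplementedError.
            PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)

        @staticmethod
        def copy_blocks(
            kv_caches: List[torch.Tensor],
            src_to_dists: torch.Tensor,
        ) -> None:
            # Delegate block copying to the PagedAttention implementation
            # instead of raising NotImplementedError.
            PagedAttention.copy_blocks(kv_caches, src_to_dists)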