From 4b5bcf89065e0d9247cb1e12635b0d57f67a3a6a Mon Sep 17 00:00:00 2001
From: Robert Irvine
Date: Fri, 8 Sep 2023 06:48:54 +0100
Subject: [PATCH] faster startup of vLLM (#982)

* update

---------

Co-authored-by: Robert Irvine
---
 vllm/model_executor/layers/attention.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index 29bfe328..8ccd062a 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -259,8 +259,9 @@ class PagedAttentionWithRoPE(PagedAttention):
         self.is_neox_style = is_neox_style
 
         # Create the cos and sin cache.
-        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
-        t = torch.arange(max_position).float()
+        inv_freq = 1.0 / (base**(
+            torch.arange(0, rotary_dim, 2, device="cuda") / rotary_dim))
+        t = torch.arange(max_position, device="cuda").float()
         freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
         cos = freqs.cos()
         sin = freqs.sin()
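
The change above constructs the rotary-embedding cos/sin cache directly on
the GPU instead of on the CPU, which is where the startup savings come
from: for large max_position values it avoids a slow CPU-side computation
followed by a host-to-device copy. Below is a minimal standalone sketch of
the new code path, assuming PyTorch with a CUDA device available; the
base, rotary_dim, and max_position values are illustrative, not taken from
the patch:

    import torch

    base = 10000.0       # RoPE frequency base (illustrative value)
    rotary_dim = 128     # rotary embedding dimension (illustrative value)
    max_position = 8192  # maximum sequence length (illustrative value)

    # Allocate and compute everything on the GPU from the start.
    inv_freq = 1.0 / (base**(
        torch.arange(0, rotary_dim, 2, device="cuda") / rotary_dim))
    t = torch.arange(max_position, device="cuda").float()

    # Outer product of positions and inverse frequencies,
    # shape: (max_position, rotary_dim // 2).
    freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
    cos = freqs.cos()
    sin = freqs.sin()

Note that the patch hard-codes device="cuda", so this construction assumes
a CUDA-capable GPU is present, which vLLM requires at this point anyway.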