[Bugfix] Fix logit soft cap in flash-attn backend (#7425)

Author: Woosuk Kwon, 2024-08-12 09:58:28 -07:00, committed by GitHub
parent d2bc4510a4
commit cfba4def5d

@@ -563,6 +563,7 @@ class FlashAttentionImpl(AttentionImpl):
                 softmax_scale=self.scale,
                 causal=True,
                 alibi_slopes=self.alibi_slopes,
+                softcap=self.logits_soft_cap,
             ).squeeze(1)
         # Reshape the output tensor.
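
For context, flash-attn's softcap argument applies tanh-based soft capping to the attention logits before the softmax; the bug was that the backend's configured logits_soft_cap was never forwarded in this call, so the cap was silently ignored. Below is a minimal plain-PyTorch sketch of the capping operation itself, not vLLM code; the helper name and toy shapes are illustrative.

import torch

def soft_cap_scores(scores: torch.Tensor, softcap: float) -> torch.Tensor:
    # Bound attention logits smoothly to (-softcap, softcap) via tanh.
    # A value of 0.0 is treated as "capping disabled", mirroring flash-attn's convention.
    if softcap == 0.0:
        return scores
    return softcap * torch.tanh(scores / softcap)

# Toy usage: scores = (Q @ K^T) * scale, capped before the softmax.
q = torch.randn(1, 4, 8)   # (batch, q_len, head_dim)
k = torch.randn(1, 6, 8)   # (batch, kv_len, head_dim)
scale = 8 ** -0.5
scores = torch.matmul(q, k.transpose(-1, -2)) * scale
attn = torch.softmax(soft_cap_scores(scores, softcap=30.0), dim=-1)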