diff --git a/flash_attn/flash_attention.py b/flash_attn/flash_attention.py
index 138a90b..db6804e 100644
--- a/flash_attn/flash_attention.py
+++ b/flash_attn/flash_attention.py
@@ -16,7 +16,7 @@ class FlashAttention(nn.Module):
             (default: 1/sqrt(d_keys) where d_keys is computed at runtime)
         attention_dropout: The dropout rate to apply to the attention
-            (default: 0.1)
+            (default: 0.0)
     """
     def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
         super().__init__()
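
For context, a minimal construction sketch follows. It relies only on the `__init__` signature visible in the hunk above; the forward interface is not part of this change, so the module is merely instantiated here, and the explicit-override values are illustrative.

# Sketch (not part of the diff): assumes only the constructor signature
# shown in the hunk above.
from flash_attn.flash_attention import FlashAttention

# Default construction: attention_dropout is 0.0, matching the corrected docstring.
attn = FlashAttention()

# Explicit overrides (illustrative values): a fixed softmax temperature
# instead of the runtime 1/sqrt(d_keys), and a nonzero dropout rate.
attn_custom = FlashAttention(softmax_scale=0.125, attention_dropout=0.1)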