| .. |
|
__init__.py
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
benchmark_attn.py
|
bwd benchmark + small fixes (#1129)
|
2024-08-05 21:27:52 -07:00 |
|
benchmark_flash_attention_fp8.py
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
benchmark_split_kv.py
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
combine.h
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |
|
copy_paged_sm90_tma_cutlass35.hpp
|
Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331)
|
2024-11-12 11:31:37 -08:00 |
|
copy_paged_sm90_tma_cutlass36.hpp
|
Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331)
|
2024-11-12 11:31:37 -08:00 |
|
copy_paged_sm90_tma.hpp
|
Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331)
|
2024-11-12 11:31:37 -08:00 |
|
epilogue_bwd_sm90_tma.hpp
|
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
2024-09-16 14:24:11 -07:00 |
|
epilogue_fwd_sm90_tma.hpp
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_api.cpp
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
flash_attn_interface.py
|
Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331)
|
2024-11-12 11:31:37 -08:00 |
|
flash_bwd_hdim64_bf16_sm90.cu
|
[FA3] Bwd
|
2024-08-01 01:57:06 -07:00 |
|
flash_bwd_hdim64_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_bwd_hdim96_bf16_sm90.cu
|
[FA3] Bwd
|
2024-08-01 01:57:06 -07:00 |
|
flash_bwd_hdim96_fp16_sm90.cu
|
[FA3] Bwd
|
2024-08-01 01:57:06 -07:00 |
|
flash_bwd_hdim128_bf16_sm90.cu
|
[FA3] Bwd
|
2024-08-01 01:57:06 -07:00 |
|
flash_bwd_hdim128_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_bwd_hdim256_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_bwd_kernel.h
|
address comments
|
2024-09-19 22:50:59 -07:00 |
|
flash_bwd_launch_template.h
|
address comments
|
2024-09-19 22:50:59 -07:00 |
|
flash_bwd_postprocess_kernel.h
|
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
2024-09-16 14:24:11 -07:00 |
|
flash_bwd_preprocess_kernel.h
|
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
2024-09-16 14:24:11 -07:00 |
|
flash_fwd_hdim64_bf16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_bf16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_bf16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_bf16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_bf16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_bf16_sm90.cu
|
[FA3] BF16 forward
|
2024-07-14 23:39:46 -07:00 |
|
flash_fwd_hdim64_e4m3_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_e4m3_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_e4m3_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_e4m3_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_e4m3_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_e4m3_sm90.cu
|
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
2024-07-30 14:14:14 -07:00 |
|
flash_fwd_hdim64_fp16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_fp16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_fp16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_fp16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_fp16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim64_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_fwd_hdim128_bf16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_bf16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_bf16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_bf16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_bf16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_bf16_sm90.cu
|
[FA3] BF16 forward
|
2024-07-14 23:39:46 -07:00 |
|
flash_fwd_hdim128_e4m3_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_e4m3_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_e4m3_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_e4m3_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_e4m3_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_e4m3_sm90.cu
|
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
2024-07-30 14:14:14 -07:00 |
|
flash_fwd_hdim128_fp16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_fp16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_fp16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_fp16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_fp16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim128_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_fwd_hdim256_bf16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_bf16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_bf16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_bf16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_bf16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_bf16_sm90.cu
|
[FA3] BF16 forward
|
2024-07-14 23:39:46 -07:00 |
|
flash_fwd_hdim256_e4m3_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_e4m3_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_e4m3_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_e4m3_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_e4m3_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_e4m3_sm90.cu
|
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
2024-07-30 14:14:14 -07:00 |
|
flash_fwd_hdim256_fp16_gqa2_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_fp16_gqa4_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_fp16_gqa8_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_fp16_gqa16_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_fp16_gqa32_sm90.cu
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_hdim256_fp16_sm90.cu
|
FA3 initial code release
|
2024-07-11 09:53:36 -07:00 |
|
flash_fwd_kernel.h
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
flash_fwd_launch_template.h
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
flash.h
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
kernel_traits.h
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
mainloop_bwd_sm90_tma_gmma_ws.hpp
|
address comments
|
2024-09-19 22:50:59 -07:00 |
|
mainloop_fwd_sm90_tma_gmma_ws.hpp
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
named_barrier.hpp
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |
|
seq_len.h
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
setup.py
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
softmax.h
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |
|
static_switch.h
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
test_attn_kvcache.py
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
test_flash_attn.py
|
Paged Attention support for FA3 (#1268)
|
2024-11-09 17:05:01 -08:00 |
|
test_kvcache.py
|
FA3 kvcache + split kv + gqa parallelization (#1236)
|
2024-10-15 00:21:22 -07:00 |
|
tile_scheduler_bwd.hpp
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |
|
tile_scheduler.hpp
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |
|
utils.h
|
Make namespace comment consistent (#1305)
|
2024-10-30 22:32:49 -07:00 |