
Commit 8701243

move sparse_attn to new files
1 parent: 0127b55

37 files changed: +864, -846 lines

csrc/flash_attn/src/flash_fwd_kernel.h

Lines changed: 0 additions & 696 deletions (large diff not rendered)
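
The 696 deleted lines are the body of flash::compute_sparse_attn, which this commit moves out of flash_fwd_kernel.h into a new sparse kernel header (per the commit message and the sparse launcher removed below). A hypothetical sketch of the interface that new header would expose; the file name, the include, and everything beyond the template parameters used by the launcher are assumptions:

    // flash_fwd_sparse_kernel.h -- hypothetical interface sketch only; the real
    // body is the 696 lines removed from flash_fwd_kernel.h in this commit.
    #pragma once

    #include "flash_fwd_kernel.h"  // assumption: reuses the dense-kernel building blocks

    namespace flash {

    // Template parameters mirror the flash_fwd_sparse_kernel wrapper deleted from
    // flash_fwd_launch_template.h below.
    template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_local,
             bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap,
             bool Return_softmax, typename Params>
    inline __device__ void compute_sparse_attn(const Params &params);

    }  // namespace flash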

csrc/flash_attn/src/flash_fwd_launch_template.h

Lines changed: 0 additions & 117 deletions
@@ -26,15 +26,6 @@
 template<typename Kernel_traits, __VA_ARGS__> \
 __global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_fwd_params params)
 
-DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_sparse_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax) {
-    #if defined(ARCH_SUPPORTS_FLASH)
-        static_assert(!(Is_causal && Is_local)); // Enforce constraints
-        flash::compute_sparse_attn<Kernel_traits, Is_dropout, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Is_softcap, Return_softmax>(params);
-    #else
-        FLASH_UNSUPPORTED_ARCH
-    #endif
-}
-
 DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax) {
     #if defined(ARCH_SUPPORTS_FLASH)
         static_assert(!(Is_causal && Is_local)); // Enforce constraints
@@ -57,50 +48,6 @@ DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_splitkv_combine_kernel, int kBlockM, int L
     flash::combine_attn_seqk_parallel<Kernel_traits, kBlockM, Log_max_splits, Is_even_K>(params);
 }
 
-template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
-void run_flash_sparse_fwd(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr size_t smem_size = Kernel_traits::kSmemSize;
-    // printf("smem_size = %d\n", smem_size);
-
-    // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
-    // https://github.com/kokkos/kokkos-kernels/issues/349
-    // https://github.com/HazyResearch/flash-attention/issues/21
-
-    const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM;
-    dim3 grid(num_m_block, params.b, params.h);
-    const bool is_even_K = params.d == Kernel_traits::kHeadDim;
-    const bool return_softmax = params.p_ptr != nullptr;
-    EVENK_SWITCH(is_even_K, IsEvenKConst, [&] {
-        BOOL_SWITCH(return_softmax, ReturnSoftmaxConst, [&] {
-            ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] {
-                SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] {
-                    constexpr bool IsEvenMNConst = false;
-                    constexpr bool Is_local = false;
-                    // Will only return softmax if dropout, to reduce compilation time.
-                    // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
-                    // If return_softmax, set IsEvenMNConst to false to reduce number of templates
-                    // If head dim > 128, set IsEvenMNConst to false to reduce number of templates
-                    // If Is_local, set Is_causal to false
-                    auto kernel = &flash_fwd_sparse_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !ReturnSoftmaxConst && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, ReturnSoftmaxConst && Is_dropout && !Is_softcap>;
-                    // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, false, true, true, false>;
-                    // printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout));
-                    // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, true, true, false>;
-                    if (smem_size >= 48 * 1024) {
-                        C10_CUDA_CHECK(cudaFuncSetAttribute(
-                            kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-                    }
-                    // int ctas_per_sm;
-                    // cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-                    //     &ctas_per_sm, kernel, Kernel_traits::kNThreads, smem_size);
-                    // printf("smem_size = %d, CTAs per SM = %d\n", int(smem_size), ctas_per_sm);
-                    kernel<<<grid, Kernel_traits::kNThreads, smem_size, stream>>>(params);
-                    C10_CUDA_KERNEL_LAUNCH_CHECK();
-                });
-            });
-        });
-    });
-}
-
 template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
 void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
     constexpr size_t smem_size = Kernel_traits::kSmemSize;
@@ -407,67 +354,3 @@ void run_mha_fwd_hdim256(Flash_fwd_params &params, cudaStream_t stream) {
         // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
     });
 }
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim32(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 32;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim64(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 64;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim96(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 96;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 128;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim160(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 160;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim192(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 192;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim224(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 224;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
-
-template<typename T, bool Is_causal>
-void run_mha_fwd_sparse_hdim256(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 256;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        run_flash_sparse_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-    });
-}
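
The blocks deleted above (the flash_fwd_sparse_kernel wrapper, run_flash_sparse_fwd, and the run_mha_fwd_sparse_hdim* launchers) presumably move into the new flash_fwd_sparse_launch_template.h that every auto-generated .cu file below now includes. A hedged outline of that header, with bodies elided because they are exactly the code removed above; the include list is an assumption:

    // flash_fwd_sparse_launch_template.h -- hypothetical outline
    #pragma once

    #include "flash_fwd_launch_template.h"  // assumption: for DEFINE_FLASH_FORWARD_KERNEL, the *_SWITCH macros, and Flash_fwd_kernel_traits
    #include "flash_fwd_sparse_kernel.h"    // assumption: for flash::compute_sparse_attn

    // Kernel wrapper, launcher, and per-head-dim entry points; bodies as deleted above.
    DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_sparse_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax);

    template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
    void run_flash_sparse_fwd(Flash_fwd_params &params, cudaStream_t stream);

    // One of these per head dimension: 32, 64, 96, 128, 160, 192, 224, 256.
    template<typename T, bool Is_causal>
    void run_mha_fwd_sparse_hdim128(Flash_fwd_params &params, cudaStream_t stream);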

csrc/flash_attn/src/flash_fwd_sparse_hdim128_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
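
Each of the auto-generated translation units below changes only its include line. A hedged sketch of what one full file then looks like; the forwarding body is an assumption that follows the pattern of the dense per-head-dim files, and the file's first line (a copyright header) is not shown in the diff:

    // Splitting the different head dimensions to different files to speed up compilation.
    // This file is auto-generated. See "generate_kernels.py"

    #include "flash_fwd_sparse_launch_template.h"

    template<>
    void run_mha_fwd_sparse_<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
        // assumed body: forward to the head-dim-128 launcher moved into the sparse launch template
        run_mha_fwd_sparse_hdim128<cutlass::bfloat16_t, true>(params, stream);
    }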

csrc/flash_attn/src/flash_fwd_sparse_hdim128_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim128_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim128_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim160_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim160_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim160_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim160_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim192_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim192_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim192_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim192_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim224_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim224_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim224_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim224_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim256_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim256_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim256_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim256_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim32_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim32_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim32_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim32_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim64_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim64_bf16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim64_fp16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim64_fp16_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::half_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {

csrc/flash_attn/src/flash_fwd_sparse_hdim96_bf16_causal_sm80.cu

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 // Splitting the different head dimensions to different files to speed up compilation.
 // This file is auto-generated. See "generate_kernels.py"
 
-#include "flash_fwd_launch_template.h"
+#include "flash_fwd_sparse_launch_template.h"
 
 template<>
 void run_mha_fwd_sparse_<cutlass::bfloat16_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream) {
