
Commit c9d548f

remove window_size and useless code
1 parent 68c4df8 commit c9d548f

3 files changed: +21 -44 lines changed


csrc/flash_attn/flash_api.cpp

Lines changed: 3 additions & 15 deletions
@@ -181,8 +181,6 @@ void set_params_fprop_sparse(Flash_fwd_params &params,
     void *softmax_lse_d,
     float p_dropout,
     float softmax_scale,
-    int window_size_left,
-    int window_size_right,
     const float softcap,
     bool seqlenq_ngroups_swapped=false,
     const bool unpadded_lse=false) {
@@ -200,8 +198,8 @@ void set_params_fprop_sparse(Flash_fwd_params &params,
     softmax_lse_d,
     p_dropout,
     softmax_scale,
-    window_size_left,
-    window_size_right,
+    -1, // window_size_left
+    -1, // window_size_right
     softcap,
     seqlenq_ngroups_swapped,
     unpadded_lse
@@ -353,14 +351,10 @@ mha_fwd_sparse(at::Tensor &q, // batch_size x seqlen_q x num_heads x hea
     const double p_dropout,
     const double softmax_scale,
     bool is_causal,
-    int64_t window_size_left,
-    int64_t window_size_right,
     const double softcap,
     const bool return_softmax,
     c10::optional<at::Generator> gen_) {

-    TORCH_CHECK(window_size_left == -1, "sliding window is not supported in sparse_attn_func.");
-    TORCH_CHECK(window_size_right == -1, "sliding window is not supported in sparse_attn_func.");
     auto dprops = at::cuda::getCurrentDeviceProperties();
     // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
     bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
@@ -398,12 +392,8 @@ mha_fwd_sparse(at::Tensor &q, // batch_size x seqlen_q x num_heads x hea

     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }

-    if (window_size_left >= seqlen_k) { window_size_left = -1; }
-    if (window_size_right >= seqlen_k) { window_size_right = -1; }
-
     // causal=true is the same as causal=false in this case
     if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
-    if (is_causal) { window_size_right = 0; }

     // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
     // H/t Daniel Haziza
@@ -483,8 +473,6 @@ mha_fwd_sparse(at::Tensor &q, // batch_size x seqlen_q x num_heads x hea
     softmax_lse.data_ptr(),
     p_dropout,
     softmax_scale,
-    window_size_left,
-    window_size_right,
     softcap
     );

@@ -1290,7 +1278,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
     ops.def("fwd_sparse(Tensor! q, Tensor k, Tensor v, "
             "Tensor block_count, Tensor block_offset, Tensor column_count, Tensor column_index, "
             "Tensor!? out, Tensor? alibi_slopes, "
-            "float p_dropout, float softmax_scale, bool is_causal, int window_size_left, int window_size_right, "
+            "float p_dropout, float softmax_scale, bool is_causal, "
             "float softcap, bool return_softmax, Generator? gen)"
             "-> Tensor[]");
     ops.impl("fwd_sparse", torch::kCUDA, &mha_fwd_sparse);
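
Seen from Python, the net effect of the schema change above is simply two fewer arguments on the registered fwd_sparse op. A minimal check against a built extension (a sketch, assuming the compiled vllm_flash_attn_c library has already been loaded; _schema is a PyTorch internal attribute and may differ across versions):

import torch

# Assumes the extension has been imported so the custom op is registered via TORCH_LIBRARY.
schema = str(torch.ops.vllm_flash_attn_c.fwd_sparse.default._schema)
assert "window_size_left" not in schema and "window_size_right" not in schema
print(schema)  # ... float p_dropout, float softmax_scale, bool is_causal, float softcap, ...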

csrc/flash_attn/src/flash_fwd_sparse_kernel.h

Lines changed: 16 additions & 20 deletions
@@ -458,15 +458,13 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
     if (n < num_cols_block - 1) {
         #pragma unroll
         for (int m = 0; m < size<1>(tVgVToken); ++m) {
-            if (true) { // Is_even_MN
-                tVgVToken.data() = tVgVTokenData + cols_ptr[n * kBlockN + get<0>(tKVcKV(0, m, 0))] * int64_t(params.v_row_stride);
-                #pragma unroll
-                for (int k = 0; k < size<2>(tVgVToken); ++k) {
-                    if (Is_even_K || tKVpKV(k)) {
-                        cute::copy(gmem_tiled_copy_QKV, tVgVToken(_, m, k), tVsV(_, m, k));
-                    } else if (true) { // Clear_OOB_K
-                        cute::clear(tVsV(_, m, k));
-                    }
+            tVgVToken.data() = tVgVTokenData + cols_ptr[n * kBlockN + get<0>(tKVcKV(0, m, 0))] * int64_t(params.v_row_stride);
+            #pragma unroll
+            for (int k = 0; k < size<2>(tVgVToken); ++k) {
+                if (Is_even_K || tKVpKV(k)) {
+                    cute::copy(gmem_tiled_copy_QKV, tVgVToken(_, m, k), tVsV(_, m, k));
+                } else { // Clear_OOB_K
+                    cute::clear(tVsV(_, m, k));
                 }
             }
         }
@@ -534,16 +532,14 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
     if (n < num_cols_block - 2) {
         #pragma unroll
         for (int m = 0; m < size<1>(tKgKToken); ++m) {
-            if (true) { // Is_even_MN
-                int token_idx = cols_ptr[(n + 1) * kBlockN + get<0>(tKVcKV(0, m, 0))];
-                tKgKToken.data() = tKgKTokenData + token_idx * int64_t(params.k_row_stride);
-                #pragma unroll
-                for (int k = 0; k < size<2>(tKgKToken); ++k) {
-                    if (Is_even_K || tKVpKV(k)) {
-                        cute::copy(gmem_tiled_copy_QKV, tKgKToken(_, m, k), tKsK(_, m, k));
-                    } else if (true) { // Clear_OOB_K
-                        cute::clear(tKsK(_, m, k));
-                    }
+            int token_idx = cols_ptr[(n + 1) * kBlockN + get<0>(tKVcKV(0, m, 0))];
+            tKgKToken.data() = tKgKTokenData + token_idx * int64_t(params.k_row_stride);
+            #pragma unroll
+            for (int k = 0; k < size<2>(tKgKToken); ++k) {
+                if (Is_even_K || tKVpKV(k)) {
+                    cute::copy(gmem_tiled_copy_QKV, tKgKToken(_, m, k), tKsK(_, m, k));
+                } else { // Clear_OOB_K
+                    cute::clear(tKsK(_, m, k));
                 }
             }
         }
@@ -560,7 +556,7 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
         for (int k = 0; k < size<2>(tKgKToken); ++k) {
             if (Is_even_K || tKVpKV(k)) {
                 cute::copy(gmem_tiled_copy_QKV, tKgKToken(_, m, k), tKsK(_, m, k));
-            } else if (true) { // Clear_OOB_K
+            } else { // Clear_OOB_K
                 cute::clear(tKsK(_, m, k));
             }
         }
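
Stripped of the always-true Is_even_MN and Clear_OOB_K guards, the loop's job is unchanged: for each column block it gathers the K/V rows named by cols_ptr (the flattened column index for this query row block) into the shared-memory tile. A rough PyTorch analogue of that row gather, with made-up shapes purely for illustration:

import torch

# Illustrative only: the kernel stages per-thread fragments of a kBlockN-row tile
# into shared memory, but the addressing is the same gather of rows by token index.
v = torch.randn(4096, 128)                 # (seqlen_k, head_dim) for one batch/head
cols = torch.tensor([3, 17, 42, 64, 100])  # slice of cols_ptr for column block n
v_tile = v[cols]                           # rows copied from gmem (tVgVToken) to smem (tVsV)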

vllm_flash_attn/flash_attn_interface.py

Lines changed: 2 additions & 9 deletions
@@ -46,7 +46,7 @@ def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     return 64

 def _sparse_attn_forward(
-    q, k, v, block_count, block_offset, column_count, column_index,dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes, return_softmax, *, out=None
+    q, k, v, block_count, block_offset, column_count, column_index,dropout_p, softmax_scale, causal, softcap, alibi_slopes, return_softmax, *, out=None
 ):
     q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
     out, softmax_lse = torch.ops.vllm_flash_attn_c.fwd_sparse(
@@ -62,8 +62,6 @@ def _sparse_attn_forward(
         dropout_p,
         softmax_scale,
         causal,
-        window_size[0],
-        window_size[1],
         softcap,
         return_softmax,
         None,
@@ -147,7 +145,6 @@ def sparse_attn_func(
     dropout_p=0.0,
     softmax_scale=None,
     causal=False,
-    window_size=(-1, -1), # -1 means infinite context window
     softcap=0.0, # 0.0 means deactivated
     alibi_slopes=None,
     deterministic=False,
@@ -160,8 +157,7 @@ def sparse_attn_func(
     Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
     block_count and block_offset for slash sparsity patterns, and
     column_count and column_index for vertical sparsity patterns.
-    For more details please refer to MInference
-    (Paper: https://arxiv.org/abs/2407.02490, Code: https://github.com/microsoft/MInference).
+    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.

     Arguments:
         q: (batch_size, seqlen, nheads, headdim)
@@ -175,8 +171,6 @@ def sparse_attn_func(
         softmax_scale: float. The scaling of QK^T before applying softmax.
             Default to 1 / sqrt(headdim).
         causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
-        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
-            Sliding window is not supported for sparse_attn_func, so only (-1, -1) is valid.
         alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
             (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
             is added to the attention score of query i and key j.
@@ -204,7 +198,6 @@ def sparse_attn_func(
         dropout_p,
         softmax_scale,
         causal=causal,
-        window_size=window_size,
         softcap=softcap,
         alibi_slopes=alibi_slopes,
         return_softmax=return_attn_probs and dropout_p > 0,
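
For callers, the visible change is that sparse_attn_func (and _sparse_attn_forward) no longer accept window_size; anything other than (-1, -1) was already rejected by the TORCH_CHECKs removed from mha_fwd_sparse. A hedged usage sketch, with q/k/v and the sparsity-index tensors assumed to be prepared as the docstring describes:

# Before this commit, window_size=(-1, -1) was the only accepted value;
# after it, passing window_size raises a TypeError.
out = sparse_attn_func(
    q, k, v,
    block_count, block_offset,    # slash sparsity pattern
    column_count, column_index,   # vertical sparsity pattern
    causal=True,
    softcap=0.0,
)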
