
Commit 68c4df8

address review comments
1 parent 260da65 commit 68c4df8

File tree

4 files changed: +21 additions, -28 deletions


csrc/flash_attn/flash_api.cpp

Lines changed: 6 additions & 0 deletions
@@ -214,6 +214,10 @@ void set_params_fprop_sparse(Flash_fwd_params &params,
     TORCH_CHECK(column_index.size(2) == block_offset.size(2));
     TORCH_CHECK(column_count.size(2) == column_index.size(2));
     params.NUM_ROWS = block_count.size(2);
+    // params.NUM_ROWS should be equal to cdiv(seqlen_q, BLOCK_M), and BLOCK_M has to be 64 for now.
+    constexpr int BLOCK_M = 64;
+    int expected_num_rows = (seqlen_q + BLOCK_M - 1) / BLOCK_M;
+    TORCH_CHECK(params.NUM_ROWS == expected_num_rows);
     params.NNZ_S = block_offset.size(3);
     params.NNZ_V = column_index.size(3);
 }
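For context, the new TORCH_CHECK ties the NUM_ROWS dimension of the sparsity metadata to the query length. A minimal Python-side sketch of the same cdiv relationship (BLOCK_M = 64 is the fixed row-block size noted in the comment; the helper name is only illustrative):

# The third dimension of block_count / column_count must be ceil(seqlen_q / 64),
# which is what the new check enforces at the C++ boundary.
def expected_num_rows(seqlen_q: int, block_m: int = 64) -> int:
    return (seqlen_q + block_m - 1) // block_m  # cdiv(seqlen_q, BLOCK_M)

assert expected_num_rows(64) == 1
assert expected_num_rows(65) == 2
assert expected_num_rows(1024) == 16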
@@ -355,6 +359,8 @@ mha_fwd_sparse(at::Tensor &q, // batch_size x seqlen_q x num_heads x hea
                const bool return_softmax,
                c10::optional<at::Generator> gen_) {
 
+    TORCH_CHECK(window_size_left == -1, "sliding window is not supported in sparse_attn_func.");
+    TORCH_CHECK(window_size_right == -1, "sliding window is not supported in sparse_attn_func.");
     auto dprops = at::cuda::getCurrentDeviceProperties();
     // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
     bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
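The two added TORCH_CHECKs reject sliding-window arguments before any kernel parameters are set up. A small sketch of how this surfaces on the Python side (a TORCH_CHECK failure is raised as a RuntimeError; the tiny shapes and the positional argument order here are assumptions for illustration, not part of this change):

import pytest
import torch
from vllm_flash_attn import sparse_attn_func

q = torch.randn(1, 64, 1, 64, device="cuda", dtype=torch.float16)
k = torch.randn(1, 64, 1, 64, device="cuda", dtype=torch.float16)
v = torch.randn(1, 64, 1, 64, device="cuda", dtype=torch.float16)
block_count = torch.ones(1, 1, 1, dtype=torch.int32, device="cuda")
block_offset = torch.zeros(1, 1, 1, 1, dtype=torch.int32, device="cuda")
column_count = torch.zeros(1, 1, 1, dtype=torch.int32, device="cuda")
column_index = torch.zeros(1, 1, 1, 1, dtype=torch.int32, device="cuda")

# Any window other than the default (-1, -1) now fails fast with the new message.
with pytest.raises(RuntimeError, match="sliding window is not supported"):
    sparse_attn_func(q, k, v, block_count, block_offset, column_count, column_index,
                     window_size=(128, 0))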

csrc/flash_attn/src/flash_fwd_sparse_kernel.h

Lines changed: 3 additions & 22 deletions
@@ -156,10 +156,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
     // PREDICATES
     //
 
-    // // Allocate predicate tensors for m and n
-    // Tensor tQpQ = make_tensor<bool>(make_shape(size<1>(tQsQ), size<2>(tQsQ)), Stride<_1,_0>{});
-    // Tensor tKVpKV = make_tensor<bool>(make_shape(size<1>(tKsK), size<2>(tKsK)), Stride<_1,_0>{});
-
     // Construct identity layout for sQ and sK
     Tensor cQ = make_identity_tensor(make_shape(size<0>(sQ), size<1>(sQ)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
     Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK)));   // (BLK_N,BLK_K) -> (blk_n,blk_k)
@@ -434,9 +430,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
     if (num_cols > 0) {
         auto* cols_ptr = params.column_index + ((bidb * params.h + bidh) * params.NUM_ROWS + m_block) * params.NNZ_V;
         // We don't need to clear the sK smem tiles since we'll mask out the scores anyway.
-        // tKgKBlock.data() = tKgKBlockData + blks_ptr[0] * int64_t(params.k_row_stride);
-        // flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tKgKBlock, tKsK, tKVcKV, tKVpKV,
-        //                                    binfo.actual_seqlen_k - blks_ptr[0]);
         #pragma unroll
         for (int m = 0; m < size<1>(tKgKToken); ++m) {
             if (Is_even_MN || get<0>(tKVcKV(0, m, 0)) < num_cols) {  // Is_even_MN
@@ -445,7 +438,7 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
                 for (int k = 0; k < size<2>(tKgKToken); ++k) {
                     if (Is_even_K || tKVpKV(k)) {
                         cute::copy(gmem_tiled_copy_QKV, tKgKToken(_, m, k), tKsK(_, m, k));
-                    } else if (true) {  // Clear_OOB_K
+                    } else {  // Clear_OOB_K
                         cute::clear(tKsK(_, m, k));
                     }
                 }
@@ -463,7 +456,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
 
             // Advance gV
             if (n < num_cols_block - 1) {
-                // flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tVgVBlock, tVsV, tKVcKV, tKVpKV);
                 #pragma unroll
                 for (int m = 0; m < size<1>(tVgVToken); ++m) {
                     if (true) {  // Is_even_MN
@@ -480,9 +472,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
                 }
             } else {
                 // Clear the smem tiles to account for predicated off loads
-                // flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/true>(
-                //     gmem_tiled_copy_QKV, tVgVBlock, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - start_n
-                // );
                 #pragma unroll
                 for (int m = 0; m < size<1>(tVgVToken); ++m) {
                     if (Is_even_MN || n * kBlockN + get<0>(tKVcKV(0, m, 0)) < num_cols) {  // Is_even_MN
@@ -491,11 +480,11 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
                         for (int k = 0; k < size<2>(tVgVToken); ++k) {
                             if (Is_even_K || tKVpKV(k)) {
                                 cute::copy(gmem_tiled_copy_QKV, tVgVToken(_, m, k), tVsV(_, m, k));
-                            } else if (true) {  // Clear_OOB_K
+                            } else {  // Clear_OOB_K
                                 cute::clear(tVsV(_, m, k));
                             }
                         }
-                    } else if (true) {  // Clear_OOB_MN
+                    } else {  // Clear_OOB_MN
                         cute::clear(tVsV(_, m, _));
                     }
                 }
@@ -511,9 +500,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
                 flash::apply_softcap(acc_s, params.softcap);
             }
 
-            // mask.template apply_mask<Is_causal, Is_even_MN>(
-            //     acc_s, cols_ptr[n * kBlockN], m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4, kNWarps * 16
-            // );
             if (n >= num_cols_block - n_masking_steps) {
                 Tensor tensor = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout()));
                 const int lane_id = threadIdx.x % 32;
@@ -546,8 +532,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
             flash::cp_async_wait<0>();
             __syncthreads();
             if (n < num_cols_block - 2) {
-                // tKgKBlock.data() = tKgKBlockData + blks_ptr[block_index + 1] * int64_t(params.k_row_stride);
-                // flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tKgKBlock, tKsK, tKVcKV, tKVpKV);
                 #pragma unroll
                 for (int m = 0; m < size<1>(tKgKToken); ++m) {
                     if (true) {  // Is_even_MN
@@ -567,9 +551,6 @@ inline __device__ void sparse_attn_1rowblock(const Params &params, const int bid
                 // isn't right and we get race conditions.
                 cute::cp_async_fence();
             } else if (n == num_cols_block - 2) {
-                // tKgKBlock.data() = tKgKBlockData + blks_ptr[block_index + 1] * int64_t(params.k_row_stride);
-                // flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tKgKBlock, tKsK, tKVcKV, tKVpKV,
-                //                                    binfo.actual_seqlen_k - blks_ptr[block_index + 1]);
                 #pragma unroll
                 for (int m = 0; m < size<1>(tKgKToken); ++m) {
                     if (Is_even_MN || (n + 1) * kBlockN + get<0>(tKVcKV(0, m, 0)) < num_cols) {  // Is_even_MN

tests/test_vllm_flash_attn.py

Lines changed: 7 additions & 4 deletions
@@ -272,12 +272,14 @@ def test_varlen_with_paged_kv(
 @pytest.mark.parametrize("num_heads", [1, 2, 4])
 @pytest.mark.parametrize("head_size", [64, 128, 256])
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("NNZ_S", [1, 2, 3, 7, 15, 32])
 @torch.inference_mode()
 def test_sparse_attention(
     seq_lens: List[Tuple[int, int]],
     num_heads: Tuple[int, int],
     head_size: int,
     dtype: torch.dtype,
+    NNZ_S: int,
 ) -> None:
     torch.set_default_device("cuda")
     torch.cuda.manual_seed_all(0)
@@ -295,12 +297,13 @@ def test_sparse_attention(
         batch_size, seqlen_k, num_heads, head_size, dtype=dtype, requires_grad=False
     )
     NUM_ROWS = (seqlen_q + block_size_M - 1) // block_size_M
-    NNZ_S = seqlen_k // block_size_M // 2
-    NNZ_V = seqlen_k - NNZ_S * block_size_M
+    if NNZ_S * block_size_N > seqlen_k:
+        return
+    NNZ_V = seqlen_k - NNZ_S * block_size_N
     block_count = torch.tensor([NNZ_S] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS)
     column_count = torch.tensor([NNZ_V] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS)
-    block_offset = torch.tensor([[i * block_size_M for i in range(NNZ_S)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS, NNZ_S)
-    column_index = torch.tensor([[NNZ_S * block_size_M + i for i in range(NNZ_V)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS, NNZ_V)
+    block_offset = torch.tensor([[i * block_size_N for i in range(NNZ_S)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS, NNZ_S)
+    column_index = torch.tensor([[NNZ_S * block_size_N + i for i in range(NNZ_V)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(batch_size, num_heads, NUM_ROWS, NNZ_V)
     from vllm_flash_attn import sparse_attn_func, flash_attn_func
     out, lse = sparse_attn_func(
         q,
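The parametrized NNZ_S only changes how the keys are split between the slash blocks and the vertical columns; together they still cover every key position, which is what lets the test compare sparse_attn_func against dense flash_attn_func. A standalone sketch of that coverage invariant (block_size_N = 64 and the other values are illustrative, not taken from the test file):

# Keys [0, NNZ_S * block_size_N) are covered by block_offset, the rest by column_index.
block_size_N = 64
seqlen_k = 512
NNZ_S = 3                                # one of the newly parametrized values
NNZ_V = seqlen_k - NNZ_S * block_size_N  # remaining keys, addressed one column at a time

block_keys = [b + i
              for b in range(0, NNZ_S * block_size_N, block_size_N)
              for i in range(block_size_N)]
column_keys = [NNZ_S * block_size_N + i for i in range(NNZ_V)]

# Blocks and columns partition the full key range, so the sparse result should
# match dense attention on the same inputs.
assert sorted(block_keys + column_keys) == list(range(seqlen_k))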

vllm_flash_attn/flash_attn_interface.py

Lines changed: 5 additions & 2 deletions
@@ -156,10 +156,12 @@ def sparse_attn_func(
     return_softmax_lse=False,
     out=None,
 ):
-    """Compute attention with virtical and slash sparsity patterns.
+    """Compute attention with vertical and slash sparsity patterns.
     Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
     block_count and block_offset for slash sparsity patterns, and
-    column_count and column_index for virtical sparsity patterns.
+    column_count and column_index for vertical sparsity patterns.
+    For more details please refer to MInference
+    (Paper: https://arxiv.org/abs/2407.02490, Code: https://github.com/microsoft/MInference).
 
     Arguments:
         q: (batch_size, seqlen, nheads, headdim)
@@ -174,6 +176,7 @@ def sparse_attn_func(
             Default to 1 / sqrt(headdim).
         causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
         window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+            Sliding window is not supported for sparse_attn_func, so only (-1, -1) is valid.
         alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
             (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
             is added to the attention score of query i and key j.
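For reference, a minimal call sketch that matches the shapes the docstring describes. The positional argument order and the return value are assumptions based on the test above; NUM_ROWS follows the new ceil(seqlen_q / 64) requirement and window_size stays at its (-1, -1) default:

import torch
from vllm_flash_attn import sparse_attn_func

batch, heads, seqlen, headdim = 1, 2, 128, 64
NUM_ROWS, NNZ_S, NNZ_V = 2, 1, 64        # illustrative sizes: ceil(128 / 64) = 2 rows

q = torch.randn(batch, seqlen, heads, headdim, device="cuda", dtype=torch.float16)
k = torch.randn(batch, seqlen, heads, headdim, device="cuda", dtype=torch.float16)
v = torch.randn(batch, seqlen, heads, headdim, device="cuda", dtype=torch.float16)

# Slash pattern: every row block attends to one 64-wide key block starting at offset 0.
block_count = torch.full((batch, heads, NUM_ROWS), NNZ_S, dtype=torch.int32, device="cuda")
block_offset = torch.zeros(batch, heads, NUM_ROWS, NNZ_S, dtype=torch.int32, device="cuda")
# Vertical pattern: every row block also attends to individual key columns 64..127.
column_count = torch.full((batch, heads, NUM_ROWS), NNZ_V, dtype=torch.int32, device="cuda")
column_index = torch.arange(64, 128, dtype=torch.int32, device="cuda").repeat(batch, heads, NUM_ROWS, 1)

out = sparse_attn_func(q, k, v, block_count, block_offset, column_count, column_index)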
