vllm-project
diff --git a/‎csrc/cutlass b/‎csrc/cutlass
diff --git a/‎hopper/benchmark_attn.py
Lines changed: 2 additions & 2 deletions b/‎hopper/benchmark_attn.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎hopper/benchmark_split_kv.py
Lines changed: 35 additions & 29 deletions b/‎hopper/benchmark_split_kv.py
Lines changed: 35 additions & 29 deletions
diff --git a/‎hopper/block.h
Lines changed: 89 additions & 0 deletions b/‎hopper/block.h
Lines changed: 89 additions & 0 deletions
diff --git a/‎hopper/epilogue_bwd.hpp
Lines changed: 1 addition & 1 deletion b/‎hopper/epilogue_bwd.hpp
Lines changed: 1 addition & 1 deletion
@@ -355,7 +355,7 @@ def run(*args, **kwargs):
                 m1 = time_fwd(flash_attn_func_v3, q, k if page_size is None else k_paged, v_fa3 if page_size is None else v_paged, causal=causal, window_size=window_size, sink_token_length=sink_token_length, softcap=softcap, num_splits=num_splits, pack_gqa=pack_gqa, repeats=repeats, verbose=verbose, desc='Fav3')
                 # pytorch_profiler(flash_attn_func_v3, q, k if page_size is None else k_paged, v_fa3 if page_size is None else v_paged, page_table=page_table, causal=causal, window_size=window_size, softcap=softcap, num_splits=num_splits, pack_gqa=pack_gqa)
             else:
-                m1 = time_fwd(flash_attn_varlen_func_v3, q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, None, None, seqlen_q, seqlen, causal=causal, window_size=window_size, softcap=softcap, num_splits=num_splits, pack_gqa=pack_gqa, repeats=repeats, verbose=verbose, desc='Fav3')
+                m1 = time_fwd(flash_attn_varlen_func_v3, q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen, causal=causal, window_size=window_size, softcap=softcap, num_splits=num_splits, pack_gqa=pack_gqa, repeats=repeats, verbose=verbose, desc='Fav3')
                 # pytorch_profiler(flash_attn_varlen_func_v3, q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen, causal=causal, window_size=window_size, softcap=softcap, num_splits=num_splits)
             time_f[(causal, headdim, batch_size, seqlen), "Flash3"] = m1.mean
             if dtype != torch.float8_e4m3fn and headdim == headdim_v:
@@ -364,7 +364,7 @@ def run(*args, **kwargs):
                     _, m1b = benchmark_backward(flash_attn_func_v3, q, k, v, causal=causal, window_size=window_size, sink_token_length=sink_token_length, softcap=softcap, deterministic=deterministic,
                                                 repeats=repeats, verbose=False, desc='Fav3')
                 else:
-                    _, m1b = benchmark_backward(flash_attn_varlen_func_v3, q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, None, None, seqlen_q, seqlen, causal=causal, window_size=window_size, softcap=softcap, deterministic=deterministic,
+                    _, m1b = benchmark_backward(flash_attn_varlen_func_v3, q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen, causal=causal, window_size=window_size, softcap=softcap, deterministic=deterministic,
                                                 repeats=repeats, verbose=False, desc='Fav3')
                 time_b[(causal, headdim, batch_size, seqlen), "Flash3"] = m1b.mean
                 # time.sleep(1)
 
@@ -18,13 +18,13 @@ def timeit(fn, *args, **kwargs):
     # Warmup
     for _ in range(5):
         fn(*args, **kwargs)
-    
+
     # Benchmark using PyTorch Timer
     t = benchmark.Timer(
         stmt='fn(*args, **kwargs)',
         globals={'fn': fn, 'args': args, 'kwargs': kwargs}
     )
-    
+
     # Measure execution time
     measurement = t.timeit(20)  # Runs the function 20 times
     # measurement = t.blocked_autorange(min_run_time=1)
@@ -38,14 +38,15 @@ def main():
     ).multi_processor_count
 
     max_splits = 129
-    check_all_splits = False
+    check_all_splits = True
 
     causal = True
     # causal = False
     # dtype=torch.float16
     dtype=torch.bfloat16
+    tp_degree = 1
 
-    torch.manual_seed(42)  
+    torch.manual_seed(42)
 
     model_configs = [
         # ("Gemma-2-2B", 8, 4, 256),
@@ -56,6 +57,7 @@ def main():
         # ("Qwen-2.5-7B", 28, 4, 128),
         # ("Llama-3.1-8B", 32, 8, 128),
         ("Llama-3.1-70B", 64, 8, 128),
+        # ("Mistral Large", 96, 8, 128),
         # ("Llama-3.1-405B", 128, 8, 128),
         # ("Llama-3.2-1B", 32, 8, 64),
         # ("Llama-3.2-3B", 24, 8, 128),
@@ -66,28 +68,32 @@ def main():
 
     all_batch_configs.extend(itertools.product(
         # [1024, 2048, 4096, 8192, 16384, 32768, 131072],  # context_seqlen
-        [4096, 16384, 65536],  # context_seqlen
-        # [131072],  # context_seqlen
+        # [4096, 16384, 65536],  # context_seqlen
+        [131072],  # context_seqlen
         # [i for i in range(1, (num_sms) + 1)], # num_requests
         [1, 4, 8, 16],  # num_requests
         # [1],  # num_requests
-        [1, 4, 8, 16],  # query_seqlen
-        # [1],  # query_seqlen
+        # [1, 4, 8, 16],  # query_seqlen
+        [1],  # query_seqlen
     ))
 
     num_caches = max(reqs for _, reqs, _ in all_batch_configs)
     cache_seqlen = max(seqlen for seqlen, _, _ in all_batch_configs)
 
     for model_name, nheads_q, nheads_kv, headdim in model_configs:
+        assert nheads_kv % tp_degree == 0
+        print(f"***{model_name}***")
+        print(f"QHEADS:{nheads_q}, KVHEADS:{nheads_kv}, HEADDIM:{headdim}, TP:{tp_degree}")
+        nheads_q //= tp_degree
+        nheads_kv //= tp_degree
+
         k_cache = torch.randn(
             (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype
         )
         v_cache = torch.randn(
             (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype
         )
-        print(f"***{model_name}***")
-        print(f"QHEADS:{nheads_q}, KVHEADS:{nheads_kv}, HEADDIM:{headdim}")
-        
+
         if check_all_splits is False:
             print(f"{'CONTEXT':<9}{'BSZ':<5}{'QLEN':<6}{'FA2':<10}{'FA3':<9}{'RATIO':<7}{'GB/s':<10}")
 
@@ -139,7 +145,7 @@ def main():
                 cache_seqlens=cache_seqlens,
                 cache_batch_idx=cache_idxs,
                 causal=causal,
-                gqa_parallel=False,
+                pack_gqa=False,
                 num_splits=1,
             ) * 1000. * 1000.
 
@@ -151,16 +157,16 @@ def main():
                 cache_seqlens=cache_seqlens,
                 cache_batch_idx=cache_idxs,
                 causal=causal,
-                gqa_parallel=True,
+                pack_gqa=True,
                 num_splits=0,
-                max_seqlen_k_hint=context_seqlen
+                # max_seqlen_k_hint=context_seqlen
             ) * 1000. * 1000.
 
             if check_all_splits:
-            
+
                 fa3_fastest_num_splits = 0
                 fa3_fastest_splitk_time = float("inf")
-                
+
                 for num_splits in range(1, max_splits):
                     t = timeit(
                         flash_attn_interface.flash_attn_with_kvcache,
@@ -170,7 +176,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=False,
+                        pack_gqa=False,
                         num_splits=num_splits
                     ) * 1000. * 1000.
 
@@ -181,7 +187,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=False,
+                        pack_gqa=False,
                         num_splits=num_splits
                     )
 
@@ -192,7 +198,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=False,
+                        pack_gqa=False,
                         num_splits=1
                     )
 
@@ -220,7 +226,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=True,
+                        pack_gqa=True,
                         num_splits=num_splits
                     ) * 1000. * 1000.
 
@@ -231,7 +237,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=True,
+                        pack_gqa=True,
                         num_splits=num_splits
                     )
 
@@ -242,7 +248,7 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=True,
+                        pack_gqa=True,
                         num_splits=1
                     )
 
@@ -257,7 +263,7 @@ def main():
                     if t < fa3_fastest_splitk_time_gqa:
                         fa3_fastest_splitk_time_gqa = t
                         fa3_fastest_num_splits_gqa = num_splits
-                
+
                 efficiency = (num_work_tiles * fa3_fastest_num_splits_gqa)/num_sms
                 heuristic_ratio = fa3_time_gqa_heuristic/fa3_fastest_splitk_time_gqa
                 # remeasure to smooth anomalies
@@ -271,11 +277,11 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=True,
+                        pack_gqa=True,
                         # num_splits=num_splits_select,
                         # num_splits=1,
                         num_splits=0,
-                        max_seqlen_k_hint=context_seqlen
+                        # max_seqlen_k_hint=context_seqlen
                     ) * 1000. * 1000.
 
                     fa3_fastest_splitk_time_gqa = timeit(
@@ -286,9 +292,9 @@ def main():
                         cache_seqlens=cache_seqlens,
                         cache_batch_idx=cache_idxs,
                         causal=causal,
-                        gqa_parallel=True,
+                        pack_gqa=True,
                         num_splits=fa3_fastest_num_splits_gqa
-                    ) * 1000. * 1000. 
+                    ) * 1000. * 1000.
 
             if check_all_splits is True:
                 print(
@@ -308,7 +314,7 @@ def main():
                     # f"RATIO (FA2/3):{fa2_time_heuristic/fa3_time_gqa_heuristic:.2f}, "
                     f"RATIO:{fa3_time_gqa_heuristic/fa3_fastest_splitk_time_gqa:.2f}, "
                     f"EFF:{efficiency:.2f}, "
-                    f"GB/s:{bytes_kv/fa3_time_gqa_heuristic * 1e-3:.2f}" 
+                    f"GB/s:{bytes_kv/fa3_time_gqa_heuristic * 1e-3:.2f}"
                 )
 
             if check_all_splits is False:
@@ -322,4 +328,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+namespace flash {
+
+template <class SeqlenInfo_t, int kBlockM, int kBlockN, bool Is_causal, bool Is_local, bool PackGQA=false, bool Split=false>
+struct BlockMN {  // Static helpers that turn sequence lengths and mask settings into {min, max} block-index bounds; kBlockM / kBlockN are the block sizes the element indices are divided by.
+    // Returns {n_block_min, n_block_max}: bounds on kBlockN-sized blocks along seqlen_k for query block m_block, narrowed by the causal/local mask and, when Split, by this split's share.
+    static
+    CUTLASS_DEVICE
+    cute::tuple<int, int> get_n_block_min_max(
+            SeqlenInfo_t const& seqlen_info,  // only seqlen_q and seqlen_k are read here
+            int const m_block, int const bidb, int const split_idx, int const num_splits,  // bidb is not read in this function; split_idx / num_splits only when Split
+            int const window_size_left, int const window_size_right,  // left is read only when Is_local; right only when Is_causal || Is_local
+            cutlass::FastDivmod const& qhead_per_khead_divmod) {  // consulted only when PackGQA
+        // n_block_max starts as the number of kBlockN blocks needed to cover seqlen_k, i.e. one past the last block index overlapping seqlen_k.
+        int const seqlen_k = seqlen_info.seqlen_k;
+        int const seqlen_q = seqlen_info.seqlen_q;
+        int n_block_max = cute::ceil_div(seqlen_k, kBlockN);
+        if constexpr (Is_causal || Is_local) {
+            int m_idx_max = (m_block + 1) * kBlockM;  // one past the last row index of block m_block
+            // TODO: check off-by-1 error
+            if (PackGQA) { m_idx_max = qhead_per_khead_divmod.divide(m_idx_max - 1) + 1 ; }  // NOTE(review): presumably maps a packed (row, q-head) index back to a sequence row - confirm against the PackGQA layout
+            n_block_max = std::min(n_block_max,  // adding seqlen_k - seqlen_q maps row index seqlen_q onto key index seqlen_k before the right window is applied
+                                   cute::ceil_div(m_idx_max + seqlen_k - seqlen_q + window_size_right, kBlockN));
+        }
+        int n_block_min = 0;
+        if constexpr (Is_local) {
+            int m_idx_min = m_block * kBlockM;  // first row index of block m_block
+            if (PackGQA) { m_idx_min = qhead_per_khead_divmod.divide(m_idx_min); }
+            n_block_min = std::max(int(0), (m_idx_min + seqlen_k - seqlen_q - window_size_left) / kBlockN);  // clamp so a window reaching before key 0 starts at block 0
+        }
+        // if (threadIdx.x == 128) { printf("Inside, bid.x = %d, bid.y = %d, bid.z = %d, split_idx = %d, n_block_min: %d, n_block_max: %d\n", blockIdx.x, blockIdx.y, blockIdx.z, split_idx, n_block_min, n_block_max); }
+        if constexpr (Split) {  // carve the range into num_splits chunks of ceil((max - min) / num_splits) blocks and keep chunk number split_idx
+            int num_n_blocks_per_split = n_block_max <= n_block_min ? 0 : cute::ceil_div(n_block_max - n_block_min, num_splits);
+            n_block_min = n_block_min + split_idx * num_n_blocks_per_split;
+            n_block_max = std::min(n_block_min + num_n_blocks_per_split, n_block_max);  // because of the round-up, trailing splits can end with n_block_max <= n_block_min (an empty range)
+        }
+        // if (threadIdx.x == 128) { printf("After split, inside, bid.y = %d, bid.z = %d, split_idx = %d, n_block_min: %d, n_block_max: %d\n", blockIdx.y, blockIdx.z, split_idx, n_block_min, n_block_max); }
+        return {n_block_min, n_block_max};
+    }
+    // Returns {n_block_new_min, n_block_new_max}: the get_n_block_min_max range re-expressed as block indices measured from seqlen_info.seqlen_k_og and capped by seqlen_info.seqlen_k_new.
+    static
+    CUTLASS_DEVICE
+    cute::tuple<int, int> get_n_block_k_new_min_max(
+            SeqlenInfo_t const& seqlen_info,  // reads seqlen_k_og and seqlen_k_new here, plus whatever get_n_block_min_max reads
+            int const m_block, int const bidb, int const split_idx, int const num_splits,
+            int const window_size_left, int const window_size_right,
+            cutlass::FastDivmod const& qhead_per_khead_divmod) {
+        // NOTE(review): presumably seqlen_k_og is the key length before new keys are appended and seqlen_k_new the number appended - confirm against SeqlenInfo_t.
+        auto [n_block_min, n_block_max] = get_n_block_min_max(
+            seqlen_info, m_block, bidb, split_idx, num_splits,
+            window_size_left, window_size_right, qhead_per_khead_divmod);
+        int const idx_k_new_min = std::max(n_block_min * kBlockN - seqlen_info.seqlen_k_og, 0);  // element offset past seqlen_k_og, clamped at 0
+        int const idx_k_new_max = std::min(n_block_max * kBlockN - seqlen_info.seqlen_k_og, seqlen_info.seqlen_k_new);  // capped at seqlen_k_new
+        int const n_block_new_min = idx_k_new_min / kBlockN;
+        int const n_block_new_max = idx_k_new_max > idx_k_new_min ? cute::ceil_div(idx_k_new_max, kBlockN) : n_block_new_min;  // min == max signals an empty range
+        // if (threadIdx.x == 128 && m_block == 0) { printf("bidb = %d, seqlen_k_new = %d, seqlen_k_og = %d, n_block_min = %d, n_block_max = %d, idx_k_new_min = %d, idx_k_new_max = %d, n_block_new_min = %d, n_block_new_max = %d\n", bidb, seqlen_info.seqlen_k_new, seqlen_info.seqlen_k_og, n_block_min, n_block_max, idx_k_new_min, idx_k_new_max, n_block_new_min, n_block_new_max);}
+        return {n_block_new_min, n_block_new_max};
+    }
+    // Returns {m_block_min, m_block_max}: bounds on kBlockM-sized blocks along seqlen_q for key block n_block (the transpose of get_n_block_min_max; no PackGQA or Split handling here).
+    static
+    CUTLASS_DEVICE
+    cute::tuple<int, int> get_m_block_min_max(
+            SeqlenInfo_t const& seqlen_info,  // only seqlen_q and seqlen_k are read here
+            int const n_block, int const bidb,  // bidb is not read in this function
+            int const window_size_left, int const window_size_right, int const sink_token_length) {  // left and sink_token_length are read only when Is_local
+        // m_block_max starts as the number of kBlockM blocks needed to cover seqlen_q.
+        int const seqlen_q = seqlen_info.seqlen_q;
+        int const seqlen_k = seqlen_info.seqlen_k;
+        int m_block_max = cute::ceil_div(seqlen_q, kBlockM);
+        if constexpr (Is_local) {
+            if (n_block >= cute::ceil_div(sink_token_length, kBlockN)) {  // key blocks below ceil_div(sink_token_length, kBlockN) skip the left-window cap and keep the full m range
+                m_block_max = std::min(m_block_max, cute::ceil_div((n_block + 1) * kBlockN + seqlen_q - seqlen_k + window_size_left, kBlockM));
+            }
+        }
+        int m_block_min = 0;
+        if constexpr (Is_causal || Is_local) {
+            m_block_min = std::max(m_block_min, (n_block * kBlockN + seqlen_q - seqlen_k - window_size_right) / kBlockM);  // max with the initial 0 keeps the lower bound non-negative
+        }
+        return {m_block_min, m_block_max};
+    }
+
+};
+
+} // namespace flash
@@ -238,7 +238,7 @@ struct CollectiveEpilogueBwd {
             Tensor tdKVsdK = gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K)
             Tensor tdKVrdV = make_fragment_like(tdKVgdV);
             Tensor tdKVrdK = make_fragment_like(tdKVgdK);
-            Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{}));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+            Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{}));  // (BLK_N,BLK_K) -> (blk_n,blk_k)
             // Repeat the partitioning with identity layouts
             Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
             Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdV)));