
Commit e0faa9a

update binding
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
1 parent 1012435 commit e0faa9a

4 files changed: +181 −10 lines

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.26)
 
-project(vllm_flash_attn LANGUAGES CXX)
+project(vllm_flash_attn LANGUAGES CXX CUDA)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
@@ -213,7 +213,9 @@ if (FA3_ENABLED AND ${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.0)
     SRCS "${FA3_GEN_SRCS}"
     CUDA_ARCHS "${FA3_ARCHS}")
   set_gencode_flags_for_srcs(
-    SRCS "hopper/flash_fwd_combine.cu"
+    SRCS
+      hopper/flash_fwd_combine.cu
+      hopper/flash_prepare_scheduler.cu
     CUDA_ARCHS "${FA3_ARCHS}")
 endif()
 
@@ -223,6 +225,7 @@ if (FA3_ENABLED AND ${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.0)
     LANGUAGE ${VLLM_GPU_LANG}
     SOURCES
       hopper/flash_fwd_combine.cu
+      hopper/flash_prepare_scheduler.cu
      hopper/flash_api.cpp
      hopper/flash_api_torch_lib.cpp
      ${FA3_GEN_SRCS}

hopper/flash_api_torch_lib.cpp

Lines changed: 67 additions & 3 deletions
@@ -9,6 +9,14 @@
  * Externs for the flash_attn ops to be exposed as a pytorch library
  */
 
+// b: batch_size
+// b_k: batch_size_k
+// s_q: seqlen_q
+// s_k: seqlen_k
+// s_k_new: seqlen_k_new
+// h: num_heads
+// h_k: num_heads_k
+// d: head_size
 std::vector<at::Tensor>
 mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seqlens_q
         const at::Tensor &k, // (b_k, s_k, h_k, d) or (total_k, h_k, d) if there is cu_seqlens_k or (num_pages, page_size, h_k, d) if there is page_table.
@@ -37,12 +45,41 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         bool is_causal,
         int window_size_left,
         int window_size_right,
-        int sink_token_length,
         float const softcap,
         bool const is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
+        std::optional<at::Tensor> &scheduler_metadata_, // (b + 1)
         int num_splits,
         std::optional<bool> pack_gqa_,
-        int const sm_margin);
+        int const sm_margin
+);
+
+// Only applicable to the case where seqused_k (i.e. cache_seqlens) is available
+at::Tensor
+mha_fwd_get_scheduler_metadata(
+        int batch_size,
+        int max_seqlen_q,
+        int max_seqlen_k,
+        int num_heads,
+        int num_heads_k,
+        int headdim,
+        int headdim_v,
+        at::ScalarType qkv_dtype,
+        const at::Tensor &seqused_k, // b
+        std::optional<const at::Tensor> &cu_seqlens_q_, // b+1
+        std::optional<const at::Tensor> &cu_seqlens_k_, // b+1
+        std::optional<const at::Tensor> &cu_seqlens_k_new_, // b+1
+        std::optional<const at::Tensor> &seqused_q_, // b. If given, only this many elements of each batch element's queries and outputs are used.
+        std::optional<const at::Tensor> &leftpad_k_, // b
+        std::optional<int> page_size,
+        int max_seqlen_k_new, // 0 means we're not appending new KV
+        bool is_causal,
+        int window_size_left,
+        int window_size_right,
+        bool has_softcap,
+        int num_splits,
+        std::optional<bool> pack_gqa_,
+        int const sm_margin
+);
 
 /**
  * Torch Library Registration
@@ -74,13 +111,40 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
             " bool is_causal,"
             " int window_size_left,"
             " int window_size_right,"
-            " int sink_token_length,"
             " float softcap,"
             " bool is_rotary_interleaved,"
+            " Tensor? scheduler_metadata,"
             " int num_splits,"
             " bool? pack_gqa,"
             " int sm_margin) -> Tensor[]");
     ops.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
+
+    ops.def("get_scheduler_metadata("
+            " int batch_size,"
+            " int max_seqlen_q,"
+            " int max_seqlen_k,"
+            " int num_heads,"
+            " int num_heads_k,"
+            " int headdim,"
+            " int headdim_v,"
+            " ScalarType qkv_dtype,"
+            " Tensor seqused_k,"
+            " Tensor? cu_seqlens_q,"
+            " Tensor? cu_seqlens_k,"
+            " Tensor? cu_seqlens_k_new,"
+            " Tensor? seqused_q,"
+            " Tensor? leftpad_k,"
+            " int? page_size,"
+            " int max_seqlen_k_new," // 0 means we're not appending new KV
+            " bool is_causal,"
+            " int window_size_left,"
+            " int window_size_right,"
+            " bool has_softcap,"
+            " int num_splits,"
+            " bool? pack_gqa,"
+            " int sm_margin) -> Tensor");
+    ops.impl("get_scheduler_metadata", torch::kCUDA,
+             make_pytorch_shim(&mha_fwd_get_scheduler_metadata));
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME);
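
For orientation (not part of the diff): a minimal sketch of calling the newly registered op directly through torch.ops, with made-up shapes and values. In practice the get_scheduler_metadata() wrapper added in vllm_flash_attn/flash_attn_interface.py below passes these arguments in the order defined by the schema above.

import torch

# Hypothetical decode-only batch: 4 sequences, 16 query heads, 2 KV heads, head size 128.
seqused_k = torch.full((4,), 1024, dtype=torch.int32, device="cuda")

metadata = torch.ops._vllm_fa3_C.get_scheduler_metadata(
    4, 1, 1024,         # batch_size, max_seqlen_q, max_seqlen_k
    16, 2, 128, 128,    # num_heads, num_heads_k, headdim, headdim_v
    torch.bfloat16,     # qkv_dtype
    seqused_k,          # seqused_k (i.e. cache_seqlens)
    None, None, None,   # cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new
    None, None,         # seqused_q, leftpad_k
    None,               # page_size
    0,                  # max_seqlen_k_new (0 means no new KV is appended)
    True,               # is_causal
    -1, -1,             # window_size_left, window_size_right
    False,              # has_softcap
    0,                  # num_splits
    None,               # pack_gqa
    0,                  # sm_margin
)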

tests/test_vllm_flash_attn.py

Lines changed: 45 additions & 1 deletion
@@ -13,7 +13,8 @@
 from vllm_flash_attn.flash_attn_interface import (
     flash_attn_varlen_func,
     flash_attn_with_kvcache,
-    is_fa_version_supported
+    get_scheduler_metadata,
+    is_fa_version_supported,
 )
 
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
@@ -185,6 +186,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("aot_schedule", [True, False])
 @pytest.mark.parametrize("fa_version", VERSIONS)
 @torch.inference_mode()
 def test_flash_attn_with_paged_kv(
@@ -195,6 +197,7 @@ def test_flash_attn_with_paged_kv(
     block_size: int,
     soft_cap: Optional[float],
     num_blocks: int,
+    aot_schedule: bool,
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
@@ -221,6 +224,24 @@ def test_flash_attn_with_paged_kv(
                                  (num_seqs, max_num_blocks_per_seq),
                                  dtype=torch.int32)
 
+    scheduler_metadata = None
+    if aot_schedule:
+        if fa_version == 2:
+            pytest.skip("AOT schedule is not supported in version 2")
+        scheduler_metadata = get_scheduler_metadata(
+            batch_size=num_seqs,
+            max_seqlen_q=1,
+            max_seqlen_k=max_kv_len,
+            num_heads_q=num_query_heads,
+            num_heads_kv=num_kv_heads,
+            headdim=head_size,
+            cache_seqlens=kv_lens_tensor,
+            qkv_dtype=dtype,
+            causal=True,
+            window_size=(-1, -1),
+            has_softcap=soft_cap is not None
+        )
+
     output = flash_attn_with_kvcache(
         query.unsqueeze(1),
         key_cache,
@@ -230,6 +251,7 @@ def test_flash_attn_with_paged_kv(
         block_table=block_tables,
         cache_seqlens=kv_lens_tensor,
         softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=scheduler_metadata,
         fa_version=fa_version,
     ).squeeze(1)
 
@@ -255,6 +277,7 @@ def test_flash_attn_with_paged_kv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("aot_schedule", [True, False])
 @pytest.mark.parametrize("fa_version", VERSIONS)
 @torch.inference_mode()
 def test_varlen_with_paged_kv(
@@ -266,6 +289,7 @@ def test_varlen_with_paged_kv(
     block_size: int,
     soft_cap: Optional[float],
     num_blocks: int,
+    aot_schedule: bool,
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
@@ -303,6 +327,25 @@ def test_varlen_with_paged_kv(
                                  num_blocks,
                                  (num_seqs, max_num_blocks_per_seq),
                                  dtype=torch.int32)
+
+    scheduler_metadata = None
+    if aot_schedule:
+        if fa_version == 2:
+            pytest.skip("AOT schedule is not supported in version 2")
+        scheduler_metadata = get_scheduler_metadata(
+            batch_size=num_seqs,
+            max_seqlen_q=1,
+            max_seqlen_k=max_kv_len,
+            num_heads_q=num_query_heads,
+            num_heads_kv=num_kv_heads,
+            headdim=head_size,
+            cache_seqlens=seqused_k,
+            qkv_dtype=dtype,
+            causal=True,
+            window_size=(-1, -1),
+            has_softcap=soft_cap is not None
+        )
+
     output = flash_attn_varlen_func(
         q=query,
         k=key_cache,
@@ -316,6 +359,7 @@ def test_varlen_with_paged_kv(
         window_size=window_size,
         block_table=block_tables,
         softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=scheduler_metadata,
         fa_version=fa_version
     )

vllm_flash_attn/flash_attn_interface.py

Lines changed: 64 additions & 4 deletions
@@ -73,6 +73,48 @@ def fa_version_unsupported_reason(fa_version: int, device = None) \
 def maybe_contiguous(x):
     return x.contiguous() if x is not None and x.stride(-1) != 1 else x
 
+# NOTE only used in FA3
+def get_scheduler_metadata(
+    batch_size, max_seqlen_q, max_seqlen_k, num_heads_q, num_heads_kv, headdim,
+    cache_seqlens: torch.Tensor,
+    qkv_dtype=torch.bfloat16,
+    headdim_v=None,
+    cu_seqlens_q: Optional[torch.Tensor] = None,
+    cu_seqlens_k_new: Optional[torch.Tensor] = None,
+    cache_leftpad: Optional[torch.Tensor] = None,
+    page_size: Optional[int] = None,
+    max_seqlen_k_new=0,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    has_softcap=False,
+    num_splits=0,  # Can be tuned for speed
+    pack_gqa=None,  # Can be tuned for speed
+    sm_margin=0,  # Can be tuned if some SMs are used for communication
+):
+    cache_seqlens = maybe_contiguous(cache_seqlens)
+    if headdim_v is None:
+        headdim_v = headdim
+    scheduler_metadata = torch.ops._vllm_fa3_C.get_scheduler_metadata(
+        batch_size, max_seqlen_q, max_seqlen_k, num_heads_q, num_heads_kv, headdim, headdim_v,
+        qkv_dtype,
+        cache_seqlens,
+        cu_seqlens_q,
+        None,  # cu_seqlens_k
+        cu_seqlens_k_new,
+        None,  # seqused_q
+        cache_leftpad,
+        page_size,
+        max_seqlen_k_new,
+        causal,
+        window_size[0], window_size[1],
+        has_softcap,
+        num_splits,
+        pack_gqa,
+        sm_margin,
+    )
+
+    return scheduler_metadata
+
 
 def flash_attn_varlen_func(
     q,
@@ -95,10 +137,13 @@ def flash_attn_varlen_func(
     block_table=None,
     return_softmax_lse=False,
     out=None,
-    fa_version: int = DEFAULT_FA_VERSION,
+    # FA3 Only
+    scheduler_metadata=None,
     q_descale=None,
     k_descale=None,
     v_descale=None,
+    # Version selector
+    fa_version: int = DEFAULT_FA_VERSION,
 ):
     """dropout_p should be set to 0.0 during evaluation
     Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
@@ -173,6 +218,12 @@ def flash_attn_varlen_func(
         dummy_cu_seqlens_k = torch.empty_like(cu_seqlens_q)
 
     if fa_version == 2:
+        if scheduler_metadata is not None and q_descale is not None \
+                and k_descale is not None and v_descale is not None:
+            raise NotImplementedError(
+                "FA2 does not support scheduler_metadata, q_descale, "
+                "k_descale, v_descale"
+            )
         out, softmax_lse = torch.ops._vllm_fa2_C.varlen_fwd(
             q, k, v,
             out,
@@ -216,9 +267,9 @@ def flash_attn_varlen_func(
             softmax_scale,
             causal,
             real_window_size[0], real_window_size[1],
-            0, # sink_token_length
             softcap,
             True, # rotary_interleaved
+            scheduler_metadata,
             0, # num_splits
             None, # pack_gqa
             0, # sm_margin
@@ -250,10 +301,13 @@ def flash_attn_with_kvcache(
     return_softmax_lse=False,
     *,
     out=None,
-    fa_version: int = DEFAULT_FA_VERSION,
+    # FA3 Only
+    scheduler_metadata=None,
     q_descale=None,
     k_descale=None,
     v_descale=None,
+    # Version selector
+    fa_version: int = DEFAULT_FA_VERSION,
 ):
     """
     If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
@@ -355,6 +409,12 @@ def flash_attn_with_kvcache(
     block_table = maybe_contiguous(block_table)
 
     if fa_version == 2:
+        if scheduler_metadata is not None and q_descale is not None \
+                and k_descale is not None and v_descale is not None:
+            raise NotImplementedError(
+                "FA2 does not support scheduler_metadata, q_descale, "
+                "k_descale, v_descale"
+            )
         out, softmax_lse = torch.ops._vllm_fa2_C.fwd_kvcache(
             q, k_cache, v_cache,
             k, v, # k_new, v_new
@@ -393,9 +453,9 @@ def flash_attn_with_kvcache(
             softmax_scale,
             causal,
             window_size[0], window_size[1],
-            0, # sink_token_length
             softcap,
             rotary_interleaved, # rotary_interleaved
+            scheduler_metadata,
             num_splits, # num_splits
             None, # pack_gqa
             0, # sm_margin

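End-to-end, the intended flow (a minimal sketch modeled on the new test in tests/test_vllm_flash_attn.py; the shapes, dtypes, and fa_version=3 here are illustrative assumptions and require the FA3 extension to be built) is to compute the scheduler metadata once ahead of time and hand it to the attention call:

import torch
from vllm_flash_attn.flash_attn_interface import (
    flash_attn_with_kvcache,
    get_scheduler_metadata,
)

torch.set_default_device("cuda")

# Illustrative decode-style setup: 4 sequences, paged KV cache of 16-token blocks.
num_seqs, num_heads_q, num_heads_kv, head_size = 4, 16, 2, 128
num_blocks, block_size, max_kv_len = 128, 16, 1024

query = torch.randn(num_seqs, 1, num_heads_q, head_size, dtype=torch.bfloat16)
key_cache = torch.randn(num_blocks, block_size, num_heads_kv, head_size,
                        dtype=torch.bfloat16)
value_cache = torch.randn_like(key_cache)
cache_seqlens = torch.randint(1, max_kv_len, (num_seqs,), dtype=torch.int32)
block_tables = torch.randint(0, num_blocks,
                             (num_seqs, max_kv_len // block_size),
                             dtype=torch.int32)

# Plan the FA3 split/schedule ahead of time, then reuse it in the kernel call.
scheduler_metadata = get_scheduler_metadata(
    batch_size=num_seqs,
    max_seqlen_q=1,
    max_seqlen_k=max_kv_len,
    num_heads_q=num_heads_q,
    num_heads_kv=num_heads_kv,
    headdim=head_size,
    cache_seqlens=cache_seqlens,
    qkv_dtype=torch.bfloat16,
    causal=True,
)

output = flash_attn_with_kvcache(
    query,
    key_cache,
    value_cache,
    causal=True,
    block_table=block_tables,
    cache_seqlens=cache_seqlens,
    scheduler_metadata=scheduler_metadata,
    fa_version=3,
)  # output: (num_seqs, 1, num_heads_q, head_size)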