
Commit 0c8577d

Merge remote-tracking branch 'Dao-AILab/main'
2 parents 007f06e + 7153673

117 files changed (+12917 -4057 lines)

.github/workflows/publish.yml (+11 -12)

@@ -44,8 +44,8 @@ jobs:
 # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
 os: [ubuntu-20.04]
 python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
-torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0']
-cuda-version: ['11.8.0', '12.3.2']
+torch-version: ['2.1.2', '2.2.2', '2.3.1', '2.4.0', '2.5.1']
+cuda-version: ['11.8.0', '12.4.1']
 # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
 # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
 # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)

@@ -54,13 +54,11 @@ jobs:
 exclude:
 # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
 # Pytorch < 2.2 does not support Python 3.12
-- torch-version: '2.0.1'
-  python-version: '3.12'
 - torch-version: '2.1.2'
   python-version: '3.12'
-# Pytorch <= 2.0 only supports CUDA <= 11.8
-- torch-version: '2.0.1'
-  cuda-version: '12.3.2'
+# Pytorch >= 2.5 does not support Python 3.8
+- torch-version: '2.5.1'
+  python-version: '3.8'

 steps:
 - name: Checkout

@@ -75,6 +73,7 @@ jobs:
 run: |
   echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
   echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
+  echo "WHEEL_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV

 - name: Free up disk space
   if: ${{ runner.os == 'Linux' }}

@@ -93,7 +92,7 @@ jobs:

 - name: Install CUDA ${{ matrix.cuda-version }}
   if: ${{ matrix.cuda-version != 'cpu' }}
-  uses: Jimver/cuda-toolkit@v0.2.14
+  uses: Jimver/cuda-toolkit@v0.2.18
   id: cuda-toolkit
   with:
     cuda: ${{ matrix.cuda-version }}

@@ -118,9 +117,9 @@ jobs:
 # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
 # This code is ugly, maybe there's a better way to do this.
 export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
-maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
-print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
+minv = {'2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118}[env['MATRIX_TORCH_VERSION']]; \
+maxv = {'2.1': 121, '2.2': 121, '2.3': 121, '2.4': 124, '2.5': 124}[env['MATRIX_TORCH_VERSION']]; \
+print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
 )
 if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
 pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}

@@ -147,7 +146,7 @@ jobs:
 # Limit MAX_JOBS otherwise the github runner goes OOM
 # CUDA 11.8 can compile with 2 jobs, but CUDA 12.3 goes OOM
 MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
-tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
+tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
 wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
 ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
 echo "wheel_name=${wheel_name}" >> $GITHUB_ENV


README.md (+40 -42)

@@ -43,15 +43,12 @@ This is a beta release for testing / benchmarking before we integrate that with
 the rest of the repo.

 Currently released:
-- FP16 forward and backward
-
-Coming soon in the next couple of days / next week:
-- BF16
-- Variable length (FP16, BF16)
-- FP8 forward.
+- FP16 / BF16 forward and backward, FP8 forward

 Requirements: H100 / H800 GPU, CUDA >= 12.3.

+For now, we highly recommend CUDA 12.3 for best performance.
+
 To install:
 ```sh
 cd hopper

@@ -66,26 +63,21 @@ pytest -q -s test_flash_attn.py


 ## Installation and features
-
-Requirements:
-- CUDA 11.6 and above.
+**Requirements:**
+- CUDA toolkit or ROCm toolkit
 - PyTorch 1.12 and above.
+- `packaging` Python package (`pip install packaging`)
+- `ninja` Python package (`pip install ninja`) *
 - Linux. Might work for Windows starting v2.3.2 (we've seen a few positive [reports](https://github.com/Dao-AILab/flash-attention/issues/595)) but Windows compilation still requires more testing. If you have ideas on how to set up prebuilt CUDA wheels for Windows, please reach out via Github issue.

-We recommend the
-[Pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
-container from Nvidia, which has all the required tools to install FlashAttention.
-
-To install:
-1. Make sure that PyTorch is installed.
-2. Make sure that `packaging` is installed (`pip install packaging`)
-3. Make sure that `ninja` is installed and that it works correctly (e.g. `ninja
+\* Make sure that `ninja` is installed and that it works correctly (e.g. `ninja
 --version` then `echo $?` should return exit code 0). If not (sometimes `ninja
 --version` then `echo $?` returns a nonzero exit code), uninstall then reinstall
 `ninja` (`pip uninstall -y ninja && pip install ninja`). Without `ninja`,
 compiling can take a very long time (2h) since it does not use multiple CPU
-cores. With `ninja` compiling takes 3-5 minutes on a 64-core machine.
-4. Then:
+cores. With `ninja` compiling takes 3-5 minutes on a 64-core machine using CUDA toolkit.
+
+**To install:**
 ```sh
 pip install flash-attn --no-build-isolation
 ```

@@ -102,15 +94,38 @@ variable `MAX_JOBS`:
 MAX_JOBS=4 pip install flash-attn --no-build-isolation
 ```

-Interface: `src/flash_attention_interface.py`
+**Interface:** `src/flash_attention_interface.py`
+
+### NVIDIA CUDA Support
+**Requirements:**
+- CUDA 11.7 and above.

-FlashAttention-2 currently supports:
+We recommend the
+[Pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
+container from Nvidia, which has all the required tools to install FlashAttention.
+
+FlashAttention-2 with CUDA currently supports:
 1. Ampere, Ada, or Hopper GPUs (e.g., A100, RTX 3090, RTX 4090, H100). Support for Turing
 GPUs (T4, RTX 2080) is coming soon, please use FlashAttention 1.x for Turing
 GPUs for now.
 2. Datatype fp16 and bf16 (bf16 requires Ampere, Ada, or Hopper GPUs).
 3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.

+### AMD ROCm Support
+ROCm version uses [composable_kernel](https://github.com/ROCm/composable_kernel) as the backend. It provides the implementation of FlashAttention-2.
+
+**Requirements:**
+- ROCm 6.0 and above.
+
+We recommend the
+[Pytorch](https://hub.docker.com/r/rocm/pytorch)
+container from ROCm, which has all the required tools to install FlashAttention.
+
+FlashAttention-2 with ROCm currently supports:
+1. MI200 or MI300 GPUs.
+2. Datatype fp16 and bf16
+3. Forward's head dimensions up to 256. Backward head dimensions up to 128.
+

 ## How to use FlashAttention

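For context on the interface the reorganized README points to, here is a minimal usage sketch (not part of this commit) of the `flash_attn_func` entry point from the `flash_attn` package, assuming the (batch, seqlen, nheads, headdim) tensor layout and the fp16/bf16 dtypes from the support list above:

```python
# Minimal usage sketch (illustrative, not from this commit).
import torch
from flash_attn import flash_attn_func

batch, seqlen, nheads, headdim = 2, 1024, 16, 64
q = torch.randn(batch, seqlen, nheads, headdim, device='cuda', dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flash_attn_func(q, k, v, causal=True)  # causal self-attention
print(out.shape)  # torch.Size([2, 1024, 16, 64])
```

Dropout and the softcapping mentioned in the changelog below are exposed as additional keyword arguments.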

@@ -358,6 +373,10 @@ Thanks to @beginlner for this contribution.
 Support attention with softcapping, as used in Gemma-2 and Grok models.
 Thanks to @Narsil and @lucidrains for this contribution.

+### 2.7: Compatibility with torch compile
+
+Thanks to @ani300 for this contribution.
+
 ## Performance

 We present expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).
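
The new 2.7 changelog entry is a one-liner; as a hedged illustration (my reading of the compatibility note, not code from this commit), it should let a `torch.compile`-d function call `flash_attn_func` directly:

```python
# Hedged sketch of torch.compile compatibility (assumption, not from this commit).
import torch
from flash_attn import flash_attn_func

@torch.compile
def attention(q, k, v):
    # As of 2.7 this call should no longer need to fall back to eager mode.
    return flash_attn_func(q, k, v, causal=True)

q = torch.randn(2, 1024, 16, 64, device='cuda', dtype=torch.bfloat16)
out = attention(q, q, q)
```
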
@@ -437,27 +456,6 @@ This new release of FlashAttention-2 has been tested on several GPT-style
 models, mostly on A100 GPUs.

 If you encounter bugs, please open a GitHub Issue!
-## AMD GPU/ROCm Support
-ROCm version use [composable_kernel](https://github.com/ROCm/composable_kernel) as backend. It provides the implementation of FlashAttention-2.
-
-## Installation and features
-Requirements:
-- ROCm 6.0+
-- PyTorch 1.12.1+
-
-We recommend the
-[Pytorch](https://hub.docker.com/r/rocm/pytorch)
-container from ROCm, which has all the required tools to install FlashAttention.
-
-To compile from source:
-```sh
-python setup.py install
-```
-
-FlashAttention-2 on ROCm currently supports:
-1. MI200 or MI300 GPUs.
-2. Datatype fp16 and bf16
-3. Forward's head dimensions up to 256. Backward head dimensions up to 128.

 ## Tests
 To run the tests:

benchmarks/benchmark_gemm.py (new file, +43)

```python
import time
import torch
import torch.utils.benchmark as benchmark

from triton.testing import do_bench


def benchmark_forward(fn, *inputs, repeats=10, desc='', verbose=True, **kwinputs):
    """Use Pytorch Benchmark on the forward pass of an arbitrary function."""
    if verbose:
        print(desc, '- Forward pass')
    t = benchmark.Timer(
        stmt='fn(*inputs, **kwinputs)',
        globals={'fn': fn, 'inputs': inputs, 'kwinputs': kwinputs},
        num_threads=torch.get_num_threads(),
    )
    m = t.timeit(repeats)
    if verbose:
        print(m)
    return t, m


torch.manual_seed(0)
repeats = 30
dtype = torch.float16
device = 'cuda'
verbose = False
m, n = 8192, 8192

tflops_matmul = {}
tflops_matmul1 = {}
for k in [512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192]:
    a = torch.randn(m, k, device=device, dtype=dtype)
    b = torch.randn(n, k, device=device, dtype=dtype).transpose(-1, -2)
    nFLOPS_matmul = 2 * m * n * k
    time.sleep(2)  # to reduce power throttling
    timing = benchmark_forward(torch.matmul, a, b, desc='cuBLAS', verbose=verbose, repeats=repeats)[1]
    tflops_matmul[k] = nFLOPS_matmul / timing.mean * 1e-12
    print(f'[torch.utils.benchmark] cuBLAS, {m = }, {n = }, {k = }: {timing.mean * 1e3:.3f}ms, {tflops_matmul[k]:.1f} TFLOPS')
    time.sleep(2)  # to reduce power throttling
    ms = do_bench(lambda: torch.matmul(a, b), warmup=10, rep=repeats)
    tflops_matmul1[k] = nFLOPS_matmul / ms * 1e-9
    print(f'[triton.test.do_bench] cuBLAS, {m = }, {n = }, {k = }: {ms:.3f}ms, {tflops_matmul1[k]:.1f} TFLOPS')
```
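
Usage note (not part of the commit itself): the script is self-contained, so on a machine with a CUDA GPU, PyTorch, and Triton installed it can be run directly as `python benchmarks/benchmark_gemm.py`. For each `k` it times the same cuBLAS GEMM twice, once with `torch.utils.benchmark` and once with Triton's `do_bench`, and reports TFLOPS computed from the usual `2 * m * n * k` FLOP count (one multiply and one add per accumulated product).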

csrc/composable_kernel

Submodule composable_kernel updated 921 files
