README.md (+50, -4)
@@ -59,8 +59,11 @@ To run the test:
```
export PYTHONPATH=$PWD
pytest -q -s test_flash_attn.py
```
Once the package is installed, you can import it as follows:
```python
import flash_attn_interface
# q, k, v: (batch, seqlen, nheads, headdim) fp16/bf16 tensors on the GPU
flash_attn_interface.flash_attn_func(q, k, v)
```
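For a fuller picture, here is a hedged end-to-end sketch; the shapes, the bf16 dtype, and the `causal=True` flag are illustrative choices, not requirements of the interface:

```python
import torch
import flash_attn_interface

# Illustrative shapes: batch 2, sequence length 512, 8 heads, head dim 128.
q = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16)

result = flash_attn_interface.flash_attn_func(q, k, v, causal=True)
# Depending on the release, the call returns the output tensor directly or a tuple
# whose first element is the output; the attention output has the same shape as q.
out = result[0] if isinstance(result, tuple) else result
```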
## Installation and features
**Requirements:**
@@ -112,7 +115,7 @@ FlashAttention-2 with CUDA currently supports:
3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.
### AMD ROCm Support
The ROCm version has two backends: [composable_kernel](https://github.com/ROCm/composable_kernel) (CK), which is the default, and a [Triton](https://github.com/triton-lang/triton) backend. Both provide an implementation of FlashAttention-2.
**Requirements:**
- ROCm 6.0 and above.
@@ -121,11 +124,54 @@ We recommend the
[Pytorch](https://hub.docker.com/r/rocm/pytorch)
container from ROCm, which has all the required tools to install FlashAttention.

#### Composable Kernel Backend
The FlashAttention-2 ROCm CK backend currently supports (a usage sketch follows this list):
1. MI200 or MI300 GPUs.
2. Datatypes fp16 and bf16.
3. Forward pass head dimensions up to 256; backward pass head dimensions up to 128.
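
As a hedged sketch of a call that stays within these limits (head dim 128 so both Fwd and Bwd are covered; the shapes and fp16 dtype are illustrative), using the standard `flash_attn` API:

```python
import torch
from flash_attn import flash_attn_func

# On ROCm builds of PyTorch the GPU is still exposed under the "cuda" device name.
# Head dim 128 keeps both the forward and the backward pass within the CK limits above.
q = torch.randn(4, 1024, 16, 128, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(4, 1024, 16, 128, device="cuda", dtype=torch.float16, requires_grad=True)
v = torch.randn(4, 1024, 16, 128, device="cuda", dtype=torch.float16, requires_grad=True)

out = flash_attn_func(q, k, v, causal=True)  # forward
out.sum().backward()                         # backward
```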

#### Triton Backend
The Triton implementation of [Flash Attention v2](https://tridao.me/publications/flash2/flash2.pdf) is currently a work in progress.

It supports AMD's CDNA (MI200, MI300) and RDNA GPUs using the fp16, bf16, and fp32 datatypes.

These features are supported in both Fwd and Bwd (a varlen sketch follows these lists):
1) Fwd and Bwd with causal masking
2) Variable sequence lengths
3) Arbitrary Q and KV sequence lengths
4) Arbitrary head sizes

These features are currently supported in Fwd only; Bwd support will be added soon:
1) Multi-query and grouped-query attention
2) ALiBi and matrix bias

These features are in development:
1) Paged Attention
2) Sliding Window
3) Rotary embeddings
4) Dropout
5) Performance Improvements
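
As a hedged sketch of the first list above (causal masking with variable, packed sequence lengths), using the standard varlen entry point of the `flash_attn` package; the sequence lengths, head count, and head dim are illustrative assumptions:

```python
import torch
from flash_attn import flash_attn_varlen_func

# Two packed sequences of (illustrative) lengths 100 and 156, 8 heads, head dim 64.
# The tokens of all sequences are concatenated along the first dimension.
total_tokens = 100 + 156
q = torch.randn(total_tokens, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn(total_tokens, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(total_tokens, 8, 64, device="cuda", dtype=torch.float16)

# Cumulative sequence-length offsets of the packed batch (int32): [0, 100, 256].
cu_seqlens = torch.tensor([0, 100, 256], device="cuda", dtype=torch.int32)

out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=156, max_seqlen_k=156,
    causal=True,  # causal masking is supported in both Fwd and Bwd
)
```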
#### Getting Started
To get started with the Triton backend for AMD, follow the steps below.
First, install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/3ca2f498e98ed7249b82722587c511a5610e00c4).
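
For example, one assumed way to build that commit from source (the exact workflow may differ in your environment):

```bash
# Assumed build-from-source workflow for the pinned Triton commit.
git clone https://github.com/triton-lang/triton
cd triton
git checkout 3ca2f498e98ed7249b82722587c511a5610e00c4
cd python
pip install .
```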
0 commit comments