Skip to content

Commit 45a341a

Browse files
committed
Merge branch 'temp-branch'
2 parents b8ff197 + 7551202 commit 45a341a

File tree

83 files changed

+910
-352
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

83 files changed

+910
-352
lines changed

.github/workflows/publish.yml

+8-31
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ jobs:
4343
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
4444
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
4545
os: [ubuntu-20.04]
46-
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
47-
torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.2', '2.3.0', '2.4.0.dev20240407']
48-
cuda-version: ['11.8.0', '12.2.2']
46+
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
47+
torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0.dev20240514']
48+
cuda-version: ['11.8.0', '12.3.2']
4949
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
5050
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
5151
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
@@ -54,35 +54,13 @@ jobs:
5454
exclude:
5555
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
5656
# Pytorch < 2.2 does not support Python 3.12
57-
- torch-version: '1.12.1'
58-
python-version: '3.12'
59-
- torch-version: '1.13.1'
60-
python-version: '3.12'
6157
- torch-version: '2.0.1'
6258
python-version: '3.12'
6359
- torch-version: '2.1.2'
6460
python-version: '3.12'
65-
# Pytorch <= 1.12 does not support Python 3.11
66-
- torch-version: '1.12.1'
67-
python-version: '3.11'
68-
# Pytorch >= 2.0 only supports Python >= 3.8
69-
- torch-version: '2.0.1'
70-
python-version: '3.7'
71-
- torch-version: '2.1.2'
72-
python-version: '3.7'
73-
- torch-version: '2.2.2'
74-
python-version: '3.7'
75-
- torch-version: '2.3.0'
76-
python-version: '3.7'
77-
- torch-version: '2.4.0.dev20240407'
78-
python-version: '3.7'
7961
# Pytorch <= 2.0 only supports CUDA <= 11.8
80-
- torch-version: '1.12.1'
81-
cuda-version: '12.2.2'
82-
- torch-version: '1.13.1'
83-
cuda-version: '12.2.2'
8462
- torch-version: '2.0.1'
85-
cuda-version: '12.2.2'
63+
cuda-version: '12.3.2'
8664

8765
steps:
8866
- name: Checkout
@@ -97,7 +75,6 @@ jobs:
9775
run: |
9876
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
9977
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
100-
echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
10178
10279
- name: Free up disk space
10380
if: ${{ runner.os == 'Linux' }}
@@ -141,8 +118,8 @@ jobs:
141118
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
142119
# This code is ugly, maybe there's a better way to do this.
143120
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
144-
minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
145-
maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
121+
minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
122+
maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
146123
print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
147124
)
148125
if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
@@ -168,8 +145,8 @@ jobs:
168145
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
169146
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
170147
# Limit MAX_JOBS otherwise the github runner goes OOM
171-
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.2 goes OOM
172-
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
148+
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.3 goes OOM
149+
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
173150
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
174151
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
175152
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}

README.md

+5
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,11 @@ Implement deterministic backward pass. Thanks to engineers from [Meituan](www.me
318318
Support paged KV cache (i.e., [PagedAttention](https://arxiv.org/abs/2309.06180)).
319319
Thanks to @beginlner for this contribution.
320320

321+
### 2.6: Softcapping.
322+
323+
Support attention with softcapping, as used in Gemma-2 and Grok models.
324+
Thanks to @Narsil for this contribution.
325+
321326
## Performance
322327

323328
We present expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).

0 commit comments

Comments (0)