Skip to content

Commit 45a341a

Browse files
committed
Merge branch 'temp-branch'
2 parents b8ff197 + 7551202 commit 45a341a

File tree

83 files changed

+910
-352
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

83 files changed

+910
-352
lines changed

.github/workflows/publish.yml

+8-31
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ jobs:
4343
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
4444
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
4545
os: [ubuntu-20.04]
46-
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
47-
torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.2', '2.3.0', '2.4.0.dev20240407']
48-
cuda-version: ['11.8.0', '12.2.2']
46+
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
47+
torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0.dev20240514']
48+
cuda-version: ['11.8.0', '12.3.2']
4949
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
5050
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
5151
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
@@ -54,35 +54,13 @@ jobs:
5454
exclude:
5555
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
5656
# Pytorch < 2.2 does not support Python 3.12
57-
- torch-version: '1.12.1'
58-
python-version: '3.12'
59-
- torch-version: '1.13.1'
60-
python-version: '3.12'
6157
- torch-version: '2.0.1'
6258
python-version: '3.12'
6359
- torch-version: '2.1.2'
6460
python-version: '3.12'
65-
# Pytorch <= 1.12 does not support Python 3.11
66-
- torch-version: '1.12.1'
67-
python-version: '3.11'
68-
# Pytorch >= 2.0 only supports Python >= 3.8
69-
- torch-version: '2.0.1'
70-
python-version: '3.7'
71-
- torch-version: '2.1.2'
72-
python-version: '3.7'
73-
- torch-version: '2.2.2'
74-
python-version: '3.7'
75-
- torch-version: '2.3.0'
76-
python-version: '3.7'
77-
- torch-version: '2.4.0.dev20240407'
78-
python-version: '3.7'
7961
# Pytorch <= 2.0 only supports CUDA <= 11.8
80-
- torch-version: '1.12.1'
81-
cuda-version: '12.2.2'
82-
- torch-version: '1.13.1'
83-
cuda-version: '12.2.2'
8462
- torch-version: '2.0.1'
85-
cuda-version: '12.2.2'
63+
cuda-version: '12.3.2'
8664

8765
steps:
8866
- name: Checkout
@@ -97,7 +75,6 @@ jobs:
9775
run: |
9876
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
9977
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
100-
echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
10178
10279
- name: Free up disk space
10380
if: ${{ runner.os == 'Linux' }}
@@ -141,8 +118,8 @@ jobs:
141118
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
142119
# This code is ugly, maybe there's a better way to do this.
143120
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
144-
minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
145-
maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
121+
minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
122+
maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
146123
print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
147124
)
148125
if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
@@ -168,8 +145,8 @@ jobs:
168145
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
169146
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
170147
# Limit MAX_JOBS otherwise the github runner goes OOM
171-
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.2 goes OOM
172-
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
148+
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.3 goes OOM
149+
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
173150
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
174151
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
175152
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}

README.md

+5
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,11 @@ Implement deterministic backward pass. Thanks to engineers from [Meituan](www.me
318318
Support paged KV cache (i.e., [PagedAttention](https://arxiv.org/abs/2309.06180)).
319319
Thanks to @beginlner for this contribution.
320320

321+
### 2.6: Softcapping.
322+
323+
Support attention with softcapping, as used in Gemma-2 and Grok models.
324+
Thanks to @Narsil for this contribution.
325+
321326
## Performance
322327

323328
We present expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).

0 commit comments

Comments (0)