
Commit 90c3ed1

move cast to before softmax in attention (tinygrad#9213)
* move cast to before softmax in attention

  Saves some memory because exp (which is used for backward) is done in half. Training BERT seems fine and can now fit BS=78 (up from 66).

* test
1 parent f0b24d2 commit 90c3ed1
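
The memory claim can be sanity-checked with a rough back-of-the-envelope calculation. This is a hedged sketch, not taken from the commit: the shapes below (BERT-large with 24 layers, 16 attention heads, sequence length 512) and the assumption that every layer's softmax/exp output was previously kept in float32 for the backward pass are assumptions, used only to show the scale of halving that buffer.

# Hedged estimate with assumed BERT-large shapes; not numbers from this commit.
layers, heads, seqlen, bs = 24, 16, 512, 78             # assumed dimensions
logit_elems = bs * heads * seqlen * seqlen               # attention-logit elements per layer
saved_per_layer = logit_elems * (4 - 2)                  # bytes saved going float32 -> half
print(f"~{layers * saved_per_layer / 2**30:.1f} GiB")    # ~14.6 GiB under these assumptions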

File tree

  test/unit/test_attention.py
  tinygrad/tensor.py

2 files changed: 21 additions and 1 deletion

test/unit/test_attention.py (20 additions, 0 deletions)

@@ -0,0 +1,20 @@
+import unittest
+from tinygrad import Tensor, dtypes
+
+# TODO: test_scheduler, but just in uint
+class TestAttention(unittest.TestCase):
+  def test_half_qkv_buffers(self):
+    BS, seqlen, dim = 10, 4, 100
+    q = Tensor.ones(BS, seqlen, dim, dtype=dtypes.half).contiguous().realize()
+    k = Tensor.ones(BS, seqlen, dim, dtype=dtypes.half).contiguous().realize()
+    v = Tensor.ones(BS, seqlen, dim, dtype=dtypes.half).contiguous().realize()
+    attn = q.scaled_dot_product_attention(k, v)
+    sched = attn.schedule()
+    # attention has 5 kernels now
+    self.assertEqual(len(sched), 5)
+    softmax_inputs = sched[1:4]
+    for si in softmax_inputs:
+      assert all(b.dtype == dtypes.half for b in si.bufs), f"non half {si.bufs=}"
+
+if __name__ == '__main__':
+  unittest.main()

tinygrad/tensor.py (1 addition, 1 deletion)

@@ -3603,7 +3603,7 @@ def scaled_dot_product_attention(self, key:Tensor, value:Tensor, attn_mask:Tensor
     if attn_mask is not None:
       if attn_mask.dtype == dtypes.bool: attn_mask = attn_mask.where(0, -float("inf"))
       qk = qk + attn_mask
-    return qk.softmax(-1).cast(self.dtype).dropout(dropout_p) @ value
+    return qk.cast(self.dtype).softmax(-1).dropout(dropout_p) @ value
 
   def _do_reduction(self, reduction:ReductionStr="mean") -> Tensor:
     if reduction not in get_args(ReductionStr): raise ValueError(f"{reduction=} must be one of {get_args(ReductionStr)}")
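
For intuition, here is a minimal sketch of what the one-line reorder above changes when q, k and v are half. This is not the real tinygrad internals: masking, dropout and the exact scaling are stripped out, the function names are made up, and the explicit .cast(dtypes.float32) merely stands in for the float32 accumulation the real qk matmul uses.

from tinygrad import Tensor, dtypes

def attn_before_fix(q: Tensor, k: Tensor, v: Tensor) -> Tensor:
  qk = (q @ k.transpose(-2, -1)).cast(dtypes.float32) * (q.shape[-1] ** -0.5)
  # old order: softmax (whose exp output is saved for backward) runs in float32
  return qk.softmax(-1).cast(q.dtype) @ v

def attn_after_fix(q: Tensor, k: Tensor, v: Tensor) -> Tensor:
  qk = (q @ k.transpose(-2, -1)).cast(dtypes.float32) * (q.shape[-1] ** -0.5)
  # new order: cast first, so softmax and its exp intermediate stay in half
  return qk.cast(q.dtype).softmax(-1) @ v

if __name__ == "__main__":
  q, k, v = [Tensor.ones(2, 4, 8, dtype=dtypes.half) for _ in range(3)]
  print(attn_before_fix(q, k, v).dtype, attn_after_fix(q, k, v).dtype)  # both half

Both variants return the same output dtype; the difference is only in the dtype of the intermediate softmax/exp buffers, which is what the test above inspects via the schedule.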
