
Commit 7a7a6bd

update Gemma attention for TPU (#2130)
* update Gemma attention for TPU
* add default fallback for GPU and CPU
* add fallback option if not running with JAX and TPU
* address review comments
* check input signature
* remove checking q length
* code reformat
* handle case when soft cap support is not needed
* fix format
* add tests for FA calls
* fix test
* update tests
* fix code format
* address review comments
* Update requirements-jax-cuda.txt
* Update gemma_causal_lm_test.py
* Update requirements-jax-cuda.txt
1 parent f06ad0f commit 7a7a6bd

File tree

3 files changed: +69 -12 lines changed


keras_hub/src/models/gemma/gemma_attention.py

+23-12
```diff
@@ -1,10 +1,13 @@
+import inspect
+
 import keras
 import numpy as np
 from keras import ops

 from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
 from keras_hub.src.utils.keras_utils import clone_initializer
 from keras_hub.src.utils.keras_utils import has_flash_attention_support
+from keras_hub.src.utils.keras_utils import running_on_tpu


 class CachedGemmaAttention(keras.layers.Layer):
@@ -103,6 +106,18 @@ def _apply_rope(self, x, start_index):
         )
         return x

+    def _can_use_flash_attention(self):
+        if not has_flash_attention_support():
+            return False
+        if self.dropout > 0.0:
+            return False
+        if self.logit_soft_cap is None:
+            return True
+        sig = inspect.signature(ops.dot_product_attention)
+        # We can currently only run soft capped attention for keras >= 3.10
+        # and only on TPU.
+        return running_on_tpu() and "attn_logits_soft_cap" in sig.parameters
+
     def _compute_attention(
         self,
         q,
@@ -118,27 +133,23 @@ def _compute_attention(
         query_normalization = 1 / np.sqrt(
             self.hidden_dim // self.num_query_heads
         )
-        use_dot_product_attention = not (
-            self.dropout > 0.0 or (len(q.shape) != 4)
-        )
-        if has_flash_attention_support() and use_dot_product_attention:
-            if self.dropout > 0.0:
-                raise ValueError(
-                    "Flash attention does not support dropout. "
-                    "Please set `dropout` to 0.0."
-                )
+        if self._can_use_flash_attention():
             if attention_mask is not None:
                 attention_mask = ops.expand_dims(attention_mask, axis=1)
                 attention_mask = ops.cast(attention_mask, dtype="bool")
-
-            attention_output = ops.dot_product_attention(
+            # Only pass soft cap if needed as not all keras versions support.
+            if self.logit_soft_cap:
+                kwargs = {"attn_logits_soft_cap": self.logit_soft_cap}
+            else:
+                kwargs = {}
+            return ops.dot_product_attention(
                 query=q,
                 key=k,
                 value=v,
                 mask=attention_mask,
                 scale=query_normalization,
+                **kwargs,
             )
-            return attention_output

         q *= ops.cast(query_normalization, dtype=q.dtype)
         q_shape = ops.shape(q)
```
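For context, the key piece of the new `_can_use_flash_attention` helper is a signature check: the `attn_logits_soft_cap` argument is only forwarded when the installed Keras exposes it on `ops.dot_product_attention`. Below is a minimal, hedged sketch of that check in isolation; the `soft_cap_kwargs` helper name is illustrative and not part of keras_hub.

```python
# Minimal sketch (assumes Keras 3 is installed). `soft_cap_kwargs` is an
# illustrative helper, not a keras_hub API.
import inspect

from keras import ops


def soft_cap_kwargs(logit_soft_cap):
    """Build the optional soft-cap kwarg for `ops.dot_product_attention`.

    Returns an empty dict when no soft cap is requested or when the
    installed Keras version does not accept `attn_logits_soft_cap`.
    """
    if logit_soft_cap is None:
        return {}
    sig = inspect.signature(ops.dot_product_attention)
    if "attn_logits_soft_cap" in sig.parameters:
        return {"attn_logits_soft_cap": logit_soft_cap}
    return {}
```

Inspecting the signature avoids a hard dependency on a newer Keras release while still letting the TPU path apply Gemma's logit soft cap when the argument is available.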

keras_hub/src/models/gemma/gemma_causal_lm_test.py

+14
```diff
@@ -12,6 +12,8 @@
 )
 from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
 from keras_hub.src.tests.test_case import TestCase
+from keras_hub.src.utils.keras_utils import has_flash_attention_support
+from keras_hub.src.utils.keras_utils import running_on_gpu


 class GemmaCausalLMTest(TestCase):
@@ -95,6 +97,18 @@ def test_generate(self):
             prompt_ids["padding_mask"][:, :4],
         )

+    def test_flash_attention_call(self):
+        if keras.config.backend() != "jax" or not has_flash_attention_support():
+            self.skipTest("`flash_attention` testing requires the Jax backend.")
+
+        with patch("keras.src.backend.nn.dot_product_attention") as mock_func:
+            causal_lm = GemmaCausalLM(**self.init_kwargs)
+            causal_lm.generate("the quick brown fox")
+            if running_on_gpu():
+                mock_func.assert_called()
+            else:
+                mock_func.assert_not_called()
+
     def test_generate_with_bfloat16(self):
         original_floatx = keras.config.floatx()
         keras.config.set_floatx("float16")
```
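The new test patches the backend-level `dot_product_attention` op to observe whether the fused attention path is taken during `generate`. A hedged sketch of that mocking pattern in isolation follows; `was_fused_attention_called` is an illustrative name, and the patch target is the same string used in the test above.

```python
# Sketch of the mocking pattern used by `test_flash_attention_call`.
# `was_fused_attention_called` is illustrative, not part of keras_hub.
from unittest.mock import patch


def was_fused_attention_called(fn):
    """Run `fn` and report whether the backend attention op was invoked."""
    with patch("keras.src.backend.nn.dot_product_attention") as mock_op:
        fn()
        return mock_op.called
```

Because the mock replaces the real op, generation output inside the patch is not meaningful; the test only asserts on whether the call happened, using `running_on_gpu()` to decide between `assert_called` and `assert_not_called`.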

keras_hub/src/utils/keras_utils.py

+32
```diff
@@ -72,3 +72,35 @@ def has_flash_attention_support():
         return True
     else:
         return False
+
+
+def running_on_tpu():
+    backend = keras.config.backend()
+    if backend == "jax":
+        import jax
+
+        devices = jax.devices()
+        return any(d.platform == "tpu" for d in devices)
+    elif backend == "tensorflow":
+        import tensorflow as tf
+
+        return bool(tf.config.list_logical_devices("TPU"))
+    elif backend == "torch":
+        return False
+
+
+def running_on_gpu():
+    backend = keras.config.backend()
+    if backend == "jax":
+        import jax
+
+        devices = jax.devices()
+        return any(d.platform == "gpu" for d in devices)
+    elif backend == "tensorflow":
+        import tensorflow as tf
+
+        return bool(tf.config.list_logical_devices("GPU"))
+    elif backend == "torch":
+        import torch
+
+        return torch.cuda.is_available()
```
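The two helpers above let callers branch on the accelerator without touching backend modules directly. A hedged usage sketch follows; `choose_attention_path` is illustrative, not a keras_hub API, and it simplifies the real gating in `CachedGemmaAttention._can_use_flash_attention` (it omits the `attn_logits_soft_cap` signature check).

```python
# Illustrative use of the new device helpers. `choose_attention_path` is a
# simplified, hypothetical version of the gating added in gemma_attention.py.
from keras_hub.src.utils.keras_utils import has_flash_attention_support
from keras_hub.src.utils.keras_utils import running_on_tpu


def choose_attention_path(dropout=0.0, logit_soft_cap=None):
    """Return "fused" when fused attention is viable, otherwise "eager"."""
    if not has_flash_attention_support() or dropout > 0.0:
        return "eager"
    if logit_soft_cap is not None and not running_on_tpu():
        # Soft-capped fused attention is currently limited to TPU.
        return "eager"
    return "fused"
```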
