_TORCH_LESS_EQUAL_2_6

Borda · Borda · commit 84cae4785128 · 2025-03-11T12:48:54.000+01:00
diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py
@@ -34,6 +34,7 @@
 _TORCH_EQUAL_2_4_0 = compare_version("torch", operator.eq, "2.4.0")
 _TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0")
 _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", operator.ge, "2.4.1")
+_TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")
 
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
 
diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py
@@ -23,6 +23,7 @@
 from torch.nn.parallel.distributed import DistributedDataParallel
 
 from lightning.fabric import Fabric
+from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6
 from tests_fabric.helpers.runif import RunIf
 from tests_fabric.strategies.test_single_device import _run_test_clip_gradients
 from tests_fabric.test_fabric import BoringModel
@@ -84,16 +85,18 @@ def test_reapply_compile():
     fabric.launch()
 
     model = BoringModel()
-    # compile_kwargs = {"mode": "reduce-overhead"}
-    compiled_model = torch.compile(model)  # , **compile_kwargs
+    # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError:
+    # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run.
+    compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {}
+    compiled_model = torch.compile(model, **compile_kwargs)
     torch.compile.reset_mock()
 
     fabric_model = fabric.setup(compiled_model, _reapply_compile=True)
 
     assert isinstance(fabric_model._forward_module, OptimizedModule)
     assert isinstance(fabric_model._forward_module._orig_mod, DistributedDataParallel)
     # Assert we called compile again with the same arguments, but on the DDP-wrapped module
-    torch.compile.assert_called_with(fabric_model._forward_module._orig_mod)  # , **compile_kwargs
+    torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs)
 
     assert fabric_model._original_module == model
     assert fabric_model._forward_module._orig_mod.module == model
diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py
@@ -29,6 +29,7 @@
 from lightning.fabric import Fabric
 from lightning.fabric.plugins import FSDPPrecision
 from lightning.fabric.strategies import FSDPStrategy
+from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6
 from lightning.fabric.utilities.load import _load_distributed_checkpoint
 from lightning.fabric.wrappers import _FabricOptimizer
 from tests_fabric.helpers.datasets import RandomDataset
@@ -411,8 +412,10 @@ def test_reapply_compile():
     fabric.launch()
 
     model = BoringModel()
-    # compile_kwargs = {"mode": "reduce-overhead"}
-    compiled_model = torch.compile(model)  # , **compile_kwargs
+    # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError:
+    # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run.
+    compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {}
+    compiled_model = torch.compile(model, **compile_kwargs)
     torch.compile.reset_mock()
 
     fabric_model = fabric.setup(compiled_model, _reapply_compile=True)
@@ -421,7 +424,7 @@ def test_reapply_compile():
     assert isinstance(fabric_model._forward_module._orig_mod, FullyShardedDataParallel)
 
     # Assert we called compile again with the same arguments, but on the FSDP-wrapped module
-    torch.compile.assert_called_with(fabric_model._forward_module._orig_mod)  # , **compile_kwargs
+    torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs)
 
     assert fabric_model._original_module == model
     assert fabric_model._forward_module._orig_mod.module == model