
Commit 265025b

Inform the user about a missing fabric.backward() call (#19447)
1 parent 6745994 commit 265025b

File tree: 6 files changed (+100 -5 lines)

src/lightning/fabric/CHANGELOG.md (+4)

@@ -22,6 +22,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - The `Fabric.rank_zero_first` context manager now uses a barrier without timeout to avoid long-running tasks to be interrupted ([#19448](https://github.com/Lightning-AI/lightning/pull/19448))
 
+
+- Fabric now raises an error if you forget to call `fabric.backward()` when it is needed by the strategy or precision selection ([#19447](https://github.com/Lightning-AI/lightning/pull/19447))
+
+
 -
 
 ### Deprecated
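
For orientation, the new behavior from the user's side looks roughly like the sketch below. This is a minimal example written for this summary, not code from the commit; per the new `test_backward_required` parametrization, a `bf16-mixed` precision selection (here on CPU) is one configuration where the check fires.

import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cpu", precision="bf16-mixed", devices=1)
fabric.launch()

model = torch.nn.Linear(2, 2)
model, optimizer = fabric.setup(model, torch.optim.SGD(model.parameters(), lr=0.1))

loss = model(torch.rand(2, 2)).sum()
# loss.backward()      # after this change: RuntimeError asking for `fabric.backward(loss)`
fabric.backward(loss)  # supported path: lets the precision plugin handle the backward call
optimizer.step()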

src/lightning/fabric/cli.py (+4 -2)

@@ -53,8 +53,10 @@ def _legacy_main() -> None:
     Raises deprecation warning and runs through fabric cli if necessary, else runs the entrypoint directly
 
     """
-    print("`lightning run model` is deprecated and will be removed in future versions."
-          " Please call `fabric run model` instead.")
+    print(
+        "`lightning run model` is deprecated and will be removed in future versions."
+        " Please call `fabric run model` instead."
+    )
     args = sys.argv[1:]
     if args and args[0] == "run" and args[1] == "model":
         _main()

src/lightning/fabric/fabric.py (+24)

@@ -142,6 +142,7 @@ def __init__(
         self._loggers = loggers if isinstance(loggers, list) else [loggers]
         self._models_setup: int = 0
         self._launched: bool = False
+        self._backward_called: bool = False
 
         self._prepare_run_method()
         if _is_using_cli():

@@ -253,6 +254,7 @@ def setup(
         if compile_kwargs is not None:
             module = _to_compiled(module, compile_kwargs)
         module = _FabricModule(module, self._precision, original_module=original_module)
+        self._require_fabric_backward(module)
 
         # Update the _DeviceDtypeModuleMixin's device parameter
         # NOTE: for sharded strategies or manual device placement, there's no single root device

@@ -317,6 +319,7 @@ def setup_module(
         if compile_kwargs is not None:
             module = _to_compiled(module, compile_kwargs)
         module = _FabricModule(module, self._precision, original_module=original_module)
+        self._require_fabric_backward(module)
 
         # Update the _DeviceDtypeModuleMixin's device parameter
         # NOTE: for sharded strategies or manual device placement, there's no single root device

@@ -445,7 +448,9 @@ def backward(self, tensor: Tensor, *args: Any, model: Optional[_FabricModule] =
             # requires to attach the current `DeepSpeedEngine` for the `_FabricOptimizer.step` call.
             self._strategy._deepspeed_engine = module
 
+        self._backward_called = True
         self._strategy.backward(tensor, module, *args, **kwargs)
+        self._backward_called = False
 
     def clip_gradients(
         self,

@@ -1090,6 +1095,25 @@ def _validate_setup_dataloaders(self, dataloaders: Sequence[DataLoader]) -> None
         if any(not isinstance(dl, DataLoader) for dl in dataloaders):
             raise TypeError("Only PyTorch DataLoader are currently supported in `setup_dataloaders`.")
 
+    def _require_fabric_backward(self, module: _FabricModule) -> None:
+        strategy_requires = is_overridden("backward", self._strategy, parent=Strategy)
+        precision_requires = any(
+            is_overridden(method, self._precision, parent=Precision)
+            for method in ("pre_backward", "backward", "post_backward")
+        )
+
+        def _backward_hook(*_: Any, **__: Any) -> None:
+            if (strategy_requires or precision_requires) and not self._backward_called:
+                raise RuntimeError(
+                    "The current strategy and precision selection requires you to call `fabric.backward(loss)`"
+                    " instead of `loss.backward()`."
+                )
+
+        if _TORCH_GREATER_EQUAL_2_0:
+            module.register_full_backward_pre_hook(_backward_hook, prepend=True)
+        else:
+            module.register_full_backward_hook(_backward_hook)
+
     @staticmethod
     def _configure_callbacks(callbacks: Optional[Union[List[Any], Any]]) -> List[Any]:
         callbacks = callbacks if callbacks is not None else []
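
The enforcement above is plain PyTorch: `setup()`/`setup_module()` attach a full backward (pre-)hook to the wrapped module, and `Fabric.backward()` toggles a flag around the strategy's backward call so the hook can tell the two entry points apart. Below is a minimal, self-contained sketch of that pattern with illustrative names (`_Guard`, `guarded_backward`); it is not Fabric's code and assumes PyTorch >= 2.0 for `register_full_backward_pre_hook`.

import torch
from torch import nn


class _Guard:
    # Mirrors the idea of Fabric._backward_called: only the sanctioned entry point sets it.
    backward_called = False


def _require_guarded_backward(module: nn.Module) -> None:
    def _hook(*_, **__):
        if not _Guard.backward_called:
            raise RuntimeError("call guarded_backward(loss) instead of loss.backward()")

    # The pre-hook runs when autograd reaches this module during any backward pass.
    module.register_full_backward_pre_hook(_hook, prepend=True)


def guarded_backward(loss: torch.Tensor) -> None:
    _Guard.backward_called = True
    try:
        loss.backward()
    finally:
        _Guard.backward_called = False


model = nn.Linear(2, 2)
_require_guarded_backward(model)
loss = model(torch.rand(2, 2)).sum()
guarded_backward(loss)  # passes; a bare loss.backward() would raise inside the hook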

tests/tests_fabric/strategies/test_fsdp_integration.py (+2 -1)

@@ -441,7 +441,8 @@ def test_reapply_compile():
 
     # Smoke-testing forward to ensure we don't get compilation errors
     for _ in range(3):
-        fabric_model(torch.randn(2, 32, device=fabric.device)).sum().backward()
+        loss = fabric_model(torch.randn(2, 32, device=fabric.device)).sum()
+        fabric.backward(loss)
 
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)

tests/tests_fabric/test_cli.py (+1)

@@ -181,6 +181,7 @@ def test_cli_through_fabric_entry_point():
     message = "Usage: fabric run model [OPTIONS] SCRIPT [SCRIPT_ARGS]"
     assert message in result.stdout or message in result.stderr
 
+
 @pytest.mark.skipif("lightning.fabric" == "lightning_fabric", reason="standalone package")
 def test_cli_through_lightning_entry_point():
     result = subprocess.run("lightning run model --help", capture_output=True, text=True, shell=True)

tests/tests_fabric/test_fabric.py (+65 -2)

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from contextlib import nullcontext
 from re import escape
 from unittest import mock
 from unittest.mock import ANY, MagicMock, Mock, PropertyMock, call

@@ -22,7 +23,6 @@
 import torch.distributed
 import torch.nn.functional
 from lightning.fabric.fabric import Fabric
-from lightning.fabric.plugins import Precision
 from lightning.fabric.strategies import (
     DataParallelStrategy,
     DDPStrategy,

@@ -34,6 +34,7 @@
 )
 from lightning.fabric.strategies.strategy import _Sharded
 from lightning.fabric.utilities.exceptions import MisconfigurationException
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
 from lightning.fabric.utilities.seed import pl_worker_init_function, seed_everything
 from lightning.fabric.utilities.warnings import PossibleUserWarning
 from lightning.fabric.wrappers import _FabricDataLoader, _FabricModule, _FabricOptimizer

@@ -611,12 +612,74 @@ def test_rank_properties():
 def test_backward():
     """Test that backward() calls into the precision plugin."""
     fabric = Fabric()
-    fabric._strategy = Mock(spec=Precision)
+    fabric._strategy = Mock(spec=Strategy)
     loss = Mock()
     fabric.backward(loss, "arg", keyword="kwarg")
     fabric._strategy.backward.assert_called_with(loss, None, "arg", keyword="kwarg")
 
 
+@pytest.mark.parametrize(("strategy", "precision", "error_expected"), [
+    ("auto", "32-true", False),
+    ("auto", "bf16-true", False),
+    ("auto", "bf16-mixed", True),
+    pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1, min_torch="2.0.0")),
+])
+@pytest.mark.parametrize("setup_method", ["setup", "setup_module"])
+@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
+def test_backward_required(_, strategy, precision, error_expected, setup_method):
+    """Test under which strategy and precision configurations the `fabric.backward()` call is required."""
+    fabric = Fabric(
+        accelerator=("cuda" if strategy == "fsdp" else "cpu"),
+        strategy=strategy,
+        precision=precision,
+        devices=1
+    )
+    fabric._launched = True
+    fabric.strategy.setup_module = lambda module: module
+
+    error_context = (
+        pytest.raises(RuntimeError, match=escape("requires you to call `fabric.backward(loss)`")) if error_expected
+        else nullcontext()
+    )
+    batch = torch.rand(2, 2)
+
+    # One model
+    model1 = nn.Linear(2, 2)
+    assert not (model1._backward_pre_hooks if _TORCH_GREATER_EQUAL_2_0 else model1._backward_hooks)
+    model1 = getattr(fabric, setup_method)(model1)
+    assert model1._backward_pre_hooks if _TORCH_GREATER_EQUAL_2_0 else model1._backward_hooks
+    loss = model1(batch).sum()
+    with error_context:
+        loss.backward()
+    loss = model1(batch).sum()
+    fabric.backward(loss)  # no error
+    assert not fabric._backward_called
+
+    # Two models chained
+    model2 = torch.nn.Linear(2, 2)
+    model2 = getattr(fabric, setup_method)(model2)
+    loss = model2(model1(batch)).sum()
+    with error_context:
+        loss.backward()
+    loss = model2(model1(batch)).sum()
+    fabric.backward(loss)  # no error
+    assert not fabric._backward_called
+
+    # Two independent models
+    loss1 = model1(batch).sum()
+    loss2 = model2(batch).sum()
+    with error_context:
+        loss1.backward()
+    with error_context:
+        loss2.backward()
+    loss1 = model1(batch).sum()
+    loss2 = model2(batch).sum()
+    fabric.backward(loss1)  # no error
+    assert not fabric._backward_called
+    fabric.backward(loss2)  # no error
+    assert not fabric._backward_called
+
+
 @RunIf(deepspeed=True, mps=False)
 def test_backward_model_input_required():
     """Test that when using deepspeed and multiple models, backward() requires the model as input."""

0 commit comments
