
Commit 7cc79fe

Reapply torch.compile in Fabric.setup() (#19280)
1 parent 1faddcb commit 7cc79fe

File tree

9 files changed: +204 −38 lines changed

docs/source-fabric/api/fabric_methods.rst

-1
@@ -108,7 +108,6 @@ This is useful if your model experiences *exploding gradients* during training.
     fabric.clip_gradients(model, optimizer, max_norm=2.0, norm_type="inf")

 The :meth:`~lightning.fabric.fabric.Fabric.clip_gradients` method is agnostic to the precision and strategy being used.
-Note: Gradient clipping with FSDP is not yet fully supported.


 to_device
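
For context, the removed caveat goes along with the FSDP gradient-clipping work referenced in the CHANGELOG below. A minimal usage sketch of the documented call (the toy model, optimizer, and input are illustrative and not part of the docs page):

import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="auto", devices=1)
model = torch.nn.Linear(32, 2)  # illustrative toy model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)

output = model(torch.randn(4, 32))
fabric.backward(output.sum())

# Same call as in the docs above; pass norm_type="inf" for the infinity norm
fabric.clip_gradients(model, optimizer, max_norm=2.0)
optimizer.step()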

src/lightning/fabric/CHANGELOG.md

+6
@@ -24,6 +24,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for clipping gradients by value with FSDP ([#19236](https://github.com/Lightning-AI/lightning/pull/19236))


+- Added a utility function and CLI to consolidate FSDP sharded checkpoints into a single file ([#19213](https://github.com/Lightning-AI/lightning/pull/19213))
+
+
+- (Experimental) Added support for re-compiling the model inside `Fabric.setup()` over the FSDP/DDP wrappers ([#19280](https://github.com/Lightning-AI/lightning/pull/19280))
+
+
 ### Changed

 - `seed_everything()` without passing in a seed no longer randomly selects a seed, and now defaults to `0` ([#18846](https://github.com/Lightning-AI/lightning/pull/18846))
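
For reference, a minimal usage sketch of the experimental entry above; the `_reapply_compile` flag and the setup flow come from the fabric.py diff below, while the toy model is illustrative:

import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp")
fabric.launch()

model = torch.nn.Linear(32, 2)  # illustrative toy model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
compiled_model = torch.compile(model)  # compile before setup, as usual

# With _reapply_compile=True, setup() strips the OptimizedModule wrapper, lets the strategy
# wrap the raw module (DDP here), and then re-applies torch.compile with the originally
# captured arguments on top of the strategy wrapper.
fabric_model, optimizer = fabric.setup(compiled_model, optimizer, _reapply_compile=True)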

src/lightning/fabric/fabric.py

+29 −7

@@ -76,6 +76,7 @@
     _FabricDataLoader,
     _FabricModule,
     _FabricOptimizer,
+    _to_compiled,
     _unwrap_compiled,
     _unwrap_objects,
 )
@@ -213,6 +214,7 @@ def setup(
         module: nn.Module,
         *optimizers: Optimizer,
         move_to_device: bool = True,
+        _reapply_compile: Optional[bool] = None,
     ) -> Any:  # no specific return because the way we want our API to look does not play well with mypy
         r"""Set up a model and its optimizers for accelerated training.

@@ -221,12 +223,17 @@ def setup(
             *optimizers: The optimizer(s) to set up (no optimizers is also possible)
             move_to_device: If set ``True`` (default), moves the model to the correct device. Set this to ``False``
                 and alternatively use :meth:`to_device` manually.
+            _reapply_compile: (Experimental) If set to ``True``, and the model was ``torch.compile``d before, the
+                corresponding :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the
+                same settings after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
+                FSDP etc.). Only supported on PyTorch >= 2.1. Defaults to ``False``, but it may change in the future.

         Returns:
             The tuple containing wrapped module and the optimizers, in the same order they were passed in.

         """
         self._validate_setup(module, optimizers)
+        module, compile_kwargs = _unwrap_compiled(module) if _reapply_compile else (module, None)
         original_module = module

         module = self._precision.convert_module(module)
@@ -242,6 +249,8 @@ def setup(
         else:
             module = self._strategy.setup_module(module)

+        if compile_kwargs is not None:
+            module = _to_compiled(module, compile_kwargs)
         module = _FabricModule(module, self._precision, original_module=original_module)

         # Update the _DeviceDtypeModuleMixin's device parameter
@@ -258,8 +267,8 @@ def setup(
         self._models_setup += 1

         if hasattr(original_module, "_fabric"):  # this is probably a LightningModule
-            original_module._fabric = self  # type: ignore[assignment]
-            original_module._fabric_optimizers = optimizers  # type: ignore[assignment]
+            original_module._fabric = self
+            original_module._fabric_optimizers = optimizers
             if original_module not in self._callbacks:
                 self._callbacks.append(original_module)

@@ -270,7 +279,9 @@ def setup(
             return (module, *optimizers)
         return module

-    def setup_module(self, module: nn.Module, move_to_device: bool = True) -> _FabricModule:
+    def setup_module(
+        self, module: nn.Module, move_to_device: bool = True, _reapply_compile: Optional[bool] = None
+    ) -> _FabricModule:
         r"""Set up a model for accelerated training or inference.

         This is the same as calling ``.setup(model)`` with no optimizers. It is useful for inference or for certain
@@ -281,12 +292,17 @@ def setup_module(self, module: nn.Module, move_to_device: bool = True) -> _Fabri
             module: A :class:`torch.nn.Module` to set up
             move_to_device: If set ``True`` (default), moves the model to the correct device. Set this to ``False``
                 and alternatively use :meth:`to_device` manually.
+            _reapply_compile: (Experimental) If set to ``True``, and the model was ``torch.compile``d before, the
+                corresponding :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the
+                same settings after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
+                FSDP etc.). Only supported on PyTorch >= 2.1. Defaults to ``False``, but it may change in the future.

         Returns:
             The wrapped model.

         """
         self._validate_setup_module(module)
+        module, compile_kwargs = _unwrap_compiled(module) if _reapply_compile else (module, None)
         original_module = module

         module = self._precision.convert_module(module)
@@ -296,6 +312,9 @@ def setup_module(self, module: nn.Module, move_to_device: bool = True) -> _Fabri

         # Let strategy wrap and connect the module alone
         module = self._strategy.setup_module(module)
+
+        if compile_kwargs is not None:
+            module = _to_compiled(module, compile_kwargs)
         module = _FabricModule(module, self._precision, original_module=original_module)

         # Update the _DeviceDtypeModuleMixin's device parameter
@@ -305,7 +324,7 @@ def setup_module(self, module: nn.Module, move_to_device: bool = True) -> _Fabri
         )

         if hasattr(original_module, "_fabric"):  # this is probably a LightningModule
-            original_module._fabric = self  # type: ignore[assignment]
+            original_module._fabric = self
             if original_module not in self._callbacks:
                 self._callbacks.append(original_module)

@@ -410,6 +429,7 @@ def backward(self, tensor: Tensor, *args: Any, model: Optional[_FabricModule] =

         """
         module = model._forward_module if model is not None else model
+        module, _ = _unwrap_compiled(module)
         if isinstance(self._strategy, DeepSpeedStrategy):
             if model is None:
                 if self._models_setup == 0:
@@ -641,7 +661,7 @@ def no_backward_sync(self, module: _FabricModule, enabled: bool = True) -> Conte
             skip.

         """
-        module = _unwrap_compiled(module)
+        module, _ = _unwrap_compiled(module)
         if not isinstance(module, _FabricModule):
             raise TypeError(
                 "You need to set up the model first before you can call `fabric.no_backward_sync()`:"
@@ -656,7 +676,9 @@ def no_backward_sync(self, module: _FabricModule, enabled: bool = True) -> Conte
                 category=PossibleUserWarning,
             )
             return nullcontext()
-        return self._strategy._backward_sync_control.no_backward_sync(module._forward_module)
+
+        forward_module, _ = _unwrap_compiled(module._forward_module)
+        return self._strategy._backward_sync_control.no_backward_sync(forward_module)

     def sharded_model(self) -> ContextManager:
         r"""Instantiate a model under this context manager to prepare it for model-parallel sharding.
@@ -772,7 +794,7 @@ def load(
         # We need to unwrap objects (see above) but this creates a new dictionary. In-place updates
         # (for user metadata) wouldn't show up in the original dict, so we need to copy the data back.
         for k in list(unwrapped_state.keys()):
-            obj = _unwrap_compiled(state[k])
+            obj, _ = _unwrap_compiled(state[k])
             if isinstance(obj, (_FabricModule, _FabricOptimizer, _FabricDataLoader)):
                 continue
             state[k] = unwrapped_state[k]
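
For reference, a minimal sketch of how the two helpers used above compose. `_unwrap_compiled` and `_to_compiled` are private utilities from the wrappers.py diff below, and capturing the compile arguments relies on lightning being imported before `torch.compile` is called:

import torch
from lightning.fabric.wrappers import _to_compiled, _unwrap_compiled  # importing this module patches torch.compile

model = torch.nn.Linear(32, 2)  # illustrative toy model
compiled = torch.compile(model, mode="reduce-overhead")

# New tuple convention: (inner object, captured compile kwargs or None)
inner, compile_kwargs = _unwrap_compiled(compiled)  # -> (model, {"mode": "reduce-overhead"})
plain, no_kwargs = _unwrap_compiled(model)          # -> (model, None) for non-compiled objects

# In Fabric.setup(), the strategy wraps `inner` first (DDP/FSDP) and the wrapper is then
# re-compiled; here we re-compile `inner` directly for brevity.
recompiled = _to_compiled(inner, compile_kwargs)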

src/lightning/fabric/wrappers.py

+65 −8

@@ -12,8 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+from copy import deepcopy
 from functools import wraps
-from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, TypeVar, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    overload,
+)

 import torch
 from lightning_utilities.core.apply_func import apply_to_collection
@@ -32,6 +47,9 @@
 from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
 from lightning.fabric.utilities.types import Optimizable

+if TYPE_CHECKING:
+    from torch._dynamo import OptimizedModule
+
 T_destination = TypeVar("T_destination", bound=Dict[str, Any])
 _LIGHTNING_MODULE_STEP_METHODS = ("training_step", "validation_step", "test_step", "predict_step")

@@ -285,8 +303,8 @@ def _unwrap_objects(collection: Any) -> Any:
     def _unwrap(
         obj: Union[_FabricModule, _FabricOptimizer, _FabricDataLoader]
     ) -> Union[nn.Module, Optimizer, DataLoader]:
-        if isinstance(unwrapped := _unwrap_compiled(obj), _FabricModule):
-            return unwrapped._forward_module
+        if isinstance(unwrapped := _unwrap_compiled(obj)[0], _FabricModule):
+            return _unwrap_compiled(unwrapped._forward_module)[0]
         if isinstance(obj, _FabricOptimizer):
             return obj.optimizer
         if isinstance(obj, _FabricDataLoader):
@@ -302,19 +320,33 @@ def _unwrap(
     return apply_to_collection(collection, dtype=tuple(types), function=_unwrap)


-def _unwrap_compiled(obj: Any) -> Any:
+def _unwrap_compiled(obj: Union[Any, "OptimizedModule"]) -> Tuple[Union[Any, nn.Module], Optional[Dict[str, Any]]]:
     """Removes the :class:`torch._dynamo.OptimizedModule` around the object if it is wrapped.

     Use this function before instance checks against e.g. :class:`_FabricModule`.

     """
     if not _TORCH_GREATER_EQUAL_2_0:
-        return obj
+        # obj can't be an `OptimizedModule` anyway
+        return obj, None
+
     from torch._dynamo import OptimizedModule

     if isinstance(obj, OptimizedModule):
-        return obj._orig_mod
-    return obj
+        if (compile_kwargs := getattr(obj, "_compile_kwargs", None)) is None:
+            raise RuntimeError(
+                "Failed to determine the arguments that were used to compile the module. Make sure to import"
+                " lightning before `torch.compile` is used."
+            )
+        return obj._orig_mod, compile_kwargs
+    return obj, None
+
+
+def _to_compiled(module: nn.Module, compile_kwargs: Dict[str, Any]) -> "OptimizedModule":
+    if not _TORCH_GREATER_EQUAL_2_0:
+        raise RuntimeError("Converting to a compiled module is only supported in PyTorch >= 2.0.0")
+
+    return torch.compile(module, **compile_kwargs)  # type: ignore[return-value]


 def is_wrapped(obj: object) -> bool:
@@ -328,5 +360,30 @@ def is_wrapped(obj: object) -> bool:
         obj: The object to test.

     """
-    obj = _unwrap_compiled(obj)
+    obj, _ = _unwrap_compiled(obj)
     return isinstance(obj, (_FabricModule, _FabricOptimizer, _FabricDataLoader))
+
+
+def _capture_compile_kwargs(compile_fn: Callable) -> Callable:
+    """Wraps the ``torch.compile`` function and captures the compile arguments.
+
+    We extract the compile arguments so that we can reapply ``torch.compile`` in ``Fabric.setup()`` with the
+    same arguments as the user passed to the original call. The arguments get stored in a dictionary
+    ``_compile_kwargs`` on the returned compiled module.
+
+    """
+    # Limitation: Currently, the global compile config does not get captured on a per-model basis.
+    # PyTorch will resolve this in the future: https://github.com/pytorch/pytorch/issues/116575
+
+    @wraps(compile_fn)
+    def _capture(model: Any, **kwargs: Any) -> Any:
+        compiled_model = compile_fn(model, **kwargs)
+        if isinstance(model, nn.Module):
+            compiled_model._compile_kwargs = deepcopy(kwargs)
+        return compiled_model
+
+    return _capture
+
+
+if _TORCH_GREATER_EQUAL_2_0:
+    torch.compile = _capture_compile_kwargs(torch.compile)
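
A small sketch of the capture mechanism above in action; `_compile_kwargs` is a private attribute set by the patched `torch.compile`, and the import-order requirement mirrors the error message in `_unwrap_compiled` (assumes PyTorch >= 2.0):

import lightning.fabric  # import lightning first so torch.compile is patched to record its kwargs
import torch

model = torch.nn.Linear(32, 2)  # illustrative toy model
compiled = torch.compile(model, mode="reduce-overhead", dynamic=False)

# The patched torch.compile stored a copy of the keyword arguments on the compiled module,
# which is what lets Fabric.setup(..., _reapply_compile=True) re-compile over the strategy wrapper.
assert compiled._compile_kwargs == {"mode": "reduce-overhead", "dynamic": False}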

src/lightning/pytorch/CHANGELOG.md

+3
@@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added the option `ModelCheckpoint(save_last='link')` to create a symbolic link for the 'last.ckpt' file ([#19191](https://github.com/Lightning-AI/lightning/pull/19191))


+- Added a utility function and CLI to consolidate FSDP sharded checkpoints into a single file ([#19213](https://github.com/Lightning-AI/lightning/pull/19213))
+
+
 ### Changed

 - `seed_everything()` without passing in a seed no longer randomly selects a seed, and now defaults to `0` ([#18846](https://github.com/Lightning-AI/lightning/pull/18846))

tests/tests_fabric/strategies/test_ddp_integration.py

+40
@@ -11,14 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from copy import deepcopy
+from unittest import mock
+from unittest.mock import Mock

 import pytest
 import torch
 from lightning.fabric import Fabric
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
+from torch.nn.parallel.distributed import DistributedDataParallel

 from tests_fabric.helpers.runif import RunIf
 from tests_fabric.strategies.test_single_device import _run_test_clip_gradients
+from tests_fabric.test_fabric import BoringModel


 @pytest.mark.parametrize(
@@ -64,6 +70,40 @@ def assert_params_equal(params0, params1):
     assert_params_equal(params_before, wrapped_model.parameters())


+@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True)
+@mock.patch(
+    "lightning.fabric.wrappers.torch.compile",
+    Mock(wraps=(torch.compile if _TORCH_GREATER_EQUAL_2_0 else None)),
+)
+@mock.patch.dict(os.environ, {})
+def test_reapply_compile():
+    """Test that Fabric can rewrap a compiled module such that compilation happens over the DDP-wrapper."""
+    from torch._dynamo import OptimizedModule
+
+    fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp")
+    fabric.launch()
+
+    model = BoringModel()
+    compile_kwargs = {"mode": "reduce-overhead"}
+    compiled_model = torch.compile(model, **compile_kwargs)
+    torch.compile.reset_mock()
+
+    fabric_model = fabric.setup(compiled_model, _reapply_compile=True)
+
+    assert isinstance(fabric_model._forward_module, OptimizedModule)
+    assert isinstance(fabric_model._forward_module._orig_mod, DistributedDataParallel)
+    # Assert we called compile again with the same arguments, but on the DDP-wrapped module
+    torch.compile.assert_called_with(fabric_model._forward_module._orig_mod, **compile_kwargs)
+
+    assert fabric_model._original_module == model
+    assert fabric_model._forward_module._orig_mod.module == model
+    assert fabric_model.device == fabric.device
+
+    # Smoke-testing forward to ensure we don't get compilation errors
+    for _ in range(3):
+        fabric_model(torch.randn(2, 32, device=fabric.device)).sum().backward()
+
+
 @pytest.mark.parametrize(
     ("clip_type", "accelerator", "precision"),
     [
