Commit b1127e3

awaelchli and carmocca authored
Utility to consolidate sharded checkpoints (#19213)
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
1 parent ed367ca commit b1127e3

File tree

11 files changed: +505 −7 lines changed

docs/source-fabric/guide/checkpoint/distributed_checkpoint.rst

+33 −1

@@ -183,4 +183,36 @@ Note that you can load the distributed checkpoint even if the world size has changed
 Convert a distributed checkpoint
 ********************************

-Coming soon.
+It is possible to convert a distributed checkpoint to a regular, single-file checkpoint with this utility:
+
+.. code-block:: bash
+
+    python -m lightning.fabric.utilities.consolidate_checkpoint path/to/my/checkpoint
+
+You will need to do this, for example, if you want to load the checkpoint into a script that doesn't use FSDP, or if you need to export the checkpoint to a different format for deployment, evaluation, etc.
+
+.. note::
+
+    All tensors in the checkpoint will be converted to CPU tensors, and no GPUs are required to run the conversion command.
+    This function assumes you have enough free CPU memory to hold the entire checkpoint in memory.
+
+.. collapse:: Full example
+
+    Assuming you have saved a checkpoint ``my-checkpoint.ckpt`` using the examples above, run the following command to convert it:
+
+    .. code-block:: bash
+
+        python -m lightning.fabric.utilities.consolidate_checkpoint my-checkpoint.ckpt
+
+    This saves a new file ``my-checkpoint.ckpt.consolidated`` next to the sharded checkpoint, which you can load normally in PyTorch:
+
+    .. code-block:: python
+
+        import torch
+
+        checkpoint = torch.load("my-checkpoint.ckpt.consolidated")
+        print(list(checkpoint.keys()))
+        print(checkpoint["model"]["transformer.decoder.layers.31.norm1.weight"])
+
+
+|
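
For context, a minimal sketch (not part of this diff) of loading the consolidated Fabric checkpoint into a plain, non-FSDP script; the model definition is a hypothetical stand-in whose parameter names must match the keys under checkpoint["model"]:

    import torch
    import torch.nn as nn

    # hypothetical stand-in for the module that produced the checkpoint;
    # its parameter names must match the keys in checkpoint["model"]
    model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))

    checkpoint = torch.load("my-checkpoint.ckpt.consolidated")
    # Fabric saved the module under the "model" key of the state passed to fabric.save()
    model.load_state_dict(checkpoint["model"])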

docs/source-pytorch/common/checkpointing_expert.rst

+34 −1

@@ -136,4 +136,37 @@ Note that you can load the distributed checkpoint even if the world size has changed
 Convert a distributed checkpoint
 ********************************

-Coming soon.
+It is possible to convert a distributed checkpoint to a regular, single-file checkpoint with this utility:
+
+.. code-block:: bash
+
+    python -m lightning.pytorch.utilities.consolidate_checkpoint path/to/my/checkpoint
+
+You will need to do this, for example, if you want to load the checkpoint into a script that doesn't use FSDP, or if you need to export the checkpoint to a different format for deployment, evaluation, etc.
+
+.. note::
+
+    All tensors in the checkpoint will be converted to CPU tensors, and no GPUs are required to run the conversion command.
+    This function assumes you have enough free CPU memory to hold the entire checkpoint in memory.
+
+.. collapse:: Full example
+
+    Assuming you have saved a checkpoint ``epoch=0-step=3.ckpt`` using the examples above, run the following command to convert it:
+
+    .. code-block:: bash
+
+        cd lightning_logs/version_0/checkpoints
+        python -m lightning.pytorch.utilities.consolidate_checkpoint epoch=0-step=3.ckpt
+
+    This saves a new file ``epoch=0-step=3.ckpt.consolidated`` next to the sharded checkpoint, which you can load normally in PyTorch:
+
+    .. code-block:: python
+
+        import torch
+
+        checkpoint = torch.load("epoch=0-step=3.ckpt.consolidated")
+        print(list(checkpoint.keys()))
+        print(checkpoint["state_dict"]["model.transformer.decoder.layers.31.norm1.weight"])
+
+
+|
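
Since the consolidated Trainer checkpoint uses the standard single-file format, a rough sketch (MyLightningModule is a hypothetical placeholder, assuming its hyperparameters were saved or its __init__ needs no extra arguments) of resuming from it could look like:

    from lightning.pytorch import Trainer

    # MyLightningModule is a placeholder for the LightningModule that produced the checkpoint
    model = MyLightningModule.load_from_checkpoint("epoch=0-step=3.ckpt.consolidated")

    # or resume training from the consolidated file with a non-FSDP strategy
    trainer = Trainer(accelerator="gpu", devices=1)
    trainer.fit(model, ckpt_path="epoch=0-step=3.ckpt.consolidated")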

src/lightning/fabric/strategies/fsdp.py

+1 −2

@@ -69,7 +69,7 @@
     _TORCH_GREATER_EQUAL_2_2,
 )
 from lightning.fabric.utilities.init import _EmptyInit
-from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors, _move_state_into
+from lightning.fabric.utilities.load import _METADATA_FILENAME, _lazy_load, _materialize_tensors, _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_only, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import _PATH, _Stateful
@@ -87,7 +87,6 @@
 _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]]

 _FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload")
-_METADATA_FILENAME = "meta.pt"


 class FSDPStrategy(ParallelStrategy, _Sharded):

src/lightning/fabric/utilities/consolidate_checkpoint.py

+79

@@ -0,0 +1,79 @@
import logging
from argparse import ArgumentParser, Namespace
from pathlib import Path

import torch

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
from lightning.fabric.utilities.load import _METADATA_FILENAME, _load_distributed_checkpoint

_log = logging.getLogger(__name__)


def _parse_cli_args() -> Namespace:
    parser = ArgumentParser(
        description=(
            "Converts a distributed/sharded checkpoint into a single file that can be loaded with `torch.load()`."
            " Only supports FSDP sharded checkpoints at the moment."
        ),
    )
    parser.add_argument(
        "checkpoint_folder",
        type=str,
        help=(
            "Path to a checkpoint folder, containing the sharded checkpoint files saved using the"
            " `torch.distributed.checkpoint` API."
        ),
    )
    parser.add_argument(
        "--output_file",
        type=str,
        help=(
            "Path to the file where the converted checkpoint should be saved. The file should not already exist."
            " If no path is provided, the file will be saved next to the input checkpoint folder with the same name"
            " and a '.consolidated' suffix."
        ),
    )
    return parser.parse_args()


def _process_cli_args(args: Namespace) -> Namespace:
    if not _TORCH_GREATER_EQUAL_2_1:
        _log.error("Processing distributed checkpoints requires PyTorch >= 2.1.")
        exit(1)

    checkpoint_folder = Path(args.checkpoint_folder)
    if not checkpoint_folder.exists():
        _log.error(f"The provided checkpoint folder does not exist: {checkpoint_folder}")
        exit(1)
    if not checkpoint_folder.is_dir():
        _log.error(
            f"The provided checkpoint path must be a folder, containing the checkpoint shards: {checkpoint_folder}"
        )
        exit(1)
    if not (checkpoint_folder / _METADATA_FILENAME).is_file():
        _log.error(
            "Only FSDP-sharded checkpoints saved with Lightning are supported for consolidation. The provided folder"
            f" is not in that format: {checkpoint_folder}"
        )
        exit(1)

    if args.output_file is None:
        output_file = checkpoint_folder.with_suffix(checkpoint_folder.suffix + ".consolidated")
    else:
        output_file = Path(args.output_file)
    if output_file.exists():
        _log.error(
            "The path for the converted checkpoint already exists. Choose a different path by providing"
            f" `--output_file` or move/delete the file first: {output_file}"
        )
        exit(1)

    return Namespace(checkpoint_folder=checkpoint_folder, output_file=output_file)


if __name__ == "__main__":
    args = _parse_cli_args()
    config = _process_cli_args(args)
    checkpoint = _load_distributed_checkpoint(config.checkpoint_folder)
    torch.save(checkpoint, config.output_file)
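
For orientation, the __main__ block above can also be reproduced programmatically; a small sketch (paths are placeholders) using the same helpers:

    from pathlib import Path

    import torch

    from lightning.fabric.utilities.load import _load_distributed_checkpoint

    # read every shard into one CPU state dict, then write a single torch.load-able file
    checkpoint = _load_distributed_checkpoint(Path("path/to/my/checkpoint"))
    torch.save(checkpoint, "path/to/my/checkpoint.consolidated")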

src/lightning/fabric/utilities/load.py

+83 −2

@@ -15,7 +15,8 @@
 import warnings
 from functools import partial
 from io import BytesIO
-from typing import IO, TYPE_CHECKING, Any, Callable, Dict, Optional, OrderedDict, Sequence, Set, Union
+from pathlib import Path
+from typing import IO, TYPE_CHECKING, Any, Callable, Dict, Optional, OrderedDict, Sequence, Set, Tuple, Union

 import torch
 from lightning_utilities.core.apply_func import apply_to_collection
@@ -24,9 +25,16 @@
 from torch.nn import Parameter
 from typing_extensions import override

-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
+from lightning.fabric.utilities.imports import (
+    _TORCH_GREATER_EQUAL_2_0,
+    _TORCH_GREATER_EQUAL_2_1,
+    _TORCH_GREATER_EQUAL_2_2,
+)
 from lightning.fabric.utilities.types import _PATH, _Stateful

+_METADATA_FILENAME = "meta.pt"
+
+
 if TYPE_CHECKING:
     from torch.storage import TypedStorage

@@ -227,3 +235,76 @@ def _move_state_into(
         destination[key].load_state_dict(state)
     else:
         destination[key] = state
+
+
+def _load_distributed_checkpoint(checkpoint_folder: Path) -> Dict[str, Any]:
+    """Loads a sharded checkpoint saved with the `torch.distributed.checkpoint` API into a full state dict.
+
+    The current implementation assumes that the entire checkpoint fits in CPU memory.
+
+    """
+    if not _TORCH_GREATER_EQUAL_2_1:
+        raise ImportError("Processing distributed checkpoints requires PyTorch >= 2.1.")
+
+    from torch.distributed.checkpoint import FileSystemReader
+    from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
+
+    if _TORCH_GREATER_EQUAL_2_2:
+        from torch.distributed.checkpoint import load
+    else:
+        from torch.distributed.checkpoint import load_state_dict as load  # deprecated
+
+    reader = FileSystemReader(checkpoint_folder)
+    metadata = reader.read_metadata()
+
+    # TODO: Add sequential save to avoid storing the entire checkpoint in memory
+    checkpoint: Dict[str, Any] = {}
+    for tensor_name, sd_metadata in metadata.state_dict_metadata.items():
+        if isinstance(sd_metadata, BytesStorageMetadata):
+            checkpoint[tensor_name] = "<bytes_io>"
+        elif isinstance(sd_metadata, TensorStorageMetadata):
+            checkpoint[tensor_name] = torch.empty(
+                size=sd_metadata.size,
+                dtype=sd_metadata.properties.dtype,
+                device=torch.device("cpu"),
+                memory_format=sd_metadata.properties.memory_format,
+                layout=sd_metadata.properties.layout,
+                requires_grad=sd_metadata.properties.requires_grad,
+                pin_memory=sd_metadata.properties.pin_memory,
+            )
+
+    load(state_dict=checkpoint, storage_reader=reader, no_dist=True)
+    checkpoint = _unflatten_dict(checkpoint, key_map=metadata.planner_data)
+
+    # This is the extra file saved by Fabric, with user data separate from weights and optimizer states
+    extra_file = checkpoint_folder / _METADATA_FILENAME
+    extra = torch.load(extra_file, map_location="cpu") if extra_file.is_file() else {}
+    checkpoint.update(extra)
+
+    return checkpoint
+
+
+def _unflatten_dict(checkpoint: Dict[str, Any], key_map: Dict[str, Tuple[str, ...]]) -> Dict[str, Any]:
+    """Converts the flat dictionary with keys 'x.y.z...' to a nested dictionary using the provided key map.
+
+    Args:
+        checkpoint: The flat checkpoint dictionary.
+        key_map: A dictionary that maps the keys in flattened format 'x.y.z...' to a tuple representing
+            the index path into the nested dictionary that this function should construct.
+
+    """
+    assert checkpoint.keys() == key_map.keys()
+    converted: Dict[str, Any] = {}
+    for flat_key in checkpoint:
+        key_path = key_map[flat_key]
+        _set_nested_dict_value(converted, key_path, checkpoint[flat_key])
+    return converted
+
+
+def _set_nested_dict_value(nested_dict: Dict[str, Any], key_path: Tuple[str, ...], value: Any) -> None:
+    result = nested_dict
+    for key in key_path[:-1]:
+        if key not in result:
+            result[key] = {}
+        result = result[key]
+    result[key_path[-1]] = value
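
To illustrate the key-map handling in _unflatten_dict above, a toy example (keys and values invented for demonstration):

    from lightning.fabric.utilities.load import _unflatten_dict

    flat = {"model.layer.weight": 1, "optimizer.state.step": 2}
    key_map = {
        "model.layer.weight": ("model", "layer.weight"),
        "optimizer.state.step": ("optimizer", "state", "step"),
    }
    # rebuilds the nested structure described by key_map:
    # {'model': {'layer.weight': 1}, 'optimizer': {'state': {'step': 2}}}
    print(_unflatten_dict(flat, key_map))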

src/lightning/pytorch/utilities/consolidate_checkpoint.py

+30

@@ -0,0 +1,30 @@
import re
from typing import Any, Dict

import torch

from lightning.fabric.utilities.consolidate_checkpoint import _parse_cli_args, _process_cli_args
from lightning.fabric.utilities.load import _load_distributed_checkpoint


def _format_checkpoint(checkpoint: Dict[str, Any]) -> Dict[str, Any]:
    """Converts the special FSDP checkpoint format to the standard format the Lightning Trainer can load."""
    # Rename the model key
    checkpoint["state_dict"] = checkpoint.pop("model")

    optimizer_keys = [key for key in checkpoint if re.match("optimizer_[0-9]+", key)]
    if not optimizer_keys:
        return checkpoint

    # Optimizers are saved in special keys named `optimizer_0`, `optimizer_1`, etc.
    # These need to be merged back into a Python list
    checkpoint["optimizer_states"] = [checkpoint.pop(f"optimizer_{opt_idx}") for opt_idx in range(len(optimizer_keys))]
    return checkpoint


if __name__ == "__main__":
    args = _parse_cli_args()
    config = _process_cli_args(args)
    checkpoint = _load_distributed_checkpoint(config.checkpoint_folder)
    checkpoint = _format_checkpoint(checkpoint)
    torch.save(checkpoint, config.output_file)
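
A toy illustration (made-up values; module path as referenced in the docs above) of the key remapping that _format_checkpoint performs:

    from lightning.pytorch.utilities.consolidate_checkpoint import _format_checkpoint

    fabric_style = {"model": {"layer.weight": 1}, "optimizer_0": {"state": {}}, "epoch": 0}
    # -> {'epoch': 0, 'state_dict': {'layer.weight': 1}, 'optimizer_states': [{'state': {}}]}
    print(_format_checkpoint(fabric_style))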

tests/tests_fabric/strategies/test_fsdp_integration.py

+49

@@ -22,6 +22,7 @@
 from lightning.fabric.plugins import FSDPPrecision
 from lightning.fabric.strategies import FSDPStrategy
 from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1
+from lightning.fabric.utilities.load import _load_distributed_checkpoint
 from lightning.fabric.wrappers import _FabricOptimizer
 from torch.distributed.fsdp import FlatParameter, FullyShardedDataParallel, OptimStateKeyType
 from torch.distributed.fsdp.wrap import always_wrap_policy, wrap
@@ -549,3 +550,51 @@ def test_clip_gradients(clip_type, precision):

     optimizer.step()
     optimizer.zero_grad()
+
+
+@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0")
+def test_save_sharded_and_consolidate_and_load(tmp_path):
+    """Test the consolidation of a FSDP-sharded checkpoint into a single file."""
+
+    fabric = Fabric(
+        accelerator="cuda",
+        strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy, state_dict_type="sharded"),
+        devices=2,
+    )
+    fabric.launch()
+
+    model = BoringModel()
+    optimizer = torch.optim.Adam(model.parameters())
+    model, optimizer = fabric.setup(model, optimizer)
+    state = {"model": model, "optimizer": optimizer, "steps": 1}
+
+    # run one iteration to init the state of the optimizer
+    model(torch.rand(1, 32, device=fabric.device)).sum().backward()
+    optimizer.step()
+
+    checkpoint_path_sharded = fabric.broadcast(str(tmp_path / "checkpoint_sharded"))
+    fabric.save(checkpoint_path_sharded, state)
+    assert set(os.listdir(checkpoint_path_sharded)) == {"meta.pt", ".metadata", "__0_0.distcp", "__1_0.distcp"}
+
+    # consolidate the checkpoint to a single file
+    checkpoint_path_full = fabric.broadcast(str(tmp_path / "checkpoint_full.pt"))
+    if fabric.global_rank == 0:
+        checkpoint = _load_distributed_checkpoint(Path(checkpoint_path_sharded))
+        torch.save(checkpoint, checkpoint_path_full)
+    fabric.barrier()
+
+    # re-init and load from full checkpoint
+    fabric = Fabric(
+        accelerator="cuda",
+        strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy),
+        devices=2,
+    )
+
+    # Hack: we already called launch() on another Fabric instance above
+    fabric._launched = True
+
+    model = BoringModel()
+    optimizer = torch.optim.Adam(model.parameters())
+    model, optimizer = fabric.setup(model, optimizer)
+    state = {"model": model, "optimizer": optimizer, "steps": 1}
+    fabric.load(checkpoint_path_full, state)
