
Commit b3ede34

Merge branch 'master' into delete_data_stuffs
2 parents c6d9726 + 2a827f3 commit b3ede34

File tree

12 files changed (+26, -96 lines)

12 files changed

+26
-96
lines changed

.azure/gpu-tests-pytorch.yml (+1, -8)

@@ -105,16 +105,9 @@ jobs:
       done
     displayName: "Adjust dependencies"

-  - bash: |
-      pip install -q -r .actions/requirements.txt
-      python .actions/assistant.py requirements_prune_pkgs \
-        --packages="[lightning-colossalai]" \
-        --req_files="[requirements/_integrations/strategies.txt]"
-    displayName: "Prune packages" # these have installation issues
-
   - bash: |
       extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
-      pip install -e ".[${extra}dev]" -r requirements/_integrations/strategies.txt pytest-timeout -U --find-links="${TORCH_URL}"
+      pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}"
     displayName: "Install package & dependencies"

   - bash: pip uninstall -y lightning

docs/source-pytorch/common/checkpointing_intermediate.rst (+5, -3)

@@ -167,9 +167,11 @@ In distributed training cases where a model is running across many machines, Lig
     trainer = Trainer(strategy="ddp")
     model = MyLightningModule(hparams)
     trainer.fit(model)
+
     # Saves only on the main process
+    # Handles strategy-specific saving logic like XLA, FSDP, DeepSpeed etc.
     trainer.save_checkpoint("example.ckpt")

-Not using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` can lead to unexpected behavior and potential deadlock. Using other saving functions will result in all devices attempting to save the checkpoint. As a result, we highly recommend using the Trainer's save functionality.
-If using custom saving functions cannot be avoided, we recommend using the :func:`~lightning.pytorch.utilities.rank_zero.rank_zero_only` decorator to ensure saving occurs only on the main process. Note that this will only work if all ranks hold the exact same state and won't work when using
-model parallel distributed strategies such as deepspeed or sharded training.
+
+By using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` instead of ``torch.save``, you make your code agnostic to the distributed training strategy being used.
+It will ensure that checkpoints are saved correctly in a multi-process setting, avoiding race conditions, deadlocks and other common issues that normally require boilerplate code to handle properly.
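
For context, a minimal end-to-end sketch of the pattern the updated docs recommend. It uses the BoringModel demo class (already imported in this commit's tests); the extra Trainer arguments and the checkpoint path are illustrative, not part of the diff.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

if __name__ == "__main__":  # good practice with multi-process launchers
    model = BoringModel()  # stands in for any LightningModule
    trainer = Trainer(strategy="ddp", max_epochs=1, limit_train_batches=4)
    trainer.fit(model)

    # Called on every rank, but only the main process writes the file; the Trainer
    # also applies strategy-specific saving logic (FSDP, DeepSpeed, XLA, ...).
    trainer.save_checkpoint("example.ckpt")
```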

docs/source-pytorch/starter/installation.rst (+6, -4)

@@ -16,7 +16,9 @@ Install lightning inside a virtual env or conda environment with pip

    python -m pip install lightning

---------------
+
+----
+

 ******************
 Install with Conda

@@ -66,17 +68,17 @@ Install future patch releases from the source. Note that the patch release conta
 ^^^^^^^^^^^^^^^^^^^^^^
 Custom PyTorch Version
 ^^^^^^^^^^^^^^^^^^^^^^
-To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.

+To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.
 You can find the list of supported PyTorch versions in our :ref:`compatibility matrix <versioning:Compatibility matrix>`.

 ----


 *******************************************
-Optimized for ML workflows (lightning Apps)
+Optimized for ML workflows (Lightning Apps)
 *******************************************
-If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized `lightning[apps]` package:
+If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized ``lightning[apps]`` package:

 .. code-block:: bash


requirements/_integrations/strategies.txt (-4)

This file was deleted.

src/lightning/pytorch/callbacks/gradient_accumulation_scheduler.py (+1, -8)

@@ -27,7 +27,6 @@
 import lightning.pytorch as pl
 from lightning.pytorch.callbacks.callback import Callback
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
 from lightning.pytorch.utilities.model_helpers import is_overridden
 from lightning.pytorch.utilities.rank_zero import rank_zero_warn

@@ -125,13 +124,7 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")
         # local import to avoid circular import
         from lightning.pytorch.strategies import DeepSpeedStrategy

-        unsupported_strategies = [DeepSpeedStrategy]
-        if _LIGHTNING_COLOSSALAI_AVAILABLE:
-            from lightning_colossalai import ColossalAIStrategy
-
-            unsupported_strategies.append(ColossalAIStrategy)
-
-        if isinstance(trainer.strategy, tuple(unsupported_strategies)):
+        if isinstance(trainer.strategy, DeepSpeedStrategy):
             raise RuntimeError(
                 f"The `{type(trainer.strategy).__name__}` does not support `accumulate_grad_batches` changing"
                 " between epochs."

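For reference, a small sketch of how the callback touched above is typically configured; the schedule values are arbitrary, and the closing comment reflects the check kept by this diff.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import GradientAccumulationScheduler

# Accumulate 4 batches per optimizer step from epoch 0, then 2 from epoch 8 onward.
scheduler = GradientAccumulationScheduler(scheduling={0: 4, 8: 2})
trainer = Trainer(callbacks=[scheduler])

# Combining this callback with strategy="deepspeed" would hit the RuntimeError above,
# since DeepSpeed requires accumulate_grad_batches to stay fixed between epochs.
```
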
src/lightning/pytorch/callbacks/model_checkpoint.py (+5, -6)

@@ -89,13 +89,12 @@ class ModelCheckpoint(Checkpoint):
             in a deterministic manner. Default: ``None``.
         save_top_k: if ``save_top_k == k``,
             the best k models according to the quantity monitored will be saved.
-            if ``save_top_k == 0``, no models are saved.
-            if ``save_top_k == -1``, all models are saved.
+            If ``save_top_k == 0``, no models are saved.
+            If ``save_top_k == -1``, all models are saved.
             Please note that the monitors are checked every ``every_n_epochs`` epochs.
-            if ``save_top_k >= 2`` and the callback is called multiple
-            times inside an epoch, the name of the saved file will be
-            appended with a version count starting with ``v1``
-            unless ``enable_version_counter`` is set to False.
+            If ``save_top_k >= 2`` and the callback is called multiple times inside an epoch, and the filename remains
+            unchanged, the name of the saved file will be appended with a version count starting with ``v1`` to avoid
+            collisions unless ``enable_version_counter`` is set to False.
         mode: one of {min, max}.
             If ``save_top_k != 0``, the decision to overwrite the current save file is made
             based on either the maximization or the minimization of the monitored quantity.
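
A brief usage sketch of the documented behaviour; the monitored metric name (``val_loss``) is an assumption and must match whatever the LightningModule actually logs.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

# Keep the 3 best checkpoints by validation loss; save_top_k=-1 would keep all,
# save_top_k=0 would disable saving entirely.
checkpoint_cb = ModelCheckpoint(
    monitor="val_loss",  # assumed metric name, must be logged by the model
    mode="min",
    save_top_k=3,
)
trainer = Trainer(callbacks=[checkpoint_cb])
# If the callback fires several times within one epoch and produces the same filename,
# a version suffix (-v1, -v2, ...) is appended unless enable_version_counter=False.
```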

src/lightning/pytorch/core/hooks.py (+4)

@@ -85,6 +85,10 @@ def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -
             batch: The batched data as it is returned by the training DataLoader.
             batch_idx: the index of the batch

+        Note:
+            The value ``outputs["loss"]`` here will be the normalized value w.r.t ``accumulate_grad_batches`` of the
+            loss returned from ``training_step``.
+
         """

     def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
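
To make the new note concrete, a hedged sketch that subclasses the BoringModel demo; it assumes the default automatic-optimization path, where the hook's ``outputs`` carries the loss under the ``"loss"`` key.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel


class LossInspectingModel(BoringModel):
    def on_train_batch_end(self, outputs, batch, batch_idx):
        # With accumulate_grad_batches=4 below, this value is the training_step loss
        # divided by 4, as described in the docstring note added above.
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs
        print(f"batch {batch_idx}: normalized loss = {float(loss):.4f}")


if __name__ == "__main__":
    trainer = Trainer(accumulate_grad_batches=4, max_epochs=1, limit_train_batches=8)
    trainer.fit(LossInspectingModel())
```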

src/lightning/pytorch/core/module.py (+2, -10)

@@ -1285,20 +1285,12 @@ def optimizer_step(

         Examples::

-            # DEFAULT
             def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-                optimizer.step(closure=optimizer_closure)
+                # Add your custom logic to run directly before `optimizer.step()`

-            # Learning rate warm-up
-            def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-                # update params
                 optimizer.step(closure=optimizer_closure)

-                # manually warm up lr without a scheduler
-                if self.trainer.global_step < 500:
-                    lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
-                    for pg in optimizer.param_groups:
-                        pg["lr"] = lr_scale * self.learning_rate
+                # Add your custom logic to run directly after `optimizer.step()`

         """
         optimizer.step(closure=optimizer_closure)
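
As a usage sketch of the slimmed-down docstring example, the override below mirrors the new comments; the subclass name and the suggested before/after actions are illustrative, not part of the diff.

```python
from lightning.pytorch.demos.boring_classes import BoringModel


class CustomStepModel(BoringModel):
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
        # Anything placed here runs directly before `optimizer.step()`,
        # e.g. gradient inspection or manual LR warm-up.
        optimizer.step(closure=optimizer_closure)
        # Anything placed here runs directly after `optimizer.step()`,
        # e.g. updating an EMA copy of the weights.
```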

src/lightning/pytorch/trainer/connectors/accelerator_connector.py (+1, -20)

@@ -62,10 +62,7 @@
 )
 from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import (
-    _LIGHTNING_COLOSSALAI_AVAILABLE,
-    _habana_available_and_importable,
-)
+from lightning.pytorch.utilities.imports import _habana_available_and_importable
 from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn

 log = logging.getLogger(__name__)

@@ -191,9 +188,6 @@ def _check_config_and_set_final_flags(

         self._strategy_flag = strategy

-        if strategy == "colossalai" and not _LIGHTNING_COLOSSALAI_AVAILABLE:
-            raise ModuleNotFoundError(str(_LIGHTNING_COLOSSALAI_AVAILABLE))
-
         if strategy != "auto" and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
             raise ValueError(
                 f"You selected an invalid strategy name: `strategy={strategy!r}`."

@@ -490,12 +484,6 @@ def _check_and_init_precision(self) -> Precision:
         if isinstance(self.accelerator, HPUAccelerator):
             return HPUPrecisionPlugin(self._precision_flag)

-        if _LIGHTNING_COLOSSALAI_AVAILABLE:
-            from lightning_colossalai import ColossalAIPrecisionPlugin, ColossalAIStrategy
-
-            if isinstance(self.strategy, ColossalAIStrategy):
-                return ColossalAIPrecisionPlugin(self._precision_flag)
-
         if isinstance(self.strategy, (SingleDeviceXLAStrategy, XLAStrategy)):
             return XLAPrecision(self._precision_flag)  # type: ignore
         if isinstance(self.strategy, DeepSpeedStrategy):

@@ -648,13 +636,6 @@ def _set_torch_flags(

 def _register_external_accelerators_and_strategies() -> None:
     """Registers all known strategies in other packages."""
-    if _LIGHTNING_COLOSSALAI_AVAILABLE:
-        from lightning_colossalai import ColossalAIStrategy
-
-        # TODO: Prevent registering multiple times
-        if "colossalai" not in StrategyRegistry:
-            ColossalAIStrategy.register_strategies(StrategyRegistry)
-
     if _habana_available_and_importable():
         from lightning_habana import HPUAccelerator, HPUParallelStrategy, SingleHPUStrategy

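
One observable consequence of these removals, as a hedged reading of the remaining code: unless an external plugin registers the name again, ``strategy="colossalai"`` now falls through to the generic invalid-strategy check instead of raising the old ModuleNotFoundError.

```python
from lightning.pytorch import Trainer

try:
    Trainer(strategy="colossalai")
except ValueError as err:
    # e.g. "You selected an invalid strategy name: `strategy='colossalai'` ..."
    print(err)
```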

src/lightning/pytorch/utilities/imports.py (-1)

@@ -28,7 +28,6 @@

 _OMEGACONF_AVAILABLE = package_available("omegaconf")
 _TORCHVISION_AVAILABLE = RequirementCache("torchvision")
-_LIGHTNING_COLOSSALAI_AVAILABLE = RequirementCache("lightning-colossalai")


 @functools.lru_cache(maxsize=128)
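
For background on the flag being deleted here, a short sketch of how these ``RequirementCache`` availability flags are generally consumed (based on the ``lightning_utilities`` API; the torchvision example is illustrative, mirroring the ``raise ModuleNotFoundError(str(...))`` pattern removed from the connector above).

```python
from lightning_utilities.core.imports import RequirementCache

# Truthiness reflects whether the requirement is installed; str() of a failed check
# yields a human-readable message explaining what is missing.
_TORCHVISION_AVAILABLE = RequirementCache("torchvision")

if _TORCHVISION_AVAILABLE:
    import torchvision  # noqa: F401  # safe to import, requirement satisfied
else:
    raise ModuleNotFoundError(str(_TORCHVISION_AVAILABLE))
```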

tests/tests_pytorch/callbacks/test_gradient_accumulation_scheduler.py (+1, -16)

@@ -20,12 +20,6 @@
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.strategies import DeepSpeedStrategy
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
-
-if _LIGHTNING_COLOSSALAI_AVAILABLE:
-    from lightning_colossalai import ColossalAIStrategy
-else:
-    ColossalAIStrategy = None


 @pytest.mark.parametrize("accumulate_grad_batches", [1, 2, 3])

@@ -94,16 +88,7 @@ def test_invalid_values_for_grad_accum_scheduler(scheduling):
         _ = GradientAccumulationScheduler(scheduling=scheduling)


-@pytest.mark.parametrize(
-    "strategy_class",
-    [
-        pytest.param(
-            ColossalAIStrategy,
-            marks=pytest.mark.skipif(not _LIGHTNING_COLOSSALAI_AVAILABLE, reason="Requires ColossalAI strategy"),
-        ),
-        DeepSpeedStrategy,
-    ],
-)
+@pytest.mark.parametrize("strategy_class", [DeepSpeedStrategy])
 def test_unsupported_strategies(strategy_class):
     """Test that an error is raised for strategies that require the gradient accumulation factor to be fixed."""
     scheduler = GradientAccumulationScheduler({1: 2})

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py (-16)

@@ -59,7 +59,6 @@
 from lightning.pytorch.utilities.imports import (
     _LIGHTNING_HABANA_AVAILABLE,
 )
-from lightning_utilities.core.imports import package_available

 from tests_pytorch.conftest import mock_cuda_count, mock_mps_count, mock_tpu_available, mock_xla_available
 from tests_pytorch.helpers.runif import RunIf

@@ -845,21 +844,6 @@ def get_defaults(cls):
     assert connector_default == trainer_defaults[name]


-@RunIf(min_cuda_gpus=1)  # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
-@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
-@pytest.mark.skipif(not package_available("lightning_colossalai"), reason="Requires Colossal AI Strategy")
-def test_colossalai_external_strategy(monkeypatch):
-    with mock.patch(
-        "lightning.pytorch.trainer.connectors.accelerator_connector._LIGHTNING_COLOSSALAI_AVAILABLE", False
-    ), pytest.raises(ModuleNotFoundError):
-        Trainer(strategy="colossalai")
-
-    from lightning_colossalai import ColossalAIStrategy
-
-    trainer = Trainer(strategy="colossalai", precision="16-mixed")
-    assert isinstance(trainer.strategy, ColossalAIStrategy)
-
-
 class DeviceMock(Mock):
     def __instancecheck__(self, instance):
         return True
