
Commit b3ede34

Merge branch 'master' into delete_data_stuffs
2 parents c6d9726 + 2a827f3 commit b3ede34

File tree

12 files changed (+26, -96 lines)

12 files changed

+26
-96
lines changed

.azure/gpu-tests-pytorch.yml (+1, -8)

@@ -105,16 +105,9 @@ jobs:
       done
     displayName: "Adjust dependencies"

-  - bash: |
-      pip install -q -r .actions/requirements.txt
-      python .actions/assistant.py requirements_prune_pkgs \
-        --packages="[lightning-colossalai]" \
-        --req_files="[requirements/_integrations/strategies.txt]"
-    displayName: "Prune packages" # these have installation issues
-
   - bash: |
       extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
-      pip install -e ".[${extra}dev]" -r requirements/_integrations/strategies.txt pytest-timeout -U --find-links="${TORCH_URL}"
+      pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}"
     displayName: "Install package & dependencies"

   - bash: pip uninstall -y lightning

docs/source-pytorch/common/checkpointing_intermediate.rst (+5, -3)

@@ -167,9 +167,11 @@ In distributed training cases where a model is running across many machines, Lig
     trainer = Trainer(strategy="ddp")
     model = MyLightningModule(hparams)
     trainer.fit(model)
+
     # Saves only on the main process
+    # Handles strategy-specific saving logic like XLA, FSDP, DeepSpeed etc.
     trainer.save_checkpoint("example.ckpt")

-Not using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` can lead to unexpected behavior and potential deadlock. Using other saving functions will result in all devices attempting to save the checkpoint. As a result, we highly recommend using the Trainer's save functionality.
-If using custom saving functions cannot be avoided, we recommend using the :func:`~lightning.pytorch.utilities.rank_zero.rank_zero_only` decorator to ensure saving occurs only on the main process. Note that this will only work if all ranks hold the exact same state and won't work when using
-model parallel distributed strategies such as deepspeed or sharded training.
+
+By using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` instead of ``torch.save``, you make your code agnostic to the distributed training strategy being used.
+It will ensure that checkpoints are saved correctly in a multi-process setting, avoiding race conditions, deadlocks and other common issues that normally require boilerplate code to handle properly.
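
For context, a minimal end-to-end sketch of the pattern the updated docs recommend. It uses the BoringModel demo class (already imported in this commit's tests); the extra Trainer arguments and the checkpoint path are illustrative, not part of the diff.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

if __name__ == "__main__":  # good practice with multi-process launchers
    model = BoringModel()  # stands in for any LightningModule
    trainer = Trainer(strategy="ddp", max_epochs=1, limit_train_batches=4)
    trainer.fit(model)

    # Called on every rank, but only the main process writes the file; the Trainer
    # also applies strategy-specific saving logic (FSDP, DeepSpeed, XLA, ...).
    trainer.save_checkpoint("example.ckpt")
```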

docs/source-pytorch/starter/installation.rst (+6, -4)

@@ -16,7 +16,9 @@ Install lightning inside a virtual env or conda environment with pip

    python -m pip install lightning

---------------
+
+----
+

 ******************
 Install with Conda

@@ -66,17 +68,17 @@ Install future patch releases from the source. Note that the patch release conta
 ^^^^^^^^^^^^^^^^^^^^^^
 Custom PyTorch Version
 ^^^^^^^^^^^^^^^^^^^^^^
-To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.

+To use any PyTorch version visit the `PyTorch Installation Page <https://pytorch.org/get-started/locally/#start-locally>`_.
 You can find the list of supported PyTorch versions in our :ref:`compatibility matrix <versioning:Compatibility matrix>`.

 ----


 *******************************************
-Optimized for ML workflows (lightning Apps)
+Optimized for ML workflows (Lightning Apps)
 *******************************************
-If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized `lightning[apps]` package:
+If you are deploying workflows built with Lightning in production and require fewer dependencies, try using the optimized ``lightning[apps]`` package:

 .. code-block:: bash


requirements/_integrations/strategies.txt (-4)

This file was deleted.

src/lightning/pytorch/callbacks/gradient_accumulation_scheduler.py (+1, -8)

@@ -27,7 +27,6 @@
 import lightning.pytorch as pl
 from lightning.pytorch.callbacks.callback import Callback
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
 from lightning.pytorch.utilities.model_helpers import is_overridden
 from lightning.pytorch.utilities.rank_zero import rank_zero_warn

@@ -125,13 +124,7 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")
         # local import to avoid circular import
         from lightning.pytorch.strategies import DeepSpeedStrategy

-        unsupported_strategies = [DeepSpeedStrategy]
-        if _LIGHTNING_COLOSSALAI_AVAILABLE:
-            from lightning_colossalai import ColossalAIStrategy
-
-            unsupported_strategies.append(ColossalAIStrategy)
-
-        if isinstance(trainer.strategy, tuple(unsupported_strategies)):
+        if isinstance(trainer.strategy, DeepSpeedStrategy):
             raise RuntimeError(
                 f"The `{type(trainer.strategy).__name__}` does not support `accumulate_grad_batches` changing"
                 " between epochs."

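For reference, a small sketch of how the callback touched above is typically configured; the schedule values are arbitrary, and the closing comment reflects the check kept by this diff.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import GradientAccumulationScheduler

# Accumulate 4 batches per optimizer step from epoch 0, then 2 from epoch 8 onward.
scheduler = GradientAccumulationScheduler(scheduling={0: 4, 8: 2})
trainer = Trainer(callbacks=[scheduler])

# Combining this callback with strategy="deepspeed" would hit the RuntimeError above,
# since DeepSpeed requires accumulate_grad_batches to stay fixed between epochs.
```
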
src/lightning/pytorch/callbacks/model_checkpoint.py (+5, -6)

@@ -89,13 +89,12 @@ class ModelCheckpoint(Checkpoint):
             in a deterministic manner. Default: ``None``.
         save_top_k: if ``save_top_k == k``,
             the best k models according to the quantity monitored will be saved.
-            if ``save_top_k == 0``, no models are saved.
-            if ``save_top_k == -1``, all models are saved.
+            If ``save_top_k == 0``, no models are saved.
+            If ``save_top_k == -1``, all models are saved.
             Please note that the monitors are checked every ``every_n_epochs`` epochs.
-            if ``save_top_k >= 2`` and the callback is called multiple
-            times inside an epoch, the name of the saved file will be
-            appended with a version count starting with ``v1``
-            unless ``enable_version_counter`` is set to False.
+            If ``save_top_k >= 2`` and the callback is called multiple times inside an epoch, and the filename remains
+            unchanged, the name of the saved file will be appended with a version count starting with ``v1`` to avoid
+            collisions unless ``enable_version_counter`` is set to False.
         mode: one of {min, max}.
             If ``save_top_k != 0``, the decision to overwrite the current save file is made
             based on either the maximization or the minimization of the monitored quantity.
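
A brief usage sketch of the documented behaviour; the monitored metric name (``val_loss``) is an assumption and must match whatever the LightningModule actually logs.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

# Keep the 3 best checkpoints by validation loss; save_top_k=-1 would keep all,
# save_top_k=0 would disable saving entirely.
checkpoint_cb = ModelCheckpoint(
    monitor="val_loss",  # assumed metric name, must be logged by the model
    mode="min",
    save_top_k=3,
)
trainer = Trainer(callbacks=[checkpoint_cb])
# If the callback fires several times within one epoch and produces the same filename,
# a version suffix (-v1, -v2, ...) is appended unless enable_version_counter=False.
```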

src/lightning/pytorch/core/hooks.py (+4)

@@ -85,6 +85,10 @@ def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -
             batch: The batched data as it is returned by the training DataLoader.
             batch_idx: the index of the batch

+        Note:
+            The value ``outputs["loss"]`` here will be the normalized value w.r.t ``accumulate_grad_batches`` of the
+            loss returned from ``training_step``.
+
         """

     def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
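
To make the new note concrete, a hedged sketch that subclasses the BoringModel demo; it assumes the default automatic-optimization path, where the hook's ``outputs`` carries the loss under the ``"loss"`` key.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel


class LossInspectingModel(BoringModel):
    def on_train_batch_end(self, outputs, batch, batch_idx):
        # With accumulate_grad_batches=4 below, this value is the training_step loss
        # divided by 4, as described in the docstring note added above.
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs
        print(f"batch {batch_idx}: normalized loss = {float(loss):.4f}")


if __name__ == "__main__":
    trainer = Trainer(accumulate_grad_batches=4, max_epochs=1, limit_train_batches=8)
    trainer.fit(LossInspectingModel())
```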

src/lightning/pytorch/core/module.py (+2, -10)

@@ -1285,20 +1285,12 @@ def optimizer_step(

         Examples::

-            # DEFAULT
             def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-                optimizer.step(closure=optimizer_closure)
+                # Add your custom logic to run directly before `optimizer.step()`

-            # Learning rate warm-up
-            def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-                # update params
                 optimizer.step(closure=optimizer_closure)

-                # manually warm up lr without a scheduler
-                if self.trainer.global_step < 500:
-                    lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
-                    for pg in optimizer.param_groups:
-                        pg["lr"] = lr_scale * self.learning_rate
+                # Add your custom logic to run directly after `optimizer.step()`

         """
         optimizer.step(closure=optimizer_closure)
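
As a usage sketch of the slimmed-down docstring example, the override below mirrors the new comments; the subclass name and the suggested before/after actions are illustrative, not part of the diff.

```python
from lightning.pytorch.demos.boring_classes import BoringModel


class CustomStepModel(BoringModel):
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
        # Anything placed here runs directly before `optimizer.step()`,
        # e.g. gradient inspection or manual LR warm-up.
        optimizer.step(closure=optimizer_closure)
        # Anything placed here runs directly after `optimizer.step()`,
        # e.g. updating an EMA copy of the weights.
```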

src/lightning/pytorch/trainer/connectors/accelerator_connector.py (+1, -20)

@@ -62,10 +62,7 @@
 )
 from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import (
-    _LIGHTNING_COLOSSALAI_AVAILABLE,
-    _habana_available_and_importable,
-)
+from lightning.pytorch.utilities.imports import _habana_available_and_importable
 from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn

 log = logging.getLogger(__name__)

@@ -191,9 +188,6 @@ def _check_config_and_set_final_flags(

         self._strategy_flag = strategy

-        if strategy == "colossalai" and not _LIGHTNING_COLOSSALAI_AVAILABLE:
-            raise ModuleNotFoundError(str(_LIGHTNING_COLOSSALAI_AVAILABLE))
-
         if strategy != "auto" and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
             raise ValueError(
                 f"You selected an invalid strategy name: `strategy={strategy!r}`."

@@ -490,12 +484,6 @@ def _check_and_init_precision(self) -> Precision:
         if isinstance(self.accelerator, HPUAccelerator):
             return HPUPrecisionPlugin(self._precision_flag)

-        if _LIGHTNING_COLOSSALAI_AVAILABLE:
-            from lightning_colossalai import ColossalAIPrecisionPlugin, ColossalAIStrategy
-
-            if isinstance(self.strategy, ColossalAIStrategy):
-                return ColossalAIPrecisionPlugin(self._precision_flag)
-
         if isinstance(self.strategy, (SingleDeviceXLAStrategy, XLAStrategy)):
             return XLAPrecision(self._precision_flag)  # type: ignore
         if isinstance(self.strategy, DeepSpeedStrategy):

@@ -648,13 +636,6 @@ def _set_torch_flags(

 def _register_external_accelerators_and_strategies() -> None:
     """Registers all known strategies in other packages."""
-    if _LIGHTNING_COLOSSALAI_AVAILABLE:
-        from lightning_colossalai import ColossalAIStrategy
-
-        # TODO: Prevent registering multiple times
-        if "colossalai" not in StrategyRegistry:
-            ColossalAIStrategy.register_strategies(StrategyRegistry)
-
     if _habana_available_and_importable():
         from lightning_habana import HPUAccelerator, HPUParallelStrategy, SingleHPUStrategy

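
One observable consequence of these removals, as a hedged reading of the remaining code: unless an external plugin registers the name again, ``strategy="colossalai"`` now falls through to the generic invalid-strategy check instead of raising the old ModuleNotFoundError.

```python
from lightning.pytorch import Trainer

try:
    Trainer(strategy="colossalai")
except ValueError as err:
    # e.g. "You selected an invalid strategy name: `strategy='colossalai'` ..."
    print(err)
```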

src/lightning/pytorch/utilities/imports.py (-1)

@@ -28,7 +28,6 @@

 _OMEGACONF_AVAILABLE = package_available("omegaconf")
 _TORCHVISION_AVAILABLE = RequirementCache("torchvision")
-_LIGHTNING_COLOSSALAI_AVAILABLE = RequirementCache("lightning-colossalai")


 @functools.lru_cache(maxsize=128)
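
For background on the flag being deleted here, a short sketch of how these ``RequirementCache`` availability flags are generally consumed (based on the ``lightning_utilities`` API; the torchvision example is illustrative, mirroring the ``raise ModuleNotFoundError(str(...))`` pattern removed from the connector above).

```python
from lightning_utilities.core.imports import RequirementCache

# Truthiness reflects whether the requirement is installed; str() of a failed check
# yields a human-readable message explaining what is missing.
_TORCHVISION_AVAILABLE = RequirementCache("torchvision")

if _TORCHVISION_AVAILABLE:
    import torchvision  # noqa: F401  # safe to import, requirement satisfied
else:
    raise ModuleNotFoundError(str(_TORCHVISION_AVAILABLE))
```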

tests/tests_pytorch/callbacks/test_gradient_accumulation_scheduler.py (+1, -16)

@@ -20,12 +20,6 @@
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.strategies import DeepSpeedStrategy
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
-from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
-
-if _LIGHTNING_COLOSSALAI_AVAILABLE:
-    from lightning_colossalai import ColossalAIStrategy
-else:
-    ColossalAIStrategy = None


 @pytest.mark.parametrize("accumulate_grad_batches", [1, 2, 3])

@@ -94,16 +88,7 @@ def test_invalid_values_for_grad_accum_scheduler(scheduling):
         _ = GradientAccumulationScheduler(scheduling=scheduling)


-@pytest.mark.parametrize(
-    "strategy_class",
-    [
-        pytest.param(
-            ColossalAIStrategy,
-            marks=pytest.mark.skipif(not _LIGHTNING_COLOSSALAI_AVAILABLE, reason="Requires ColossalAI strategy"),
-        ),
-        DeepSpeedStrategy,
-    ],
-)
+@pytest.mark.parametrize("strategy_class", [DeepSpeedStrategy])
 def test_unsupported_strategies(strategy_class):
     """Test that an error is raised for strategies that require the gradient accumulation factor to be fixed."""
     scheduler = GradientAccumulationScheduler({1: 2})

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py (-16)

@@ -59,7 +59,6 @@
 from lightning.pytorch.utilities.imports import (
     _LIGHTNING_HABANA_AVAILABLE,
 )
-from lightning_utilities.core.imports import package_available

 from tests_pytorch.conftest import mock_cuda_count, mock_mps_count, mock_tpu_available, mock_xla_available
 from tests_pytorch.helpers.runif import RunIf

@@ -845,21 +844,6 @@ def get_defaults(cls):
     assert connector_default == trainer_defaults[name]


-@RunIf(min_cuda_gpus=1)  # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
-@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
-@pytest.mark.skipif(not package_available("lightning_colossalai"), reason="Requires Colossal AI Strategy")
-def test_colossalai_external_strategy(monkeypatch):
-    with mock.patch(
-        "lightning.pytorch.trainer.connectors.accelerator_connector._LIGHTNING_COLOSSALAI_AVAILABLE", False
-    ), pytest.raises(ModuleNotFoundError):
-        Trainer(strategy="colossalai")
-
-    from lightning_colossalai import ColossalAIStrategy
-
-    trainer = Trainer(strategy="colossalai", precision="16-mixed")
-    assert isinstance(trainer.strategy, ColossalAIStrategy)
-
-
 class DeviceMock(Mock):
     def __instancecheck__(self, instance):
         return True
