|
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union

 import torch
@@ -45,6 +46,15 @@
 log = logging.getLogger(__name__)


+@dataclass
+class RestartStage:
+    NONE = "none"
+    RESTARTED_ON_EPOCH_START = "restarted_on_epoch_start"
+    RESTARTED_MID_EPOCH = "restarted_mid_epoch"
+    RESTARTED_ON_EPOCH_END = "restarted_on_epoch_end"
+    RESUMED_ON_EPOCH_END = "resumed_on_epoch_end"
+
+
 class _FitLoop(_Loop):
     """This loop is the top-level loop where training starts.

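Note on the new RestartStage container above: its members are plain class-level string constants, so a stage check is just a string comparison and the current stage is cheap to keep on the loop instance. A minimal illustrative sketch of how the constants behave (not part of the diff):

    # Illustrative only: the markers are class-level string constants.
    stage = RestartStage.RESTARTED_MID_EPOCH
    assert stage == "restarted_mid_epoch"
    assert stage != RestartStage.NONE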
|
@@ -97,6 +107,7 @@ def __init__(
         self._combined_loader_states_to_load: List[Dict[str, Any]] = []
         self._data_fetcher: Optional[_DataFetcher] = None
         self._last_train_dl_reload_epoch = float("-inf")
+        self._restart_stage = RestartStage.NONE

     @property
     def total_batch_idx(self) -> int:
|
@@ -204,9 +215,10 @@ def run(self) -> None:
                 self.on_advance_start()
                 self.advance()
                 self.on_advance_end()
-                self._restarting = False
             except StopIteration:
                 break
+            finally:
+                self.on_iteration_done()
         self._restarting = False
         self.on_run_end()

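The run() change moves the per-iteration cleanup into a finally clause: on_iteration_done() now runs on every pass through the loop, including the pass where StopIteration triggers the break, while _restarting is cleared only once after the loop exits. A minimal, self-contained sketch of that control flow (illustrative, not Lightning code):

    # The finally body runs for every pass through the loop, including the one
    # where StopIteration is raised and handled by the break.
    def run_iterations(stop_at: int) -> list:
        finished = []
        i = 0
        while True:
            try:
                if i == stop_at:
                    raise StopIteration
            except StopIteration:
                break
            finally:
                finished.append(i)  # stands in for on_iteration_done()
            i += 1
        return finished

    assert run_iterations(3) == [0, 1, 2, 3]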
|
@@ -302,14 +314,92 @@ def setup_data(self) -> None:
                 category=PossibleUserWarning,
             )

+    @property
+    def restarted_on_epoch_start(self) -> bool:
+        return self._restart_stage == RestartStage.RESTARTED_ON_EPOCH_START
+
+    @property
+    def restarted_mid_epoch(self) -> bool:
+        return self._restart_stage == RestartStage.RESTARTED_MID_EPOCH
+
+    @property
+    def restarted_on_epoch_end(self) -> bool:
+        return self._restart_stage == RestartStage.RESTARTED_ON_EPOCH_END
+
+    @property
+    def resumed_on_epoch_end(self) -> bool:
+        # This case happens when restarting from last without validation at
+        # the end of epoch. In this case self.restarting is False.
+        return self._restart_stage == RestartStage.RESUMED_ON_EPOCH_END
+
+    def update_restart_stage(self) -> None:
+        if (
+            self.restarting
+            and self.epoch_progress.total.started == self.epoch_progress.total.ready - 1
+            and self.epoch_progress.total.processed == self.epoch_progress.total.started
+            and self.epoch_progress.total.completed == self.epoch_progress.total.processed
+        ):
+            self._restart_stage = RestartStage.RESTARTED_ON_EPOCH_START
+        elif (
+            self.restarting
+            and self.epoch_progress.total.started == self.epoch_progress.total.ready
+            and self.epoch_progress.total.processed == self.epoch_progress.total.started - 1
+            and self.epoch_progress.total.completed == self.epoch_progress.total.processed
+        ):
+            self._restart_stage = RestartStage.RESTARTED_MID_EPOCH
+        elif (
+            self.restarting
+            and self.epoch_progress.total.started == self.epoch_progress.total.ready
+            and self.epoch_progress.total.processed == self.epoch_progress.total.started
+            and self.epoch_progress.total.completed == self.epoch_progress.total.processed - 1
+        ):
+            self._restart_stage = RestartStage.RESTARTED_ON_EPOCH_END
+        elif (
+            self._loaded_from_state_dict
+            and self.epoch_progress.total.started == self.epoch_progress.total.ready
+            and self.epoch_progress.total.processed == self.epoch_progress.total.started
+            and self.epoch_progress.total.completed == self.epoch_progress.total.processed - 1
+        ):
+            self._restart_stage = RestartStage.RESUMED_ON_EPOCH_END
+        else:
+            self._restart_stage = RestartStage.NONE
+
+        self.epoch_loop.update_restart_stage()
+
+    def reset_restart_stage(self) -> None:
+        self._restart_stage = RestartStage.NONE
+
     def reset(self) -> None:
         """Resets the internal state of this loop."""
         assert self.trainer.model is not None
         torch.set_grad_enabled(True)

-        if self.restarting:
+        self.update_restart_stage()
+
+        if self.restarted_on_epoch_start:
             self.epoch_progress.reset_on_restart()

+        if self.resumed_on_epoch_end:
+            # when restarting from last without validation at end of epoch,
+            # self.restarting is False but it's still resuming
+            self.epoch_progress.increment_completed()
+
+        if (
+            self.epoch_loop.restarted_on_train_batch_end
+            and self.restarted_mid_epoch
+            and self.epoch_loop.batch_progress.is_last_batch
+        ):
+            self.epoch_progress.increment_processed()
+            self.epoch_progress.increment_completed()
+
+        if (
+            self.epoch_loop.restarted_on_train_batch_end
+            and self.epoch_loop.batch_progress.is_last_batch
+            and not self.restarted_mid_epoch
+            and not self.epoch_loop.val_loop.batch_progress.is_last_batch
+        ):
+            self.epoch_progress.increment_completed()
+
     def on_run_start(self) -> None:
         """Calls the ``on_train_start`` hook."""
         # update the current_epoch in-case of checkpoint reload
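The four branches of update_restart_stage distinguish where the saved epoch_progress.total counters stopped relative to each other. A simplified, illustrative classifier over bare counter values (the classify helper below is an assumption for explanation, not part of the loop) makes the patterns easier to read:

    # ready/started/processed/completed stand for epoch_progress.total.* at restore time.
    def classify(ready, started, processed, completed, restarting, loaded_from_state_dict):
        if restarting and (started, processed, completed) == (ready - 1, ready - 1, ready - 1):
            return "restarted_on_epoch_start"  # epoch was ready but never started
        if restarting and (started, processed, completed) == (ready, ready - 1, ready - 1):
            return "restarted_mid_epoch"       # epoch started but not fully processed
        if restarting and (started, processed, completed) == (ready, ready, ready - 1):
            return "restarted_on_epoch_end"    # processed but not marked completed
        if loaded_from_state_dict and (started, processed, completed) == (ready, ready, ready - 1):
            return "resumed_on_epoch_end"      # same counters, but restarting is False
        return "none"

    assert classify(5, 4, 4, 4, True, True) == "restarted_on_epoch_start"
    assert classify(5, 5, 4, 4, True, True) == "restarted_mid_epoch"
    assert classify(5, 5, 5, 4, True, True) == "restarted_on_epoch_end"
    assert classify(5, 5, 5, 4, False, True) == "resumed_on_epoch_end"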
|
@@ -340,12 +430,14 @@ def on_advance_start(self) -> None:
         for i, dl in enumerate(self._combined_loader.flattened):
             _set_sampler_epoch(dl, self.epoch_progress.current.processed)

-        self.epoch_progress.increment_ready()
+        if not self.restarted_mid_epoch and not self.restarted_on_epoch_end:
+            if not self.restarted_on_epoch_start:
+                self.epoch_progress.increment_ready()

-        call._call_callback_hooks(trainer, "on_train_epoch_start")
-        call._call_lightning_module_hook(trainer, "on_train_epoch_start")
+            call._call_callback_hooks(trainer, "on_train_epoch_start")
+            call._call_lightning_module_hook(trainer, "on_train_epoch_start")

-        self.epoch_progress.increment_started()
+            self.epoch_progress.increment_started()

     def advance(self) -> None:
         """Runs one whole epoch."""
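The gating added to on_advance_start means: on a fresh epoch all three steps run; after a restart on epoch start the ready counter was already bumped before the checkpoint, so only the on_train_epoch_start hooks and the started counter fire; after a mid-epoch or epoch-end restart the epoch had already been started, so none of them are repeated. A small illustrative truth table of which steps run (the helper name is assumed, not from the diff):

    # Which of the three on_advance_start steps run for each restart stage.
    def steps_run(restarted_on_epoch_start, restarted_mid_epoch, restarted_on_epoch_end):
        steps = []
        if not restarted_mid_epoch and not restarted_on_epoch_end:
            if not restarted_on_epoch_start:
                steps.append("increment_ready")
            steps.append("on_train_epoch_start hooks")
            steps.append("increment_started")
        return steps

    assert steps_run(False, False, False) == ["increment_ready", "on_train_epoch_start hooks", "increment_started"]
    assert steps_run(True, False, False) == ["on_train_epoch_start hooks", "increment_started"]
    assert steps_run(False, True, False) == []
    assert steps_run(False, False, True) == []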
|
@@ -379,8 +471,7 @@ def on_advance_end(self) -> None:

         trainer._logger_connector.on_epoch_end()

-        if self.epoch_loop._num_ready_batches_reached():
-            # if we are restarting and the above condition holds, it's because we are reloading an epoch-end checkpoint.
+        if not self.restarting and self.epoch_loop._num_ready_batches_reached():
             # since metric-based schedulers require access to metrics and those are not currently saved in the
             # checkpoint, the plateau schedulers shouldn't be updated
             self.epoch_loop.update_lr_schedulers("epoch", update_plateau_schedulers=not self.restarting)
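The rewritten condition in on_advance_end skips the epoch-level scheduler update altogether when restarting, since the checkpoint does not carry the logged metrics that metric-based schedulers consume. As background, a plateau scheduler cannot be stepped without its monitored value; a minimal plain-PyTorch sketch (illustrative only):

    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=0)

    scheduler.step(0.5)  # fine: the monitored metric is available
    try:
        scheduler.step()  # TypeError: ReduceLROnPlateau.step() requires the metric value
    except TypeError:
        pass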
|
|