torchmetric optimizations (#2943)

czmrand · crand-mbe · Borda · web-flow · commit 656191aa9cc0 · 2025-02-24T22:51:05.000+01:00
* add nan_strategy "disable" to disable nan checks; set the default of `weight` in `MeanMetric.update` to None to prevent a sync event; see https://medium.com/@chaimrand/efficient-metric-collection-in-pytorch-avoiding-the-performance-pitfalls-of-torchmetrics-0dea81413681 for motivation --------- Co-authored-by: Chaim Rand <chaim.rand@mobileye.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Nicki Skafte <skaftenicki@gmail.com> Co-authored-by: Jirka B <j.borovec+github@gmail.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
--
+- Added `disable` option to `nan_strategy` in basic aggregation metrics ([#2943](https://github.com/PyTorchLightning/metrics/pull/2943))
 
 
 ### Changed
diff --git a/src/torchmetrics/aggregation.py b/src/torchmetrics/aggregation.py
@@ -16,6 +16,7 @@
 
 import torch
 from torch import Tensor
+from typing_extensions import Literal
 
 from torchmetrics.metric import Metric
 from torchmetrics.utilities import rank_zero_warn
@@ -38,14 +39,15 @@ class BaseAggregator(Metric):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         state_name: name of the metric state
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     """
 
@@ -57,12 +59,12 @@ def __init__(
         self,
         fn: Union[Callable, str],
         default_value: Union[Tensor, list],
-        nan_strategy: Union[str, float] = "error",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "error",
         state_name: str = "value",
         **kwargs: Any,
     ) -> None:
         super().__init__(**kwargs)
-        allowed_nan_strategy = ("error", "warn", "ignore")
+        allowed_nan_strategy = ("error", "warn", "ignore", "disable")
         if nan_strategy not in allowed_nan_strategy and not isinstance(nan_strategy, float):
             raise ValueError(
                 f"Arg `nan_strategy` should either be a float or one of {allowed_nan_strategy} but got {nan_strategy}."
@@ -81,26 +83,28 @@ def _cast_and_nan_check_input(
         if weight is not None and not isinstance(weight, Tensor):
             weight = torch.as_tensor(weight, dtype=self.dtype, device=self.device)
 
-        nans = torch.isnan(x)
-        if weight is not None:
-            nans_weight = torch.isnan(weight)
+        if self.nan_strategy != "disable":
+            nans = torch.isnan(x)
+            if weight is not None:
+                nans_weight = torch.isnan(weight)
+            else:
+                nans_weight = torch.zeros_like(nans).bool()
+                weight = torch.ones_like(x)
+            if nans.any() or nans_weight.any():
+                if self.nan_strategy == "error":
+                    raise RuntimeError("Encountered `nan` values in tensor")
+                if self.nan_strategy in ("ignore", "warn"):
+                    if self.nan_strategy == "warn":
+                        rank_zero_warn("Encountered `nan` values in tensor. Will be removed.", UserWarning)
+                    x = x[~(nans | nans_weight)]
+                    weight = weight[~(nans | nans_weight)]
+                else:
+                    if not isinstance(self.nan_strategy, float):
+                        raise ValueError(f"`nan_strategy` shall be float but you pass {self.nan_strategy}")
+                    x[nans | nans_weight] = self.nan_strategy
+                    weight[nans | nans_weight] = 1
         else:
-            nans_weight = torch.zeros_like(nans).bool()
             weight = torch.ones_like(x)
-        if nans.any() or nans_weight.any():
-            if self.nan_strategy == "error":
-                raise RuntimeError("Encountered `nan` values in tensor")
-            if self.nan_strategy in ("ignore", "warn"):
-                if self.nan_strategy == "warn":
-                    rank_zero_warn("Encountered `nan` values in tensor. Will be removed.", UserWarning)
-                x = x[~(nans | nans_weight)]
-                weight = weight[~(nans | nans_weight)]
-            else:
-                if not isinstance(self.nan_strategy, float):
-                    raise ValueError(f"`nan_strategy` shall be float but you pass {self.nan_strategy}")
-                x[nans | nans_weight] = self.nan_strategy
-                weight[nans | nans_weight] = self.nan_strategy
-
         return x.to(self.dtype), weight.to(self.dtype)
 
     def update(self, value: Union[float, Tensor]) -> None:
@@ -128,13 +132,14 @@ class MaxMetric(BaseAggregator):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -152,7 +157,7 @@ class MaxMetric(BaseAggregator):
 
     def __init__(
         self,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -233,13 +238,14 @@ class MinMetric(BaseAggregator):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -257,7 +263,7 @@ class MinMetric(BaseAggregator):
 
     def __init__(
         self,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -338,13 +344,14 @@ class SumMetric(BaseAggregator):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -361,7 +368,7 @@ class SumMetric(BaseAggregator):
 
     def __init__(
         self,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -443,13 +450,14 @@ class CatMetric(BaseAggregator):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -466,7 +474,7 @@ class CatMetric(BaseAggregator):
 
     def __init__(
         self,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__("cat", [], nan_strategy, **kwargs)
@@ -505,17 +513,18 @@ class MeanMetric(BaseAggregator):
     - ``agg`` (:class:`~torch.Tensor`): scalar float tensor with aggregated (weighted) mean over all inputs received
 
     Args:
-       nan_strategy: options:
+        nan_strategy: options:
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torchmetrics.aggregation import MeanMetric
@@ -532,7 +541,7 @@ class MeanMetric(BaseAggregator):
 
     def __init__(
         self,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -544,22 +553,24 @@ def __init__(
         )
         self.add_state("weight", default=torch.tensor(0.0, dtype=torch.get_default_dtype()), dist_reduce_fx="sum")
 
-    def update(self, value: Union[float, Tensor], weight: Union[float, Tensor] = 1.0) -> None:
+    def update(self, value: Union[float, Tensor], weight: Union[float, Tensor, None] = None) -> None:
         """Update state with data.
 
         Args:
             value: Either a float or tensor containing data. Additional tensor
                 dimensions will be flattened
             weight: Either a float or tensor containing weights for calculating
                 the average. Shape of weight should be able to broadcast with
-                the shape of `value`. Default to `1.0` corresponding to simple
+                the shape of `value`. Defaults to `None`, which corresponds to a
-                harmonic average.
+                simple (unweighted) arithmetic average.
 
         """
         # broadcast weight to value shape
         if not isinstance(value, Tensor):
             value = torch.as_tensor(value, dtype=self.dtype, device=self.device)
-        if weight is not None and not isinstance(weight, Tensor):
+        if weight is None:
+            weight = torch.ones_like(value)
+        elif not isinstance(weight, Tensor):
             weight = torch.as_tensor(weight, dtype=self.dtype, device=self.device)
         weight = torch.broadcast_to(weight, value.shape)
         value, weight = self._cast_and_nan_check_input(value, weight)
@@ -631,18 +642,18 @@ class RunningMean(Running):
     - ``agg`` (:class:`~torch.Tensor`): scalar float tensor with aggregated sum over all inputs received
 
     Args:
-        window: The size of the running window.
         nan_strategy: options:
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -665,7 +676,7 @@ class RunningMean(Running):
     def __init__(
         self,
         window: int = 5,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(base_metric=MeanMetric(nan_strategy=nan_strategy, **kwargs), window=window)
@@ -693,13 +704,14 @@ class RunningSum(Running):
             - ``'error'``: if any `nan` values are encountered will give a RuntimeError
             - ``'warn'``: if any `nan` values are encountered will give a warning and continue
             - ``'ignore'``: all `nan` values are silently removed
+            - ``'disable'``: disable all `nan` checks
             - a float: if a float is provided will impute any `nan` values with this value
 
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Raises:
         ValueError:
-            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore`` or a float
+            If ``nan_strategy`` is not one of ``error``, ``warn``, ``ignore``, ``disable`` or a float
 
     Example:
         >>> from torch import tensor
@@ -722,7 +734,7 @@ class RunningSum(Running):
     def __init__(
         self,
         window: int = 5,
-        nan_strategy: Union[str, float] = "warn",
+        nan_strategy: Union[Literal["error", "warn", "ignore", "disable"], float] = "warn",
         **kwargs: Any,
     ) -> None:
         super().__init__(base_metric=SumMetric(nan_strategy=nan_strategy, **kwargs), window=window)
diff --git a/tests/unittests/bases/test_aggregation.py b/tests/unittests/bases/test_aggregation.py
@@ -2,6 +2,7 @@
 import pytest
 import torch
 
+from torchmetrics import Metric
 from torchmetrics.aggregation import CatMetric, MaxMetric, MeanMetric, MinMetric, SumMetric
 from torchmetrics.collections import MetricCollection
 from unittests import BATCH_SIZE, NUM_BATCHES
@@ -121,28 +122,35 @@ def test_nan_error(value, nan_strategy, metric_class):
         (MinMetric, 2.0, _CASE_1, 2.0),
         (MinMetric, "ignore", _CASE_2, 1.0),
         (MinMetric, 2.0, _CASE_2, 1.0),
+        (MinMetric, "disable", _CASE_1, torch.tensor(float("nan"))),
         (MaxMetric, "ignore", _CASE_1, -torch.tensor(float("inf"))),
         (MaxMetric, 2.0, _CASE_1, 2.0),
         (MaxMetric, "ignore", _CASE_2, 5.0),
         (MaxMetric, 2.0, _CASE_2, 5.0),
+        (MaxMetric, "disable", _CASE_1, torch.tensor(float("nan"))),
         (SumMetric, "ignore", _CASE_1, 0.0),
         (SumMetric, 2.0, _CASE_1, 10.0),
         (SumMetric, "ignore", _CASE_2, 12.0),
         (SumMetric, 2.0, _CASE_2, 14.0),
+        (SumMetric, "disable", _CASE_1, torch.tensor(float("nan"))),
+        (SumMetric, "disable", _CASE_2, torch.tensor(float("nan"))),
         (MeanMetric, "ignore", _CASE_1, torch.tensor([float("nan")])),
         (MeanMetric, 2.0, _CASE_1, 2.0),
         (MeanMetric, "ignore", _CASE_2, 3.0),
         (MeanMetric, 2.0, _CASE_2, 2.8),
+        (MeanMetric, "disable", _CASE_1, torch.tensor(float("nan"))),
+        (MeanMetric, "disable", _CASE_2, torch.tensor(float("nan"))),
         (CatMetric, "ignore", _CASE_1, []),
         (CatMetric, 2.0, _CASE_1, torch.tensor([2.0, 2.0, 2.0, 2.0, 2.0])),
         (CatMetric, "ignore", _CASE_2, torch.tensor([1.0, 2.0, 4.0, 5.0])),
         (CatMetric, 2.0, _CASE_2, torch.tensor([1.0, 2.0, 2.0, 4.0, 5.0])),
         (CatMetric, "ignore", torch.zeros(5), torch.zeros(5)),
+        (CatMetric, "disable", _CASE_1, _CASE_1),
     ],
 )
 def test_nan_expected(metric_class, nan_strategy, value, expected):
     """Test that nan values are handled correctly."""
-    metric = metric_class(nan_strategy=nan_strategy)
+    metric: Metric = metric_class(nan_strategy=nan_strategy)
     metric.update(value.clone())
     out = metric.compute()
     assert np.allclose(out, expected, equal_nan=True)