Skip to content

Commit 18f4a05

Browse files
authored
Miscellaneous bug fixes (#540)
* Implement ConcatDataset.sample_rate * Update environment_gpu.yml * Implement nx, ny, sample_rate safely in ConcatDataset
1 parent e200409 commit 18f4a05

File tree

3 files changed

+74
-9
lines changed

3 files changed

+74
-9
lines changed

environments/environment_gpu.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,14 +17,14 @@ dependencies:
1717
- numpy<2
1818
- pip
1919
- pre-commit
20-
- pydantic>=2.0.0
20+
- pydantic>=2
2121
- pytest
2222
- pytest-mock
23-
- pytorch
23+
- pytorch::pytorch
2424
# If your GPU isn't being detected, you may need a different version.
2525
# You're going to need to look at Table 3 here:
2626
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions
27-
- pytorch-cuda=12.1 # GPU
27+
- pytorch::pytorch-cuda=12.1 # GPU
2828
- scipy
2929
- semver
3030
- tensorboard

nam/data.py

Lines changed: 39 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -375,6 +375,10 @@ def __len__(self) -> int:
375375
single_pairs = n - self._nx + 1
376376
return single_pairs // self._ny
377377

378+
@property
379+
def nx(self) -> int:
380+
return self._nx
381+
378382
@property
379383
def ny(self) -> int:
380384
return self._ny
@@ -695,6 +699,14 @@ def _validate_preceding_silence(
695699
)
696700

697701

702+
class ConcatDatasetValidationError(ValueError):
703+
"""
704+
Error raised when a ConcatDataset fails validation
705+
"""
706+
707+
pass
708+
709+
698710
class ConcatDataset(AbstractDataset, _InitializableFromConfig):
699711
def __init__(self, datasets: _Sequence[Dataset], flatten=True):
700712
if flatten:
@@ -717,6 +729,21 @@ def __len__(self) -> int:
717729
def datasets(self):
718730
return self._datasets
719731

732+
@property
733+
def nx(self) -> int:
734+
# Validated at initialization
735+
return self.datasets[0].nx
736+
737+
@property
738+
def ny(self) -> int:
739+
# Validated at initialization
740+
return self.datasets[0].ny
741+
742+
@property
743+
def sample_rate(self) -> _Optional[float]:
744+
# This is validated to be consistent across datasets during initialization
745+
return self.datasets[0].sample_rate
746+
720747
@classmethod
721748
def parse_config(cls, config):
722749
init = _dataset_init_registry[config.get("type", "dataset")]
@@ -767,14 +794,20 @@ def _make_lookup(self):
767794

768795
@classmethod
769796
def _validate_datasets(cls, datasets: _Sequence[Dataset]):
797+
# Ensure that a couple attrs are consistent across the sub-datasets.
770798
Reference = _namedtuple("Reference", ("index", "val"))
771-
ref_keys, ref_ny = None, None
799+
references = {name: None for name in ("nx", "ny", "sample_rate")}
772800
for i, d in enumerate(datasets):
773-
ref_ny = Reference(i, d.ny) if ref_ny is None else ref_ny
774-
if d.ny != ref_ny.val:
775-
raise ValueError(
776-
f"Mismatch between ny of datasets {ref_ny.index} ({ref_ny.val}) and {i} ({d.ny})"
777-
)
801+
for name in references.keys():
802+
this_val = getattr(d, name)
803+
if references[name] is None:
804+
references[name] = Reference(i, this_val)
805+
806+
if this_val != references[name].val:
807+
raise ConcatDatasetValidationError(
808+
f"Mismatch between {name} of datasets {references[name].index} "
809+
f"({references[name].val}) and {i} ({this_val})"
810+
)
778811

779812

780813
_dataset_init_registry = {"dataset": Dataset.init_from_config}

tests/test_nam/test_data.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -385,6 +385,38 @@ def test_sample_widths(self, sample_width: int):
385385
assert info.sampwidth == sample_width
386386

387387

388+
class TestConcatDataset(object):
389+
@pytest.mark.parametrize("attrname", ("nx", "ny", "sample_rate"))
390+
def test_valiation_sample_rate_fail(self, attrname: str):
391+
"""
392+
Assert failed validation for datasets with different nx, ny, sample rates
393+
"""
394+
nx, ny, sample_rate = 1, 2, 48_000.0
395+
396+
n1 = 16
397+
ds1_kwargs = dict(
398+
x=torch.zeros((n1,)),
399+
y=torch.zeros((n1,)),
400+
nx=nx,
401+
ny=ny,
402+
sample_rate=sample_rate,
403+
)
404+
ds1 = data.Dataset(**ds1_kwargs)
405+
n2 = 7
406+
ds2_kwargs = dict(
407+
x=torch.zeros((n2,)),
408+
y=torch.zeros((n2,)),
409+
nx=nx,
410+
ny=ny,
411+
sample_rate=sample_rate,
412+
)
413+
# Cause the error by modifying the named attr:
414+
ds2_kwargs[attrname] += 1
415+
ds2 = data.Dataset(**ds2_kwargs)
416+
with pytest.raises(data.ConcatDatasetValidationError):
417+
data.ConcatDataset([ds1, ds2])
418+
419+
388420
def test_audio_mismatch_shapes_in_order():
389421
"""
390422
https://github.com/sdatkinson/neural-amp-modeler/issues/257

0 commit comments

Comments (0)