From b2551d2654f9789acd3e7ca1366b8b973dd10b9a Mon Sep 17 00:00:00 2001
From: Derrick Chambers
Date: Tue, 29 Oct 2024 12:34:36 -0700
Subject: [PATCH] fix febus overlap issue (#449)

* fix febus overlap issue

* fix start/end date

* use uv for pre-commit

* add absolute time test

---------

Co-authored-by: Derrick Chambers
---
 dascore/io/febus/utils.py              | 55 +++++++++++++++++---------
 tests/test_io/test_febus/test_febus.py | 43 ++++++++++++++++++++
 2 files changed, 80 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_io/test_febus/test_febus.py

diff --git a/dascore/io/febus/utils.py b/dascore/io/febus/utils.py
index 40b00414..4d608800 100644
--- a/dascore/io/febus/utils.py
+++ b/dascore/io/febus/utils.py
@@ -97,37 +97,57 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     return out
 
 
+def _get_time_overlap_samples(feb, data_shape):
+    """Determine the number of redundant samples in the time dimension."""
+    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
+    block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
+    # Since the data have overlaps in each block's time dimension, we need to
+    # trim the overlap off the time dimension to avoid having to merge blocks later.
+    # However, sometimes the "BlockOverlap" attribute is wrong, so we calculate it
+    # manually here.
+    expected_samples = int(np.round(block_time / time_step))
+    excess_rows = data_shape[1] - expected_samples
+    assert (
+        excess_rows % 2 == 0
+    ), "excess rows must be even to split the overlap across both ends"
+    return excess_rows
+
+
 def _get_time_coord(feb):
     """Get the time coordinate contained in the febus slice."""
     time = feb.source["time"]
-    # In older version time shape is different, always grap first eleemnt.
+    # In older versions the time shape is different; always grab the first element.
     first_slice = tuple(0 for _ in time.shape)
     t_0 = time[first_slice]
     # Data dimensions are block_index, time, distance
     data_shape = feb.zone[feb.data_name].shape
     n_blocks = data_shape[0]
-    # Since the data have overlaps in each block's time dimension, we need to
-    # trim the overlap off the time dimension to avoid having to merge blocks later.
-    overlap_percentage = _maybe_unpack(feb.zone.attrs.get("BlockOverlap", 0))
-    rows_to_remove = int(np.round(data_shape[1] * overlap_percentage / 100))
-    total_time_rows = (data_shape[1] - 2 * rows_to_remove) * n_blocks
-    # Get spacing between time samples (in s)
+    # Get spacing between time samples (in s) and the total time of each block.
    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    # Get origin info, these are offsets from time for start of block,
-    time_origin = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
-    # Get the start/stop indicies for the zone
+    excess_rows = _get_time_overlap_samples(feb, data_shape)
+    total_time_rows = (data_shape[1] - excess_rows) * n_blocks
+    # Get origin info; these are offsets from time to get to the first sample
+    # of the block. These should always be non-positive.
+    time_offset = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
+    assert time_offset <= 0, "time offset must be non-positive"
+    # Get the start/stop indices for the zone. We assume zones never sub-slice
+    # time (only distance) but assert that here.
     extent = feb.zone.attrs["Extent"]
-    time_ids = (extent[2], extent[3])
+    assert (extent[3] - extent[2] + 1) == data_shape[1], "Can't handle sub time zones"
     # Create time coord
     # Need to account for removing overlap times.
-    total_start = time_origin - rows_to_remove * time_step + time_ids[0] * time_step
+    total_start = t_0 + time_offset + (excess_rows // 2) * time_step
     total_end = total_start + total_time_rows * time_step
     time_coord = get_coord(
-        start=dc.to_datetime64(t_0 + total_start),
-        stop=dc.to_datetime64(t_0 + total_end),
+        start=dc.to_datetime64(total_start),
+        stop=dc.to_datetime64(total_end),
         step=dc.to_timedelta64(time_step),
     )
-    return time_coord.change_length(total_time_rows)
+    # Note: we have found some files in which the sampling rate is 1/3e-4.
+    # Because we use datetime64, we lose some precision, which has caused
+    # slight differences in the shape of the patch.
+    out = time_coord.change_length(total_time_rows)
+    return out
 
 
 def _get_distance_coord(feb):
@@ -221,9 +241,8 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     dist_coord, time_coord = cm.coord_map["distance"], cm.coord_map["time"]
     data = febus.zone[febus.data_name]
     data_shape = data.shape
-    overlap_percentage = _maybe_unpack(febus.zone.attrs.get("BlockOverlap", 0))
-    skip_rows = int(np.round(overlap_percentage / 100 * data_shape[1]))
-    # Need to handle case where skip_rows == 0
+    skip_rows = _get_time_overlap_samples(febus, data_shape) // 2
+    # Need to handle the case where skip_rows == 0.
     data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
     total_slice = list(broadcast_for_index(3, 1, data_slice))
     total_time_rows = data_shape[1] - 2 * skip_rows
diff --git a/tests/test_io/test_febus/test_febus.py b/tests/test_io/test_febus/test_febus.py
new file mode 100644
index 00000000..40a24511
--- /dev/null
+++ b/tests/test_io/test_febus/test_febus.py
@@ -0,0 +1,43 @@
+"""
+Febus specific tests.
+"""
+
+import h5py
+import numpy as np
+import pytest
+
+from dascore.io.febus import Febus2
+from dascore.io.febus.utils import _flatten_febus_info
+from dascore.utils.downloader import fetch
+from dascore.utils.time import to_float
+
+
+class TestFebus:
+    """Special test cases for febus."""
+
+    @pytest.fixture(scope="class")
+    def febus_path(self):
+        """Return the path to a test febus file."""
+        return fetch("febus_1.h5")
+
+    def test_time_coords_consistent_with_metadata(self, febus_path):
+        """
+        Ensure the time coords returned have the same length as
+        the metadata indicates.
+        """
+        patch = Febus2().read(febus_path)[0]
+        coords = patch.coords
+        time = coords.get_coord("time")
+        time_span = to_float((time.max() - time.min()) + time.step)
+
+        with h5py.File(febus_path, "r") as f:
+            feb = _flatten_febus_info(f)[0]
+            # First check the total time extent.
+            n_blocks = feb.zone[feb.data_name].shape[0]
+            block_time = 1 / (feb.zone.attrs["BlockRate"] / 1_000)
+            expected_time_span = block_time * n_blocks
+            assert np.isclose(expected_time_span, time_span)
+            # Then check the absolute start time.
+            time_offset = feb.zone.attrs["Origin"][1] / 1_000
+            time_start = feb.source["time"][0] + time_offset
+            assert np.isclose(to_float(time.min()), time_start)
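
For reviewers, a minimal standalone sketch of the arithmetic the patched helpers
perform, using made-up numbers. All values below (spacing_ms, block_rate,
origin_ms, data_shape, t_0) are hypothetical stand-ins for what the real code
reads from zone.attrs and source["time"]; only the formulas mirror the patch.

import numpy as np

# Hypothetical stand-ins for Febus HDF5 metadata (real code reads zone.attrs).
spacing_ms = 2.0             # zone.attrs["Spacing"][1]: time step, in ms
block_rate = 1_000.0         # zone.attrs["BlockRate"]: divided by 1_000 gives blocks/s
origin_ms = -50.0            # zone.attrs["Origin"][1]: offset to a block's first sample, in ms
data_shape = (10, 550, 300)  # (block, time, distance), as in feb.zone[feb.data_name].shape
t_0 = 1_700_000_000.0        # hypothetical start time (s), as read from source["time"]

# Overlap detection, as in _get_time_overlap_samples: compare the number of time
# rows each block actually has to the number its duration implies.
time_step = spacing_ms / 1_000                            # 0.002 s between samples
block_time = 1 / (block_rate / 1_000)                     # 1.0 s of unique data per block
expected_samples = int(np.round(block_time / time_step))  # 500 unique rows per block
excess_rows = data_shape[1] - expected_samples            # 50 redundant rows per block
assert excess_rows % 2 == 0, "overlap must split evenly between both block ends"

# Absolute start time, as in _get_time_coord: skip half the excess rows at the
# front of the first block and apply the (non-positive) origin offset.
skip_rows = excess_rows // 2                              # 25 rows trimmed from each end
time_offset = origin_ms / 1_000                           # -0.05 s
total_start = t_0 + time_offset + skip_rows * time_step   # 25 * 0.002 s cancels -0.05 s here
total_time_rows = (data_shape[1] - excess_rows) * data_shape[0]  # 5000 rows after trimming
print(total_start, total_time_rows)

This sketch shows the key change of the patch: the overlap falls out of the
block duration and sample spacing alone, so the sometimes-incorrect
"BlockOverlap" attribute is no longer consulted.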