From b2551d2654f9789acd3e7ca1366b8b973dd10b9a Mon Sep 17 00:00:00 2001
From: Derrick Chambers
Date: Tue, 29 Oct 2024 12:34:36 -0700
Subject: [PATCH] fix febus overlap issue (#449)

* fix febus overlap issue

* fix start/end date

* use uv for pre-commit

* add absolute time test

---------

Co-authored-by: Derrick Chambers
---
 dascore/io/febus/utils.py              | 55 +++++++++++++++++---------
 tests/test_io/test_febus/test_febus.py | 43 ++++++++++++++++++++
 2 files changed, 80 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_io/test_febus/test_febus.py

diff --git a/dascore/io/febus/utils.py b/dascore/io/febus/utils.py
index 40b00414..4d608800 100644
--- a/dascore/io/febus/utils.py
+++ b/dascore/io/febus/utils.py
@@ -97,37 +97,57 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     return out
 
 
+def _get_time_overlap_samples(feb, data_shape):
+    """Determine the number of redundant samples in the time dimension."""
+    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
+    block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
+    # Since the data have overlaps in each block's time dimension, we need to
+    # trim the overlap off the time dimension to avoid having to merge blocks later.
+    # However, sometimes the "BlockOverlap" attribute is wrong, so we calculate it
+    # manually here.
+    expected_samples = int(np.round(block_time / time_step))
+    excess_rows = data_shape[1] - expected_samples
+    assert (
+        excess_rows % 2 == 0
+    ), "excess rows must be even to split the overlap across both ends"
+    return excess_rows
+
+
 def _get_time_coord(feb):
     """Get the time coordinate contained in the febus slice."""
     time = feb.source["time"]
-    # In older version time shape is different, always grap first eleemnt.
+    # In older versions the time shape is different; always grab the first element.
     first_slice = tuple(0 for _ in time.shape)
     t_0 = time[first_slice]
     # Data dimensions are block_index, time, distance
     data_shape = feb.zone[feb.data_name].shape
     n_blocks = data_shape[0]
-    # Since the data have overlaps in each block's time dimension, we need to
-    # trim the overlap off the time dimension to avoid having to merge blocks later.
-    overlap_percentage = _maybe_unpack(feb.zone.attrs.get("BlockOverlap", 0))
-    rows_to_remove = int(np.round(data_shape[1] * overlap_percentage / 100))
-    total_time_rows = (data_shape[1] - 2 * rows_to_remove) * n_blocks
-    # Get spacing between time samples (in s)
+    # Get spacing between time samples (in s) and the total time of each block.
    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    # Get origin info, these are offsets from time for start of block,
-    time_origin = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
-    # Get the start/stop indicies for the zone
+    excess_rows = _get_time_overlap_samples(feb, data_shape)
+    total_time_rows = (data_shape[1] - excess_rows) * n_blocks
+    # Get origin info; these are offsets from time to get to the first sample
+    # of the block. These should always be non-positive.
+    time_offset = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
+    assert time_offset <= 0, "time offset must be non-positive"
+    # Get the start/stop indices for the zone. We assume zones never sub-slice
+    # time (only distance) but assert that here.
     extent = feb.zone.attrs["Extent"]
-    time_ids = (extent[2], extent[3])
+    assert (extent[3] - extent[2] + 1) == data_shape[1], "Can't handle sub time zones"
     # Create time coord
     # Need to account for removing overlap times.
-    total_start = time_origin - rows_to_remove * time_step + time_ids[0] * time_step
+    total_start = t_0 + time_offset + (excess_rows // 2) * time_step
     total_end = total_start + total_time_rows * time_step
     time_coord = get_coord(
-        start=dc.to_datetime64(t_0 + total_start),
-        stop=dc.to_datetime64(t_0 + total_end),
+        start=dc.to_datetime64(total_start),
+        stop=dc.to_datetime64(total_end),
         step=dc.to_timedelta64(time_step),
     )
-    return time_coord.change_length(total_time_rows)
+    # Note: we have found some files in which the sampling rate is 1/3e-4.
+    # Because we use datetime64, we lose some precision, which has caused
+    # slight differences in the shape of the patch.
+    out = time_coord.change_length(total_time_rows)
+    return out
 
 
 def _get_distance_coord(feb):
@@ -221,9 +241,8 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     dist_coord, time_coord = cm.coord_map["distance"], cm.coord_map["time"]
     data = febus.zone[febus.data_name]
     data_shape = data.shape
-    overlap_percentage = _maybe_unpack(febus.zone.attrs.get("BlockOverlap", 0))
-    skip_rows = int(np.round(overlap_percentage / 100 * data_shape[1]))
-    # Need to handle case where skip_rows == 0
+    skip_rows = _get_time_overlap_samples(febus, data_shape) // 2
+    # Need to handle the case where skip_rows == 0.
     data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
     total_slice = list(broadcast_for_index(3, 1, data_slice))
     total_time_rows = data_shape[1] - 2 * skip_rows
diff --git a/tests/test_io/test_febus/test_febus.py b/tests/test_io/test_febus/test_febus.py
new file mode 100644
index 00000000..40a24511
--- /dev/null
+++ b/tests/test_io/test_febus/test_febus.py
@@ -0,0 +1,43 @@
+"""
+Febus specific tests.
+"""
+
+import h5py
+import numpy as np
+import pytest
+
+from dascore.io.febus import Febus2
+from dascore.io.febus.utils import _flatten_febus_info
+from dascore.utils.downloader import fetch
+from dascore.utils.time import to_float
+
+
+class TestFebus:
+    """Special test cases for febus."""
+
+    @pytest.fixture(scope="class")
+    def febus_path(self):
+        """Return the path to a test febus file."""
+        return fetch("febus_1.h5")
+
+    def test_time_coords_consistent_with_metadata(self, febus_path):
+        """
+        Ensure the time coords returned have the same length as
+        the metadata indicates.
+        """
+        patch = Febus2().read(febus_path)[0]
+        coords = patch.coords
+        time = coords.get_coord("time")
+        time_span = to_float((time.max() - time.min()) + time.step)
+
+        with h5py.File(febus_path, "r") as f:
+            feb = _flatten_febus_info(f)[0]
+            # First check the total time extent.
+            n_blocks = feb.zone[feb.data_name].shape[0]
+            block_time = 1 / (feb.zone.attrs["BlockRate"] / 1_000)
+            expected_time_span = block_time * n_blocks
+            assert np.isclose(expected_time_span, time_span)
+            # Then check the absolute start time.
+            time_offset = feb.zone.attrs["Origin"][1] / 1_000
+            time_start = feb.source["time"][0] + time_offset
+            assert np.isclose(to_float(time.min()), time_start)
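
For reviewers, a minimal standalone sketch of the arithmetic the patched helpers
perform, using made-up numbers. All values below (spacing_ms, block_rate,
origin_ms, data_shape, t_0) are hypothetical stand-ins for what the real code
reads from zone.attrs and source["time"]; only the formulas mirror the patch.

import numpy as np

# Hypothetical stand-ins for Febus HDF5 metadata (real code reads zone.attrs).
spacing_ms = 2.0             # zone.attrs["Spacing"][1]: time step, in ms
block_rate = 1_000.0         # zone.attrs["BlockRate"]: divided by 1_000 gives blocks/s
origin_ms = -50.0            # zone.attrs["Origin"][1]: offset to a block's first sample, in ms
data_shape = (10, 550, 300)  # (block, time, distance), as in feb.zone[feb.data_name].shape
t_0 = 1_700_000_000.0        # hypothetical start time (s), as read from source["time"]

# Overlap detection, as in _get_time_overlap_samples: compare the number of time
# rows each block actually has to the number its duration implies.
time_step = spacing_ms / 1_000                            # 0.002 s between samples
block_time = 1 / (block_rate / 1_000)                     # 1.0 s of unique data per block
expected_samples = int(np.round(block_time / time_step))  # 500 unique rows per block
excess_rows = data_shape[1] - expected_samples            # 50 redundant rows per block
assert excess_rows % 2 == 0, "overlap must split evenly between both block ends"

# Absolute start time, as in _get_time_coord: skip half the excess rows at the
# front of the first block and apply the (non-positive) origin offset.
skip_rows = excess_rows // 2                              # 25 rows trimmed from each end
time_offset = origin_ms / 1_000                           # -0.05 s
total_start = t_0 + time_offset + skip_rows * time_step   # 25 * 0.002 s cancels -0.05 s here
total_time_rows = (data_shape[1] - excess_rows) * data_shape[0]  # 5000 rows after trimming
print(total_start, total_time_rows)

This sketch shows the key change of the patch: the overlap falls out of the
block duration and sample spacing alone, so the sometimes-incorrect
"BlockOverlap" attribute is no longer consulted.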