fix febus overlap issue (#449)
* fix febus overlap issue

* fix start/end date

* use uv for pre-commit

* add absolute time test

---------

Co-authored-by: Derrick Chambers <derrickchambers@lightwave.mines.edu>
d-chambers and Derrick Chambers authored Oct 29, 2024
1 parent 7649ac4 commit b2551d2
Showing 2 changed files with 80 additions and 18 deletions.
55 changes: 37 additions & 18 deletions dascore/io/febus/utils.py
@@ -97,37 +97,57 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     return out
 
 
+def _get_time_overlap_samples(feb, data_shape):
+    """Determine the number of redundant samples in the time dimension."""
+    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
+    block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
+    # Since the data have overlaps in each block's time dimension, we need to
+    # trim the overlap off the time dimension to avoid having to merge blocks later.
+    # However, sometimes the "BlockOverlap" is wrong, so we calculate it
+    # manually here.
+    expected_samples = int(np.round(block_time / time_step))
+    excess_rows = data_shape[1] - expected_samples
+    assert (
+        excess_rows % 2 == 0
+    ), "excess rows must be symmetric to distribute on both ends"
+    return excess_rows
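To make the overlap arithmetic concrete, here is a small worked example with invented numbers (the `Spacing`, `BlockRate`, and row-count values are hypothetical, not read from a real Febus file):

import numpy as np

# Hypothetical attribute values, for illustration only.
time_step = 1.0 / 1_000           # Spacing[1] of 1 ms, converted to s
block_time = 1 / (1_000 / 1_000)  # BlockRate of 1000 -> 1 s per block
n_time_rows = 1_200               # rows actually stored per block (data_shape[1])

expected_samples = int(np.round(block_time / time_step))  # 1000 non-redundant rows
excess_rows = n_time_rows - expected_samples              # 200 redundant rows
assert excess_rows % 2 == 0
print(excess_rows // 2)  # 100 rows to trim from each end of every block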


 def _get_time_coord(feb):
     """Get the time coordinate contained in the febus slice."""
     time = feb.source["time"]
-    # In older version time shape is different, always grap first eleemnt.
+    # In older versions the time shape is different; always grab first element.
     first_slice = tuple(0 for _ in time.shape)
     t_0 = time[first_slice]
     # Data dimensions are block_index, time, distance
     data_shape = feb.zone[feb.data_name].shape
     n_blocks = data_shape[0]
-    # Since the data have overlaps in each block's time dimension, we need to
-    # trim the overlap off the time dimension to avoid having to merge blocks later.
-    overlap_percentage = _maybe_unpack(feb.zone.attrs.get("BlockOverlap", 0))
-    rows_to_remove = int(np.round(data_shape[1] * overlap_percentage / 100))
-    total_time_rows = (data_shape[1] - 2 * rows_to_remove) * n_blocks
-    # Get spacing between time samples (in s)
+    # Get spacing between time samples (in s) and the total time of each block.
     time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    # Get origin info, these are offsets from time for start of block,
-    time_origin = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
-    # Get the start/stop indicies for the zone
+    excess_rows = _get_time_overlap_samples(feb, data_shape)
+    total_time_rows = (data_shape[1] - excess_rows) * n_blocks
+    # Get origin info; these are offsets from time to get to the first sample
+    # of the block. These should always be non-positive.
+    time_offset = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
+    assert time_offset <= 0, "time offset must be non positive"
+    # Get the start/stop indices for the zone. We assume zones never sub-slice
+    # time (only distance) but assert that here.
     extent = feb.zone.attrs["Extent"]
     time_ids = (extent[2], extent[3])
+    assert (extent[3] - extent[2] + 1) == data_shape[1], "Cant handle sub time zones"
     # Create time coord
     # Need to account for removing overlap times.
-    total_start = time_origin - rows_to_remove * time_step + time_ids[0] * time_step
+    total_start = t_0 + time_offset + (excess_rows // 2) * time_step
     total_end = total_start + total_time_rows * time_step
     time_coord = get_coord(
-        start=dc.to_datetime64(t_0 + total_start),
-        stop=dc.to_datetime64(t_0 + total_end),
+        start=dc.to_datetime64(total_start),
+        stop=dc.to_datetime64(total_end),
         step=dc.to_timedelta64(time_step),
     )
-    return time_coord.change_length(total_time_rows)
+    # Note: we have found some files in which the sampling rate is 1/3e-4;
+    # because we use datetime64 we lose some precision, which has caused
+    # slight differences in the shape of the patch.
+    out = time_coord.change_length(total_time_rows)
+    return out
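Continuing with hypothetical numbers, the absolute-time bookkeeping above works out like this (`t_0` and the `Origin` offset are invented values, for illustration only):

# Hypothetical values, not taken from a real file.
t_0 = 1_700_000_000.0  # first entry of the source 'time' dataset (s since epoch)
time_offset = -0.5     # Origin[1] / 1_000; must be non-positive
time_step = 1.0 / 1_000
excess_rows, n_blocks, rows_per_block = 200, 5, 1_200

total_time_rows = (rows_per_block - excess_rows) * n_blocks       # 5000 samples
total_start = t_0 + time_offset + (excess_rows // 2) * time_step  # 1699999999.6
total_end = total_start + total_time_rows * time_step             # 1700000004.6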


 def _get_distance_coord(feb):

@@ -221,9 +241,8 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     dist_coord, time_coord = cm.coord_map["distance"], cm.coord_map["time"]
     data = febus.zone[febus.data_name]
     data_shape = data.shape
-    overlap_percentage = _maybe_unpack(febus.zone.attrs.get("BlockOverlap", 0))
-    skip_rows = int(np.round(overlap_percentage / 100 * data_shape[1]))
-    # Need to handle case where skip_rows == 0
+    skip_rows = _get_time_overlap_samples(febus, data_shape) // 2
+    # Need to handle case where excess_rows == 0
     data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
     total_slice = list(broadcast_for_index(3, 1, data_slice))
     total_time_rows = data_shape[1] - 2 * skip_rows
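One subtlety worth spelling out: the `-skip_rows if skip_rows else None` guard exists because `slice(0, -0)` is just `slice(0, 0)`, which selects nothing. A minimal demonstration:

import numpy as np

rows = np.arange(10)
for skip_rows in (2, 0):
    data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
    print(skip_rows, rows[data_slice])
# 2 -> [2 3 4 5 6 7]          (trims both ends)
# 0 -> [0 1 2 3 4 5 6 7 8 9]  (keeps everything; slice(0, -0) would be empty)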
43 changes: 43 additions & 0 deletions tests/test_io/test_febus/test_febus.py
@@ -0,0 +1,43 @@
"""
Febus specific tests.
"""

import h5py
import numpy as np
import pytest

from dascore.io.febus import Febus2
from dascore.io.febus.utils import _flatten_febus_info
from dascore.utils.downloader import fetch
from dascore.utils.time import to_float


class TestFebus:
"""Special test cases for febus."""

@pytest.fixture(scope="class")
def febus_path(self):
"""Return the path to a test febus file."""
return fetch("febus_1.h5")

def test_time_coords_consistent_with_metadata(self, febus_path):
"""
Ensure the time coords returned have the same length as
metadata indicates.
"""
patch = Febus2().read(febus_path)[0]
coords = patch.coords
time = coords.get_coord("time")
time_span = to_float((time.max() - time.min()) + time.step)

with h5py.File(febus_path, "r") as f:
feb = _flatten_febus_info(f)[0]
# First check total time extent
n_blocks = feb.zone[feb.data_name].shape[0]
block_time = 1 / (feb.zone.attrs["BlockRate"] / 1_000)
expected_time_span = block_time * n_blocks
assert np.isclose(expected_time_span, time_span)
# Then check absolute time
time_offset = feb.zone.attrs["Origin"][1] / 1_000
time_start = feb.source["time"][0] + time_offset
assert np.isclose(to_float(time.min()), time_start)
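The `np.isclose` checks above leave headroom for the datetime64 precision issue noted in `_get_time_coord`. A quick sketch of that rounding, using a hypothetical 1/3 ms step (not necessarily the rate in `febus_1.h5`):

import numpy as np

# A step of 1/3 ms has no exact nanosecond representation.
step_s = (1 / 3) * 1e-3
step_td = np.timedelta64(int(round(step_s * 1e9)), "ns")  # rounds to 333333 ns
n_samples = 3_000
exact_span = step_s * n_samples                                   # ~1.0 s
datetime64_span = (step_td * n_samples) / np.timedelta64(1, "s")  # 0.999999 s
print(exact_span - datetime64_span)  # ~1e-6 s of accumulated drift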
