Skip to content

Commit

Permalink
Merge pull request #19 from xdas-dev/fix/febus-overlap
Browse files Browse the repository at this point in the history
Fix: Dealing with the various version of the Febus Format
  • Loading branch information
atrabattoni authored Sep 18, 2024
2 parents 0249f19 + c6b0f35 commit 88fe71c
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 17 deletions.
5 changes: 5 additions & 0 deletions docs/user-guide/data-formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ The formats that are currently implemented are: ASN, FEBUS, OPTASENSE, SINTELA a
| SINTELA | `"sintela"` |
| TERRA15 | `"terra15"` |

```{warning}
Due to poor documentation of the various versions of the Febus format, it is recommended to manually provide the required trimming and the position of the timestamps within each block. For example, to trim 100 samples on both sides of each block and to set the timestamp location at the center of the block for a block of 2000 samples:
`xdas.open_dataarray("path.h5", engine="febus", overlaps=(100, 100), offset=1000)`
```

## Extending *xdas* with your file format

*xdas* insists on its extensibility, the power is in the hands of the users. Extending *xdas* usually consists of writing few-line-of-code-long functions. The process consists in dealing with the two main aspects of a {py:class}`xarray.DataArray`: unpacking the data and coordinates objects, eventually processing them and packing them back into a Database object.
Expand Down
52 changes: 41 additions & 11 deletions xdas/core/routines.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ def open_mfdatacollection(


def open_mfdatatree(
paths, dim="first", tolerance=None, squeeze=False, engine=None, verbose=False
paths,
dim="first",
tolerance=None,
squeeze=False,
engine=None,
verbose=False,
**kwargs,
):
"""
Open a directory tree structure as a data collection.
Expand Down Expand Up @@ -118,6 +124,8 @@ def open_mfdatatree(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
Expand Down Expand Up @@ -180,11 +188,18 @@ def open_mfdatatree(
bag = bag[match.group(field)]
bag.append(fname)

return collect(tree, fields, dim, tolerance, squeeze, engine, verbose)
return collect(tree, fields, dim, tolerance, squeeze, engine, verbose, **kwargs)


def collect(
tree, fields, dim="first", tolerance=None, squeeze=False, engine=None, verbose=False
tree,
fields,
dim="first",
tolerance=None,
squeeze=False,
engine=None,
verbose=False,
**kwargs,
):
"""
Collects the data from a tree of paths using `fields` as level names.
Expand All @@ -207,6 +222,8 @@ def collect(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
Expand All @@ -219,7 +236,9 @@ def collect(
collection = DataCollection({}, name=name)
for key, value in tree.items():
if isinstance(value, list):
dc = open_mfdataarray(value, dim, tolerance, squeeze, engine, verbose)
dc = open_mfdataarray(
value, dim, tolerance, squeeze, engine, verbose, **kwargs
)
dc.name = fields[0]
collection[key] = dc
else:
Expand All @@ -238,14 +257,20 @@ def defaulttree(depth):


def open_mfdataarray(
paths, dim="first", tolerance=None, squeeze=True, engine=None, verbose=False
paths,
dim="first",
tolerance=None,
squeeze=True,
engine=None,
verbose=False,
**kwargs,
):
"""
Open a multiple file dataset.
Each file described by `path` will be opened as a data array. The data arrays are
then combined along the `dim` dimension using `combine_by_coords`. If the
cooridnates of the data arrays are not compatible, the resulting object will be
coordinates of the data arrays are not compatible, the resulting object will be
split into a sequence of data arrays.
Parameters
Expand All @@ -264,11 +289,13 @@ def open_mfdataarray(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
DataArray or DataSequence
The dataarray containing all files data. If different acquisitions are found,
The data array containing all files data. If different acquisitions are found,
a DataSequence is returned.
Raises
Expand All @@ -295,7 +322,8 @@ def open_mfdataarray(
)
with ProcessPoolExecutor() as executor:
futures = [
executor.submit(open_dataarray, path, engine=engine) for path in paths
executor.submit(open_dataarray, path, engine=engine, **kwargs)
for path in paths
]
if verbose:
iterator = tqdm(
Expand All @@ -322,6 +350,8 @@ def open_dataarray(fname, group=None, engine=None, **kwargs):
to the root of the file.
engine: str of callable, optional
The type of file to open or a read function. Default to xdas netcdf format.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
Expand All @@ -341,14 +371,14 @@ def open_dataarray(fname, group=None, engine=None, **kwargs):
if not os.path.exists(fname):
raise FileNotFoundError("no file to open")
if engine is None:
return DataArray.from_netcdf(fname, group=group, **kwargs)
return DataArray.from_netcdf(fname, group=group)
elif callable(engine):
return engine(fname)
return engine(fname, **kwargs)
elif isinstance(engine, str):
from .. import io

module = getattr(io, engine)
return module.read(fname)
return module.read(fname, **kwargs)
else:
raise ValueError("engine not recognized")

Expand Down
78 changes: 72 additions & 6 deletions xdas/io/febus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import h5py
import numpy as np

Expand All @@ -6,22 +8,85 @@
from ..virtual import VirtualSource


def read(fname):
"""Open a febus file into a xdas DataArray"""
def read(fname, overlaps=None, offset=None):
"""
Open a Febus file into a xdas DataArray object.
The Febus file format contains a 3D array which is a stack of 2D (time, distance)
chunks of data that overlaps with each other. The overlaps must be trimmed and the
chunks concatenated to form a seamless dataset. Each chunk is associated with a
timestamp that is located at a fixed offset from the beginning of the chunk.
Because of poor documentation of the evolution of the Febus file format, it is
recommended to manually specify the overlap and offset parameters. If not provided,
the function will attempt to determine the correct values at your own risk.
Parameters:
-----------
fname : str
The filename of the Febus file to read.
overlaps : tuple of int, optional
A tuple specifying the overlap in number of samples to trim on both sides of each
chunk of the data. If not provided, the function will attempt to determine the
correct overlap at your own risk.
offset : int, optional
The location of the timestamp within each block given as the number of samples
from the beginning. If not provided, the function will attempt to determine the
correct offset at your own risk.
Returns:
--------
DataArray
A data array containing the data from the Febus file.
"""
with h5py.File(fname, "r") as file:
(device_name,) = list(file.keys())
source = file[device_name]["Source1"]
times = np.asarray(source["time"])
zone = source["Zone1"]
if "BlockRate" in zone.attrs:
blockrate = zone.attrs["BlockRate"][0] / 1000.0
elif "FreqRes" in zone.attrs:
blockrate = zone.attrs["FreqRes"][0] / 1000.0
else:
raise KeyError("Could not find the block size, please check file header")
(name,) = list(zone.keys())
chunks = VirtualSource(zone[name])
delta = [zone.attrs["Spacing"][1] / 1000.0, zone.attrs["Spacing"][0]]
delta = (zone.attrs["Spacing"][1] / 1000.0, zone.attrs["Spacing"][0])
name = "".join(["_" + c.lower() if c.isupper() else c for c in name]).lstrip("_")
noverlap = chunks.shape[1] // 4
chunks = chunks[:, noverlap:-noverlap, :]
times = times + noverlap * delta[0]

match overlaps:
case None:
warnings.warn(
"No overlap specified, Xdas will try its best to find the correct trimming"
)
noverlap = chunks.shape[1] - round((1 / blockrate) / delta[0])
before = noverlap // 2
after = noverlap - before
overlaps = (before, after)
case (int(), int()):
pass
case _:
raise ValueError("overlaps must be a integer or a tuple of two integers")

match offset:
case None:
warnings.warn(
"No offset specified, Xdas will try its best to place the timestamps"
)
offset = chunks.shape[1] // 2
case int():
pass
case _:
raise ValueError("offset must be an integer")

chunks = chunks[:, overlaps[0] : -overlaps[-1], :]
times = times + (overlaps[0] - offset) * delta[0]

dt, dx = delta
_, nt, nx = chunks.shape

dc = []
for t0, chunk in zip(times, chunks):
time = {
Expand All @@ -33,4 +98,5 @@ def read(fname):
distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]}
da = DataArray(chunk, {"time": time, "distance": distance}, name=name)
dc.append(da)

return concatenate(dc, "time")

0 comments on commit 88fe71c

Please sign in to comment.