Skip to content

Commit

Permalink
Merge pull request #19 from xdas-dev/fix/febus-overlap
Browse files Browse the repository at this point in the history
Fix: Dealing with the various version of the Febus Format
  • Loading branch information
atrabattoni authored Sep 18, 2024
2 parents 0249f19 + c6b0f35 commit 88fe71c
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 17 deletions.
5 changes: 5 additions & 0 deletions docs/user-guide/data-formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ The formats that are currently implemented are: ASN, FEBUS, OPTASENSE, SINTELA a
| SINTELA | `"sintela"` |
| TERRA15 | `"terra15"` |

```{warning}
Due to poor documentation of the various versions of the Febus format, it is recommended to manually provide the required trimming and the position of the timestamps within each block. For example, to trim 100 samples on both sides of each block and to set the timestamp location at the center of the block for a block of 2000 samples:
`xdas.open_dataarray("path.h5", engine="febus", overlaps=(100, 100), offset=1000)`
```

## Extending *xdas* with your file format

*xdas* insists on its extensibility, the power is in the hands of the users. Extending *xdas* usually consists of writing few-line-of-code-long functions. The process consists in dealing with the two main aspects of a {py:class}`xarray.DataArray`: unpacking the data and coordinates objects, eventually processing them and packing them back into a Database object.
Expand Down
52 changes: 41 additions & 11 deletions xdas/core/routines.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ def open_mfdatacollection(


def open_mfdatatree(
paths, dim="first", tolerance=None, squeeze=False, engine=None, verbose=False
paths,
dim="first",
tolerance=None,
squeeze=False,
engine=None,
verbose=False,
**kwargs,
):
"""
Open a directory tree structure as a data collection.
Expand Down Expand Up @@ -118,6 +124,8 @@ def open_mfdatatree(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
Expand Down Expand Up @@ -180,11 +188,18 @@ def open_mfdatatree(
bag = bag[match.group(field)]
bag.append(fname)

return collect(tree, fields, dim, tolerance, squeeze, engine, verbose)
return collect(tree, fields, dim, tolerance, squeeze, engine, verbose, **kwargs)


def collect(
tree, fields, dim="first", tolerance=None, squeeze=False, engine=None, verbose=False
tree,
fields,
dim="first",
tolerance=None,
squeeze=False,
engine=None,
verbose=False,
**kwargs,
):
"""
Collects the data from a tree of paths using `fields` as level names.
Expand All @@ -207,6 +222,8 @@ def collect(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
Expand All @@ -219,7 +236,9 @@ def collect(
collection = DataCollection({}, name=name)
for key, value in tree.items():
if isinstance(value, list):
dc = open_mfdataarray(value, dim, tolerance, squeeze, engine, verbose)
dc = open_mfdataarray(
value, dim, tolerance, squeeze, engine, verbose, **kwargs
)
dc.name = fields[0]
collection[key] = dc
else:
Expand All @@ -238,14 +257,20 @@ def defaulttree(depth):


def open_mfdataarray(
paths, dim="first", tolerance=None, squeeze=True, engine=None, verbose=False
paths,
dim="first",
tolerance=None,
squeeze=True,
engine=None,
verbose=False,
**kwargs,
):
"""
Open a multiple file dataset.
Each file described by `path` will be opened as a data array. The data arrays are
then combined along the `dim` dimension using `combine_by_coords`. If the
cooridnates of the data arrays are not compatible, the resulting object will be
coordinates of the data arrays are not compatible, the resulting object will be
split into a sequence of data arrays.
Parameters
Expand All @@ -264,11 +289,13 @@ def open_mfdataarray(
The type of file to open or a read function. Default to xdas netcdf format.
verbose: bool
Whether to display a progress bar. Default to False.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
DataArray or DataSequence
The dataarray containing all files data. If different acquisitions are found,
The data array containing all files data. If different acquisitions are found,
a DataSequence is returned.
Raises
Expand All @@ -295,7 +322,8 @@ def open_mfdataarray(
)
with ProcessPoolExecutor() as executor:
futures = [
executor.submit(open_dataarray, path, engine=engine) for path in paths
executor.submit(open_dataarray, path, engine=engine, **kwargs)
for path in paths
]
if verbose:
iterator = tqdm(
Expand All @@ -322,6 +350,8 @@ def open_dataarray(fname, group=None, engine=None, **kwargs):
to the root of the file.
engine: str of callable, optional
The type of file to open or a read function. Default to xdas netcdf format.
**kwargs
Additional keyword arguments to be passed to the read function.
Returns
-------
Expand All @@ -341,14 +371,14 @@ def open_dataarray(fname, group=None, engine=None, **kwargs):
if not os.path.exists(fname):
raise FileNotFoundError("no file to open")
if engine is None:
return DataArray.from_netcdf(fname, group=group, **kwargs)
return DataArray.from_netcdf(fname, group=group)
elif callable(engine):
return engine(fname)
return engine(fname, **kwargs)
elif isinstance(engine, str):
from .. import io

module = getattr(io, engine)
return module.read(fname)
return module.read(fname, **kwargs)
else:
raise ValueError("engine not recognized")

Expand Down
78 changes: 72 additions & 6 deletions xdas/io/febus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import h5py
import numpy as np

Expand All @@ -6,22 +8,85 @@
from ..virtual import VirtualSource


def read(fname):
"""Open a febus file into a xdas DataArray"""
def read(fname, overlaps=None, offset=None):
"""
Open a Febus file into a xdas DataArray object.
The Febus file format contains a 3D array which is a stack of 2D (time, distance)
chunks of data that overlaps with each other. The overlaps must be trimmed and the
chunks concatenated to form a seamless dataset. Each chunk is associated with a
timestamp that is located at a fixed offset from the beginning of the chunk.
Because of poor documentation of the evolution of the Febus file format, it is
recommended to manually specify the overlap and offset parameters. If not provided,
the function will attempt to determine the correct values at your own risk.
Parameters:
-----------
fname : str
The filename of the Febus file to read.
overlaps : tuple of int, optional
A tuple specifying the overlap in number of samples to trim on both sides of each
chunk of the data. If not provided, the function will attempt to determine the
correct overlap at your own risk.
offset : int, optional
The location of the timestamp within each block given as the number of samples
from the beginning. If not provided, the function will attempt to determine the
correct offset at your own risk.
Returns:
--------
DataArray
A data array containing the data from the Febus file.
"""
with h5py.File(fname, "r") as file:
(device_name,) = list(file.keys())
source = file[device_name]["Source1"]
times = np.asarray(source["time"])
zone = source["Zone1"]
if "BlockRate" in zone.attrs:
blockrate = zone.attrs["BlockRate"][0] / 1000.0
elif "FreqRes" in zone.attrs:
blockrate = zone.attrs["FreqRes"][0] / 1000.0
else:
raise KeyError("Could not find the block size, please check file header")
(name,) = list(zone.keys())
chunks = VirtualSource(zone[name])
delta = [zone.attrs["Spacing"][1] / 1000.0, zone.attrs["Spacing"][0]]
delta = (zone.attrs["Spacing"][1] / 1000.0, zone.attrs["Spacing"][0])
name = "".join(["_" + c.lower() if c.isupper() else c for c in name]).lstrip("_")
noverlap = chunks.shape[1] // 4
chunks = chunks[:, noverlap:-noverlap, :]
times = times + noverlap * delta[0]

match overlaps:
case None:
warnings.warn(
"No overlap specified, Xdas will try its best to find the correct trimming"
)
noverlap = chunks.shape[1] - round((1 / blockrate) / delta[0])
before = noverlap // 2
after = noverlap - before
overlaps = (before, after)
case (int(), int()):
pass
case _:
raise ValueError("overlaps must be a integer or a tuple of two integers")

match offset:
case None:
warnings.warn(
"No offset specified, Xdas will try its best to place the timestamps"
)
offset = chunks.shape[1] // 2
case int():
pass
case _:
raise ValueError("offset must be an integer")

chunks = chunks[:, overlaps[0] : -overlaps[-1], :]
times = times + (overlaps[0] - offset) * delta[0]

dt, dx = delta
_, nt, nx = chunks.shape

dc = []
for t0, chunk in zip(times, chunks):
time = {
Expand All @@ -33,4 +98,5 @@ def read(fname):
distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]}
da = DataArray(chunk, {"time": time, "distance": distance}, name=name)
dc.append(da)

return concatenate(dc, "time")

0 comments on commit 88fe71c

Please sign in to comment.