support for geothermal data repo h5 format
d-chambers committed Dec 24, 2024
1 parent 884c41f commit 6cf985e
Showing 10 changed files with 453 additions and 0 deletions.
1 change: 1 addition & 0 deletions dascore/data_registry.txt
@@ -28,3 +28,4 @@ neubrex_dts_forge.h5 940f7bea6dd4c8a1340b4936b8eb7f9edc577cbcaf77c1f5ac295890f88
decimated_optodas.hdf5 48ce9c2ab4916d5536faeef0bd789f326ec4afc232729d32014d4d835a9fb74e https://github.com/dasdae/test_data/raw/master/das/decimated_optodas.hdf5
neubrex_das_1.h5 48a97e27c56e66cc2954ba4eaaadd2169919bb8f897d78e95ef6ab50abb5027b https://github.com/dasdae/test_data/raw/master/das/neubrex_das_1.h5
UoU_lf_urban.hdf5 d3f8fa6ff3d8ae993484b3fbf9b39505e2cf15cb3b39925a3519b27d5fbe7b5b https://github.com/dasdae/test_data/raw/master/das/UoU_lf_urban.hdf5
gdr_1.h5 aaf11a7333b720436d194e3c7f4fa66f38907bb0c9abfa1804c150e634642aa2 https://github.com/dasdae/test_data/raw/master/das/gdr_1.h5
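
Each registry row is a file name, its sha256 hash, and a download URL (pooch-style). As an illustration only, not part of this commit, the new test file could be fetched and verified directly with pooch using the values from the line above:

import pooch

# Fetch the gdr_1.h5 test file; pooch verifies the sha256 hash after download.
path = pooch.retrieve(
    url="https://github.com/dasdae/test_data/raw/master/das/gdr_1.h5",
    known_hash="aaf11a7333b720436d194e3c7f4fa66f38907bb0c9abfa1804c150e634642aa2",
)
print(path)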
11 changes: 11 additions & 0 deletions dascore/io/gdr/__init__.py
@@ -0,0 +1,11 @@
"""
Support for the Geothermal Data Repository (gdr) h5 format.
The gdr format is a combination of prodml and the Earthscope DMC's meta
data spec. It houses many data sets, not just DFOS.
Find more information here: https://gdr.openei.org/. Information regarding
the DAS format can be found here: https://gdr.openei.org/das_data_standard
"""

from .core import GDR_V1
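
With the entry point added in pyproject.toml (see below), reading a GDR file goes through the usual dascore interface. A minimal sketch, assuming a local copy of the gdr_1.h5 test file:

import dascore as dc

# The format is detected automatically from the file contents.
spool = dc.spool("gdr_1.h5")
patch = spool[0]
print(patch)
print(patch.attrs.gauge_length)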
70 changes: 70 additions & 0 deletions dascore/io/gdr/core.py
@@ -0,0 +1,70 @@
"""
Core modules for reading GDR data.
GDR files do not specify the GDR version directly. Instead, they use versions
from other standards for the metadata and raw data. These can be found in the
overview attributes MetadataStandard and RawDataStandard.
"""

from __future__ import annotations

import dascore as dc
from dascore.constants import SpoolType
from dascore.io import FiberIO
from dascore.io.gdr.utils_das import (
    _get_attrs_coords_and_data,
    _get_version,
    _maybe_trim_data,
)
from dascore.utils.hdf5 import H5Reader


class GDRPatchAttrs(dc.PatchAttrs):
"""Patch attrs for GDR files."""

gauge_length: float
gauge_length_units: str
project_number: str = ""


class GDR_V1(FiberIO): # noqa
"""
Support for GDR version 1.
"""

name = "GDR_DAS"
preferred_extensions = ("hdf5", "h5")
version = "1"

def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
"""Determine if the resource belongs to this format."""
return _get_version(resource)

    def read(self, resource: H5Reader, snap=True, **kwargs) -> SpoolType:
        """
        Read a resource belonging to this format.

        Parameters
        ----------
        resource
            The open h5 object.
        snap
            If True, snap each coordinate to be evenly sampled.
        **kwargs
            Used to filter (trim) coordinates, e.g. time or distance ranges.
        """
        attr_dict, cm, data = _get_attrs_coords_and_data(resource, snap=snap)
        if kwargs:
            cm, data = _maybe_trim_data(cm, data, **kwargs)
        attrs = GDRPatchAttrs(**attr_dict)
        patch = dc.Patch(coords=cm, data=data[:], attrs=attrs)
        return dc.spool([patch])

    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchAttrs]:
        """Get the attributes of a resource belonging to this format."""
        attrs, cm, data = _get_attrs_coords_and_data(resource, snap)
        attrs["coords"] = cm.to_summary_dict()
        attrs["path"] = resource.filename
        attrs["file_format"] = self.name
        attrs["file_version"] = self.version
        return [dc.PatchAttrs(**attrs)]
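
As with other FiberIO plugins, scan returns lightweight attribute summaries for indexing while read builds full patches. A rough sketch of driving the plugin directly with an open h5py file (the local file name is an assumption here):

import h5py

from dascore.io.gdr.core import GDR_V1

fiber_io = GDR_V1()
with h5py.File("gdr_1.h5", "r") as h5:
    # get_format returns ("GDR_DAS", "1") for a matching file, else False.
    print(fiber_io.get_format(h5))
    attrs_list = fiber_io.scan(h5)
    spool = fiber_io.read(h5)
print(attrs_list[0].file_format, len(spool))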
119 changes: 119 additions & 0 deletions dascore/io/gdr/utils_das.py
@@ -0,0 +1,119 @@
"""
Utility functions for the GDR DAS format.

See: https://gdr.openei.org/das_data_standard for more info.
"""

import numpy as np

import dascore as dc
from dascore.core import get_coord
from dascore.utils.hdf5 import extract_h5_attrs, h5_matches_structure
from dascore.utils.misc import unbyte

# This defines the metadata/rawdata version combinations that define GDR versions.
_COMPOSITE_VERSIONS = {
("DAS-RCN v1.10", "PRODML v2.2"): "1",
}

_BASE_STRUCTURE = (
"DasMetadata/Interrogator/Acquisition",
"DasRawData/DasTimeArray",
"DasRawData/RawData",
"DasMetadata.MetadataStandard",
"DasMetadata.RawDataStandard",
)

# Attribute map for version 1. {current_name: new_name}
ACQ = "DasMetadata/Interrogator/Acquisition"
_V1_ATTR_MAP = {
f"{ACQ}.GaugeLength": "gauge_length",
f"{ACQ}.GaugeLengthUnit": "gauge_length_units",
f"{ACQ}.UnitOfMeasure": "data_units",
"DasMetadata/Interrogator.SerialNumber": "instrument_id",
}


def _get_version(h5fi):
"""Get the version code of the GDR file."""
if not h5_matches_structure(h5fi, _BASE_STRUCTURE):
return False
meta = h5fi["DasMetadata"].attrs
data_fmt = meta["RawDataStandard"]
meta_fmt = meta["MetadataStandard"]
return "GDR_DAS", _COMPOSITE_VERSIONS[(meta_fmt, data_fmt)]


def _get_attrs_coords_and_data(resource, snap):
"""
Get attributes, coordinates, and data from the file.
"""
fill = {"NaN": "", "nan": ""}
attrs = extract_h5_attrs(resource, _V1_ATTR_MAP, fill_values=fill)
coords = _get_coord_manager(resource, snap)
data = resource["DasRawData/RawData"]
return attrs, coords, data


def _get_coord_manager(resource, snap=True):
"""Get a coordinate manager from the file."""

def get_time_coord(resource, snap):
"""Get the time coordinate."""
# TODO: I am not sure if time will always be in ns, check on it.
time = resource["DasRawData/DasTimeArray"]
if not snap:
return get_coord(data=np.array(time).astype("datetime64[ns]"))
t1 = np.int64(time[0]).astype("datetime64[ns]")
t2 = np.int64(time[-1]).astype("datetime64[ns]")
step = (t2 - t1) / (len(time) - 1)
return get_coord(start=t1, stop=t2, step=step).change_length(len(time))

def get_dist_coord(resource, length):
"""Get distance coordinates."""
# Note: There is not enough info to correctly infer the start of
# distance coordinate since Channels are often not included. In this
# case we just assume the distance starts at 0 since the location of
# each channel must be attached alter anyway. This at least includes
# correct dx information.
group = resource["DasMetadata/Interrogator/Acquisition"]
dx = float(unbyte(group.attrs["SpatialSamplingInterval"]))
units = unbyte(group.attrs["SpatialSamplingIntervalUnit"])
start = 0
stop = length * dx
coord = get_coord(start=start, stop=stop, step=dx, units=units)
return coord.change_length(length)

def get_dims(dataset):
"""Get the dimension names."""
das_dims = dataset.attrs["DasDimensions"]
out = [""] * 2
for num, dim in enumerate(das_dims):
if dim.startswith("time"):
out[num] = "time"
elif dim == "locus":
out[num] = "distance"
assert all(out)
return tuple(out)

time_coord = get_time_coord(resource, snap)
dataset = resource["DasRawData/RawData"]
dims = get_dims(dataset)
# Get distance coord.
dist_axis = dims.index("distance")
dist_length = dataset.shape[dist_axis]
dist_coord = get_dist_coord(resource, dist_length)

coords = {
"time": time_coord,
"distance": dist_coord,
}

return dc.get_coord_manager(coords=coords, dims=dims)


def _maybe_trim_data(cm, data, time=None, distance=None, **kwargs):
"""Maybe trim the data."""
if time is not None or distance is not None:
cm, data = cm.select(time=time, distance=distance, array=data)
return cm, data
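
_BASE_STRUCTURE and _V1_ATTR_MAP above encode the minimum layout the reader expects. A hedged sketch of a file that satisfies that structure, written with h5py (all values are placeholders for illustration, not a guaranteed-valid GDR file):

import h5py
import numpy as np

with h5py.File("fake_gdr.h5", "w") as h5:
    # Metadata groups/attrs checked by _BASE_STRUCTURE and _V1_ATTR_MAP.
    acq = h5.create_group("DasMetadata/Interrogator/Acquisition")
    acq.attrs["GaugeLength"] = "10.0"
    acq.attrs["GaugeLengthUnit"] = "m"
    acq.attrs["UnitOfMeasure"] = "strain"
    acq.attrs["SpatialSamplingInterval"] = "1.0"
    acq.attrs["SpatialSamplingIntervalUnit"] = "m"
    h5["DasMetadata/Interrogator"].attrs["SerialNumber"] = "123"
    h5["DasMetadata"].attrs["MetadataStandard"] = "DAS-RCN v1.10"
    h5["DasMetadata"].attrs["RawDataStandard"] = "PRODML v2.2"
    # Raw data: 100 time samples (ns since epoch) by 50 channels (locus).
    raw = h5.create_group("DasRawData")
    raw.create_dataset("DasTimeArray", data=np.arange(100) * 10**6 + 1_700_000_000 * 10**9)
    data = raw.create_dataset("RawData", data=np.zeros((100, 50)))
    data.attrs["DasDimensions"] = ["time", "locus"]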
66 changes: 66 additions & 0 deletions dascore/utils/hdf5.py
@@ -9,6 +9,7 @@

import time
import warnings
from collections.abc import Sequence
from contextlib import contextmanager, suppress
from functools import partial
from pathlib import Path
@@ -30,8 +30,10 @@
from dascore.utils.mapping import FrozenDict
from dascore.utils.misc import (
    _maybe_make_parent_directory,
    _maybe_unpack,
    cached_method,
    suppress_warnings,
    unbyte,
)
from dascore.utils.pd import (
    _remove_base_path,
@@ -469,3 +472,66 @@ def unpack_scalar_h5_dataset(dataset):
    if isinstance(value, np.ndarray):
        value = value[0]
    return value


def h5_matches_structure(h5file: H5pyFile, structure: Sequence[str]):
    """
    Check if an H5 file matches a spec given by a structure.

    Parameters
    ----------
    h5file
        An open h5 file as returned by h5py.File.
    structure
        A sequence of strings which indicates required groups/datasets/attrs.
        For example, ("data", "data/raw", "data/raw.sampling") would require
        the 'data' group to exist, the 'data/raw' group/dataset to exist, and
        that 'raw' has an attribute called 'sampling'.
    """
    for address in structure:
        split = address.split(".")
        assert len(split) in {1, 2}, "address can have at most one '.'"
        if len(split) == 2:
            base, attr = split
        else:
            base, attr = split[0], None
        try:
            obj = h5file[base]
        except KeyError:
            return False
        if attr is not None and attr not in set(obj.attrs):
            return False
    return True


def extract_h5_attrs(
    h5file: H5pyFile,
    name_map: dict[str, str],
    fill_values=None,
):
    """
    Extract attributes from an h5 file based on a name map.

    Parameters
    ----------
    h5file
        An open h5 file as returned by h5py.File.
    name_map
        A mapping of {old_name: new_name}. The old name must include one
        dot which separates the path from the attribute name,
        e.g. {"DasData.SamplingRate": "sampling_rate"}.
    fill_values
        An optional mapping of attribute values to replace, e.g. {"NaN": ""}.

    Raises
    ------
    KeyError if any datasets/attributes are missing.
    """
    fill_values = fill_values or {}
    out = {}
    for address, out_name in name_map.items():
        split = address.split(".")
        assert len(split) == 2, "Struct must have exactly one '.'"
        base, attr = split
        obj = h5file[base]
        value = _maybe_unpack(unbyte(obj.attrs[attr]))
        out[out_name] = fill_values.get(value, value)
    return out
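
These two helpers are general-purpose and live in dascore.utils.hdf5 so other IO plugins can reuse them. A small usage sketch against the hypothetical fake_gdr.h5 file from the utils example above:

import h5py

from dascore.utils.hdf5 import extract_h5_attrs, h5_matches_structure

structure = ("DasMetadata", "DasMetadata.MetadataStandard")
name_map = {"DasMetadata.MetadataStandard": "metadata_standard"}

with h5py.File("fake_gdr.h5", "r") as h5:
    if h5_matches_structure(h5, structure):
        # e.g. {"metadata_standard": "DAS-RCN v1.10"}
        print(extract_h5_attrs(h5, name_map))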
1 change: 1 addition & 0 deletions pyproject.toml
@@ -123,6 +123,7 @@ SEGY__V2 = "dascore.io.segy.core:SegyV2"
RSF__V1 = "dascore.io.rsf.core:RSFV1"
WAV = "dascore.io.wav.core:WavIO"
XMLBINARY__V1 = "dascore.io.xml_binary.core:XMLBinaryV1"
GDR_DAS__V1 = "dascore.io.gdr.core:GDR_V1"


# --- External tool configuration
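
The new entry point ties the format name and version (GDR_DAS, v1) to the GDR_V1 class so dascore can discover the reader lazily. A rough way to confirm the registration after installation, using only the standard library (the entry-point group name "dascore.fiber_io" is an assumption here, and selectable entry points need Python 3.10+):

from importlib.metadata import entry_points

for ep in entry_points(group="dascore.fiber_io"):
    if ep.name.startswith("GDR_DAS"):
        print(ep.name, "->", ep.value)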