diff --git a/dascore/core/attrs.py b/dascore/core/attrs.py
index bf5337c1..7511f0aa 100644
--- a/dascore/core/attrs.py
+++ b/dascore/core/attrs.py
@@ -85,6 +85,13 @@ class PatchAttrs(DascoreBaseModel):
     network: str = Field(
         default="", max_length=max_lens["network"], description="A network code."
     )
+    path: str = Field(default="", description="The path to the patch data source.")
+    format_name: str = Field(
+        default="", description="The original format of the patch data."
+    )
+    format_version: str = Field(
+        default="", description="The version of the patch data's original format."
+    )
     history: StrTupleStrSerialized = Field(
         default_factory=tuple,
         description="A list of processing performed on the patch.",
diff --git a/dascore/core/coordmanager.py b/dascore/core/coordmanager.py
index eb1613b8..af163d0e 100644
--- a/dascore/core/coordmanager.py
+++ b/dascore/core/coordmanager.py
@@ -995,9 +995,8 @@ def to_summary(self) -> Self:
             new_map[name] = coord.to_summary(dims=dims, name=name)
         return CoordManagerSummary(
             dim_map=self.dim_map,
-            coords=new_map,
+            coord_map=FrozenDict(new_map),
             dims=self.dims,
-            summary=True,
         )

     def get_coord(self, coord_name: str) -> BaseCoord:
diff --git a/dascore/core/patch.py b/dascore/core/patch.py
index 8fafdc17..721be2c2 100644
--- a/dascore/core/patch.py
+++ b/dascore/core/patch.py
@@ -24,7 +24,11 @@
 )
 from dascore.core.coords import BaseCoord
 from dascore.utils.display import array_to_text, attrs_to_text, get_dascore_text
-from dascore.utils.models import ArrayLike, ArraySummary, DascoreBaseModel
+from dascore.utils.models import (
+    ArrayLike,
+    ArraySummary,
+    DascoreBaseModel,
+)
 from dascore.utils.patch import check_patch_attrs, check_patch_coords, get_patch_names
 from dascore.utils.time import to_float
 from dascore.viz import VizPatchNameSpace
@@ -380,21 +384,14 @@ def io(self) -> dc.io.PatchIO:

     def to_summary(
         self,
-        path=None,
-        resource_format=None,
-        resource_version=None,
     ) -> PatchSummary:
         """
         Summarize the contents of the Patch.
         """
-        path = path if path is not None else self.get_patch_name()
         psum = PatchSummary(
-            uri=path,
             coords=self.coords.to_summary(),
             attrs=self.attrs,
             data=ArraySummary.from_array(self.data),
-            resource_format=resource_format,
-            resource_version=resource_version,
         )
         return psum

@@ -404,12 +401,7 @@ class PatchSummary(DascoreBaseModel):
     A class for summarizing the metadata of the Patch.
     """

-    path: str
-    format: str = ""
-    version: str = ""
-
     data: Annotated[ArraySummary, PlainValidator(ArraySummary.from_array)]
-
     attrs: PatchAttrs
     coords: CoordManagerSummary
diff --git a/dascore/io/ap_sensing/core.py b/dascore/io/ap_sensing/core.py
index 2c132b57..391b2d0e 100644
--- a/dascore/io/ap_sensing/core.py
+++ b/dascore/io/ap_sensing/core.py
@@ -43,12 +43,9 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:

     def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan an AP sensing file, return summary info about the contents."""
-        attrs = _get_attrs_dict(resource)
+        attrs = _get_attrs_dict(resource, self.name)
         coords = _get_coords(resource)
         info = {
-            "path": resource.filename,
-            "format": self.name,
-            "version": _get_version_string(resource),
             "attrs": attrs,
             "coords": coords,
             "data": resource["DAS"],
@@ -64,6 +61,10 @@ def read(
     ) -> dc.BaseSpool:
         """Read a single file with APSensing data inside."""
         patches = _get_patch(
-            resource, time=time, distance=distance, attr_cls=APSensingPatchAttrs
+            resource,
+            self.name,
+            time=time,
+            distance=distance,
+            attr_cls=APSensingPatchAttrs,
         )
         return dc.spool(patches)
diff --git a/dascore/io/ap_sensing/utils.py b/dascore/io/ap_sensing/utils.py
index 7cdbf40c..07d6ecd3 100644
--- a/dascore/io/ap_sensing/utils.py
+++ b/dascore/io/ap_sensing/utils.py
@@ -4,7 +4,7 @@

 import dascore as dc
 from dascore.core import get_coord, get_coord_manager
-from dascore.utils.misc import _maybe_unpack, unbyte
+from dascore.utils.misc import _maybe_unpack, get_path, unbyte


 def _get_version_string(resource):
@@ -73,8 +73,9 @@ def _get_coords(resource):
     return cm


-def _get_attrs_dict(resource):
+def _get_attrs_dict(resource, format_name):
     """Get attributes."""
+    version = _get_version_string(resource)
     daq = resource["DAQ"]
     pserver = resource["ProcessingServer"]
     out = dict(
@@ -82,19 +83,23 @@
         instrumet_id=unbyte(_maybe_unpack(daq["SerialNumber"])),
         gauge_length=_maybe_unpack(pserver["GaugeLength"]),
         radians_to_nano_strain=_maybe_unpack(pserver["RadiansToNanoStrain"]),
+        path=get_path(resource),
+        format_name=format_name,
+        format_version=version,
     )
     return out


 def _get_patch(
     resource,
+    format_name,
     time=None,
     distance=None,
     attr_cls=dc.PatchAttrs,
     **kwargs,
 ):
     """Get a patch from ap_sensing file."""
-    attrs = _get_attrs_dict(resource)
+    attrs = _get_attrs_dict(resource, format_name)
     coords = _get_coords(resource)
     data = resource["DAS"]
     if time is not None or distance is not None:
diff --git a/dascore/io/asn/__init__.py b/dascore/io/asn/__init__.py
new file mode 100644
index 00000000..fe6a889f
--- /dev/null
+++ b/dascore/io/asn/__init__.py
@@ -0,0 +1,9 @@
+"""
+Support for Alcatel Submarine Networks (ASN) formats.
+
+ASN makes the OptoDAS interrogator.
+
+More info can be found here: https://web.asn.com/
+"""
+from __future__ import annotations
+from .core import OptoDASV8
diff --git a/dascore/io/optodas/core.py b/dascore/io/asn/core.py
similarity index 74%
rename from dascore/io/optodas/core.py
rename to dascore/io/asn/core.py
index faa021e5..1d3a1e1d 100644
--- a/dascore/io/optodas/core.py
+++ b/dascore/io/asn/core.py
@@ -10,7 +10,7 @@
 from dascore.utils.hdf5 import H5Reader
 from dascore.utils.models import UnitQuantity, UTF8Str

-from .utils import _get_opto_das_attrs, _get_opto_das_version_str, _read_opto_das
+from .utils import _get_opto_das_coords_attrs, _get_opto_das_version_str, _read_opto_das


 class OptoDASPatchAttrs(dc.PatchAttrs):
@@ -41,17 +41,16 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         if version_str:
             return self.name, version_str

-    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a OptoDAS file, return summary information about the file's contents."""
-        file_version = _get_opto_das_version_str(resource)
-        extras = {
-            "path": resource.filename,
-            "file_format": self.name,
-            "file_version": str(file_version),
-        }
-        attrs = _get_opto_das_attrs(resource)
-        attrs.update(extras)
-        return [OptoDASPatchAttrs(**attrs)]
+        coords, attrs = _get_opto_das_coords_attrs(resource, self.name)
+        data_node = resource["data"]
+        summary = dc.PatchSummary(
+            data=data_node,
+            coords=coords,
+            attrs=OptoDASPatchAttrs(**attrs),
+        )
+        return [summary]

     def read(
         self,
@@ -62,6 +61,10 @@
     ) -> dc.BaseSpool:
         """Read a OptoDAS spool of patches."""
         patches = _read_opto_das(
-            resource, time=time, distance=distance, attr_cls=OptoDASPatchAttrs
+            resource,
+            time=time,
+            distance=distance,
+            attr_cls=OptoDASPatchAttrs,
+            format_name=self.name,
         )
         return dc.spool(patches)
diff --git a/dascore/io/optodas/utils.py b/dascore/io/asn/utils.py
similarity index 79%
rename from dascore/io/optodas/utils.py
rename to dascore/io/asn/utils.py
index 770ac661..5c54168f 100644
--- a/dascore/io/optodas/utils.py
+++ b/dascore/io/asn/utils.py
@@ -6,7 +6,7 @@
 import dascore.core
 from dascore.core.coords import get_coord
 from dascore.utils.hdf5 import unpack_scalar_h5_dataset
-from dascore.utils.misc import unbyte
+from dascore.utils.misc import get_path, unbyte

 # --- Getting format/version

@@ -53,7 +53,7 @@ def _get_coord_manager(fi):
     return out


-def _get_attr_dict(header):
+def _get_attr_dict(header, path, format_name, format_version):
     """Map header info to DAS attrs."""
     attr_map = {
         "gaugeLength": "gauge_length",
@@ -61,7 +61,12 @@
         "instrument": "instrument_id",
         "experiment": "acquisition_id",
     }
-    out = {"data_category": "DAS"}
+    out = {
+        "data_category": "DAS",
+        "path": path,
+        "format_name": format_name,
+        "format_version": format_version,
+    }
     for head_name, attr_name in attr_map.items():
         value = header[head_name]
         if hasattr(value, "shape"):
@@ -70,20 +75,20 @@
     return out


-def _get_opto_das_attrs(fi) -> dict:
+def _get_opto_das_coords_attrs(fi, format_name) -> tuple[dc.CoordManager, dict]:
     """Scan a OptoDAS file, return metadata."""
     cm = _get_coord_manager(fi)
-    attrs = _get_attr_dict(fi["header"])
-    attrs["coords"] = cm
-    return attrs
+    path = get_path(fi)
+    version = _get_opto_das_version_str(fi)
+    attrs = _get_attr_dict(fi["header"], path, format_name, version)
+    return cm, attrs


-def _read_opto_das(fi, distance=None, time=None, attr_cls=dc.PatchAttrs):
+def _read_opto_das(
+    fi, distance=None, time=None, attr_cls=dc.PatchAttrs, format_name=""
+):
     """Read the OptoDAS values into a patch."""
-    attrs = _get_opto_das_attrs(fi)
+    coords, attrs = _get_opto_das_coords_attrs(fi, format_name)
     data_node = fi["data"]
-    coords = attrs.pop("coords")
     cm, data = coords.select(array=data_node, distance=distance, time=time)
-    attrs["coords"] = cm.to_summary_dict()
-    attrs["dims"] = cm.dims
     return [dc.Patch(data=data, coords=cm, attrs=attr_cls(**attrs))]
diff --git a/dascore/io/dasdae/core.py b/dascore/io/dasdae/core.py
index 9d9fd88b..c638e8ca 100644
--- a/dascore/io/dasdae/core.py
+++ b/dascore/io/dasdae/core.py
@@ -2,24 +2,18 @@

 from __future__ import annotations

-import contextlib
-
-import pandas as pd
-
 import dascore as dc
 from dascore.constants import SpoolType
 from dascore.io import FiberIO
 from dascore.utils.hdf5 import (
     H5Reader,
     H5Writer,
-    HDFPatchIndexManager,
-    NodeError,
 )
-from dascore.utils.misc import unbyte
+from dascore.utils.misc import get_path, unbyte
 from dascore.utils.patch import get_patch_names

 from .utils import (
-    _get_contents_from_patch_groups,
+    _get_summary_from_patch_groups,
     _read_patch,
     _save_patch,
     _write_meta,
@@ -67,37 +61,19 @@ def write(self, spool: SpoolType, resource: H5Writer, index=False, **kwargs):
             This is recommended for files with many patches and not
             recommended for files with few patches.
         """
-        breakpoint()
         # write out patches
         _write_meta(resource, self.version)
         # get an iterable of patches and save them
         patches = [spool] if isinstance(spool, dc.Patch) else spool
-        # create new node called waveforms, else suppress error if it
-        # already exists.
-        with contextlib.suppress(NodeError):
-            resource.create_group(resource.root, "waveforms")
-        waveforms = resource.get_node("/waveforms")
+        # Create the waveforms group if it doesn't already exist.
+        if "waveforms" not in resource:
+            resource.create_group(resource, "waveforms")
+        waveforms = resource["/waveforms"]
         # write new patches to file
         patch_names = get_patch_names(patches).values
         for patch, name in zip(patches, patch_names):
             _save_patch(patch, waveforms, resource, name)
-        indexer = HDFPatchIndexManager(resource)
-        if index or indexer.has_index:
-            df = self._get_patch_summary(patches)
-            indexer.write_update(df)
-
-    def _get_patch_summary(self, patches) -> pd.DataFrame:
-        """Get a patch summary to put into index."""
-        df = (
-            dc.scan_to_df(patches)
-            .assign(
-                path=lambda x: get_patch_names(x),
-                file_format=self.name,
-                file_version=self.version,
-            )
-            .dropna(subset=["time_min", "time_max", "distance_min", "distance_max"])
-        )
-        return df

     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """Return the format from a dasdae file."""
@@ -112,12 +88,22 @@ def read(self, resource: H5Reader, **kwargs) -> SpoolType:
         """Read a DASDAE file."""
         patches = []
+        path = get_path(resource)
+        format_version = unbyte(resource.attrs["__DASDAE_version__"])
+        format_name = self.name
         try:
-            waveform_group = resource.root["/waveforms"]
+            waveform_group = resource["/waveforms"]
         except (KeyError, IndexError):
             return dc.spool([])
-        for patch_group in waveform_group:
-            patches.append(_read_patch(patch_group, **kwargs))
+        for patch_group in waveform_group.values():
+            pa = _read_patch(
+                patch_group,
+                path=path,
+                format_name=format_name,
+                format_version=format_version,
+                **kwargs,
+            )
+            patches.append(pa)
         return dc.spool(patches)

     def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
@@ -134,5 +120,4 @@ def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
             A path to the file.
         """
         file_format = self.name
-        version = resource.attrs["__DASDAE_version__"]
-        return _get_contents_from_patch_groups(resource, version, file_format)
+        return _get_summary_from_patch_groups(resource, file_format)
diff --git a/dascore/io/dasdae/utils.py b/dascore/io/dasdae/utils.py
index 9aa91f0a..7e1bf914 100644
--- a/dascore/io/dasdae/utils.py
+++ b/dascore/io/dasdae/utils.py
@@ -6,11 +6,10 @@
 from tables import NodeError

 import dascore as dc
-from dascore.core.attrs import PatchAttrs
 from dascore.core.coordmanager import get_coord_manager
 from dascore.core.coords import get_coord
 from dascore.utils.hdf5 import Empty
-from dascore.utils.misc import suppress_warnings, unbyte
+from dascore.utils.misc import get_path, suppress_warnings, unbyte
 from dascore.utils.time import to_int

 # --- Functions for writing DASDAE format
@@ -114,19 +113,21 @@ def _save_patch(patch, wave_group, h5, name):
 # --- Functions for reading


-def _get_attrs(patch_group):
+def _get_attrs(patch_group, path, format_name, format_version):
     """Get the saved attributes form the group attrs."""
     out = {}
-    attrs = [x for x in patch_group.attrs if x.startswith("_attrs_")]
-    for attr_name in attrs:
-        key = attr_name.replace("_attrs_", "")
-        val = patch_group._v_attrs[attr_name]
-        # need to unpack one value arrays
-        if isinstance(val, np.ndarray) and not val.shape:
-            val = np.asarray([val])[0]
-        out[key] = val
-    with suppress_warnings(DeprecationWarning):
-        return PatchAttrs(**out)
+    tables_attrs = _santize_pytables(dict(patch_group.attrs))
+    for key, value in tables_attrs.items():
+        new_key = key.replace("_attrs_", "")
+        # need to unpack 0 dim arrays.
+        if isinstance(value, np.ndarray) and not value.shape:
+            value = np.atleast_1d(value)[0]
+        out[new_key] = value
+    out["path"] = path
+    out["format_name"] = format_name
+    out["format_version"] = format_version
+    return out


 def _read_array(table_array):
@@ -174,34 +175,33 @@ def _get_dims(patch_group):
     return out


-def _read_patch(patch_group, load_data=True, **kwargs):
+def _read_patch(patch_group, path, format_name, format_version, **kwargs):
     """Read a patch group, return Patch."""
-    attrs = _get_attrs(patch_group)
+    attrs = _get_attrs(patch_group, path, format_name, format_version)
     dims = _get_dims(patch_group)
     coords = _get_coords(patch_group, dims, attrs)
-    # Note, previously this was wrapped with try, except (Index, KeyError)
-    # and the data = np.array(None) in except block. Not sure, why, removed
-    # try except.
     if kwargs:
         coords, data = coords.select(array=patch_group["data"], **kwargs)
     else:
         data = patch_group["data"]
-    if load_data:
-        data = data[:]
-    return dc.Patch(data=data, coords=coords, dims=dims, attrs=attrs)
+    return dc.Patch(data=data[:], coords=coords, dims=dims, attrs=attrs)


-def _get_contents_from_patch_groups(h5, file_version, file_format="DASDAE"):
+def _get_summary_from_patch_groups(h5, format_name="DASDAE"):
     """Get the contents from each patch group."""
+    path = get_path(h5)
+    format_version = unbyte(h5.attrs["__DASDAE_version__"])
     out = []
     for name, group in h5[("/waveforms")].items():
-        contents = _get_patch_content_from_group(group)
-        # populate file info
-        contents["version"] = file_version
-        contents["format"] = file_format
-        contents["path"] = h5.filename
+        contents = _get_patch_content_from_group(
+            group,
+            path=path,
+            format_name=format_name,
+            format_version=format_version,
+        )
         # suppressing warnings because old dasdae files will issue warning
-        # due to d_dim rather than dim_step. TODO fix test files in the future
+        # due to d_dim rather than dim_step.
+        # TODO fix in parser.
         with suppress_warnings(DeprecationWarning):
             out.append(dc.PatchSummary(**contents))
@@ -240,20 +240,17 @@ def _get_coord_info(info, group):
     return coords


-def _get_patch_content_from_group(group):
+def _get_patch_content_from_group(group, path, format_name, format_version):
     """Get patch content from a single node."""
-    attrs = {}
     # The attributes in the table.
-    tables_attrs = _santize_pytables(dict(group.attrs))
-    for key, value in tables_attrs.items():
-        new_key = key.replace("_attrs_", "")
-        # need to unpack 0 dim arrays.
-        if isinstance(value, np.ndarray) and not value.shape:
-            value = np.atleast_1d(value)[0]
-        attrs[new_key] = value
-    # Add coord info.
+    attrs = _get_attrs(group, path, format_name, format_version)
+    # Get coord info
     coords = _get_coord_info(attrs, group)
+    # Overwrite (or add) file-specific info.
+    attrs["path"] = path
+    attrs["format_name"] = format_name
+    attrs["format_version"] = format_version
     # Add data info.
     data = group["data"]
-    dims = attrs.pop("_dims")
+    dims = attrs.pop("_dims", None)
     return dict(data=data, attrs=attrs, dims=dims, coords=coords)
diff --git a/dascore/io/dashdf5/core.py b/dascore/io/dashdf5/core.py
index 91b11bce..f2e361d7 100644
--- a/dascore/io/dashdf5/core.py
+++ b/dascore/io/dashdf5/core.py
@@ -6,6 +6,8 @@
 from dascore.constants import opt_timeable_types
 from dascore.io import FiberIO
 from dascore.utils.hdf5 import H5Reader
+from dascore.utils.misc import get_path
+
 from .utils import _get_cf_attrs, _get_cf_coords, _get_cf_version_str


@@ -16,6 +18,14 @@ class DASHDF5(FiberIO):
     preferred_extensions = ("hdf5", "h5")
     version = "1.0"

+    def _get_attr(self, resource: H5Reader):
+        """Get the attrs dict with path and such populated."""
+        attrs = _get_cf_attrs(resource)
+        attrs["path"] = get_path(resource)
+        attrs["format_name"] = self.name
+        attrs["format_version"] = self.version
+        return dc.PatchAttrs.model_validate(attrs)
+
     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """
         Return True if file contains terra15 version 2 data else False.
@@ -32,13 +42,11 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:

     def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Get metadata from file."""
         coords = _get_cf_coords(resource)
+        attrs = self._get_attr(resource)
         info = {
-            "path": resource.filename,
-            "format": self.name,
-            "version": str(self.version),
             "coords": coords,
-            "dims": coords.dims,
-            "attrs": _get_cf_attrs(resource, coords),
+            "attrs": attrs,
+            "data": resource["das"],
         }
         return [dc.PatchSummary(**info)]

@@ -56,8 +64,8 @@
             time=time,
             channel=channel,
         )
-        attrs = _get_cf_attrs(resource, coords_new)
+        attrs = self._get_attr(resource)
         patch = dc.Patch(
             data=data, attrs=attrs, coords=coords_new, dims=coords_new.dims
         )
-        return dc.spool(patch)
+        return dc.spool([patch])
diff --git a/dascore/io/dashdf5/utils.py b/dascore/io/dashdf5/utils.py
index b9203284..b6e02860 100644
--- a/dascore/io/dashdf5/utils.py
+++ b/dascore/io/dashdf5/utils.py
@@ -75,14 +75,13 @@ def _get_spatial_coord(hdf_fi, code):
     return cm


-def _get_cf_attrs(hdf_fi, coords=None, extras=None):
+def _get_cf_attrs(hdf_fi):
     """Get attributes for CF file."""
-    out = {"coords": coords or _get_cf_coords(hdf_fi)}
-    out.update(extras or {})
+    out = {}
     for n1, n2 in _ROOT_ATTR_MAPPING.items():
         out[n1] = hdf_fi.attrs.get(n2)
     for n1, n2 in _DAS_ATTR_MAPPING.items():
         out[n1] = getattr(hdf_fi.get("das", {}), "attrs", {}).get(n2)
     for n1, n2 in _CRS_MAPPING.items():
         out[n1] = getattr(hdf_fi.get("crs", {}), "attrs", {}).get(n2)
-    return dc.PatchAttrs(**out)
+    return out
diff --git a/dascore/io/febus/core.py b/dascore/io/febus/core.py
index c42f26fb..8975c15f 100644
--- a/dascore/io/febus/core.py
+++ b/dascore/io/febus/core.py
@@ -7,9 +7,10 @@
 import numpy as np

 import dascore as dc
-from dascore.constants import opt_timeable_types, attr_conflict_description
+from dascore.constants import opt_timeable_types
 from dascore.io import FiberIO
 from dascore.utils.hdf5 import H5Reader
+from dascore.utils.misc import get_path
 from dascore.utils.models import UTF8Str

 from .utils import (
@@ -68,7 +69,13 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:

     def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a febus file, return summary information about the file's contents."""
-        return _scan_febus(resource, resource.path, attr_cls=FebusPatchAttrs)
+        return _scan_febus(
+            resource,
+            path=get_path(resource),
+            format_name=self.name,
+            format_version=self.version,
+            attr_cls=FebusPatchAttrs,
+        )

     def read(
         self,
@@ -79,7 +86,13 @@
     ) -> dc.BaseSpool:
         """Read a febus spool of patches."""
         patches = _read_febus(
-            resource, time=time, distance=distance, attr_cls=FebusPatchAttrs
+            resource,
+            path=get_path(resource),
+            format_name=self.name,
+            format_version=self.version,
+            time=time,
+            distance=distance,
+            attr_cls=FebusPatchAttrs,
         )
         return dc.spool(patches)
diff --git a/dascore/io/febus/utils.py b/dascore/io/febus/utils.py
index d698df73..699dd5e8 100644
--- a/dascore/io/febus/utils.py
+++ b/dascore/io/febus/utils.py
@@ -9,13 +9,13 @@
 import dascore as dc
 from dascore.core import get_coord, get_coord_manager
 from dascore.core.coordmanager import CoordManager
-from dascore.utils.models import ArraySummary
 from dascore.utils.misc import (
     _maybe_unpack,
     broadcast_for_index,
     maybe_get_items,
     unbyte,
 )
+from dascore.utils.models import ArraySummary

 # --- Getting format/version

@@ -94,7 +94,6 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     out["source"] = feb.source_name
     out["zone"] = feb.zone_name
     out["schema_version"] = out.get("folog_a1_software_version", "").split(".")[0]
-    out["dims"] = ("time", "distance")
     return out
@@ -185,11 +184,16 @@
     return cm


-def _yield_attrs_coords(fi) -> tuple[dict, CoordManager]:
+def _yield_attrs_coords(
+    fi, path, format_name, format_version
+) -> tuple[dict, CoordManager]:
     """Scan a febus file, return metadata."""
     febuses = _flatten_febus_info(fi)
     for febus in febuses:
         attr = _get_febus_attrs(febus)
+        attr["path"] = path
+        attr["format_name"] = format_name
+        attr["format_version"] = format_version
         cm = _get_febus_coord_manager(febus)
         yield attr, cm, febus
@@ -278,32 +283,35 @@
     return data, cm


-def _read_febus(h5, distance=None, time=None, attr_cls=dc.PatchAttrs):
+def _read_febus(
+    h5,
+    path,
+    format_name,
+    format_version,
+    distance=None,
+    time=None,
+    attr_cls=dc.PatchAttrs,
+):
     """Read the febus values into a patch."""
     out = []
-    for attr, cm, febus in _yield_attrs_coords(h5):
+    febiter = _yield_attrs_coords(h5, path, format_name, format_version)
+    for attr, cm, febus in febiter:
         data, new_cm = _get_data_new_cm(cm, febus, distance=distance, time=time)
         patch = dc.Patch(data=data, coords=new_cm, attrs=attr_cls(**attr))
         out.append(patch)
     return out


-def _scan_febus(h5, path, attr_cls=dc.PatchAttrs):
-    """Read the febus values into a patch."""
+def _scan_febus(h5, path, format_name, format_version, attr_cls=dc.PatchAttrs):
+    """Scan the febus file, returning patch summaries."""
     out = []
-    format = "febus"
-    version = _get_febus_version_str(h5)
-    for attr, cm, febus in _yield_attrs_coords(h5):
+    febiter = _yield_attrs_coords(h5, path, format_name, format_version)
+    for attr, cm, febus in febiter:
         data_summary = _get_data_summary(febus)
         patch = dc.PatchSummary(
             data=data_summary,
             coords=cm,
             attrs=attr_cls(**attr),
-            dims=cm.dims,
-            format=format,
-            version=version,
-            path=path,
         )
         out.append(patch)
-    return out
\ No newline at end of file
+    return out
diff --git a/dascore/io/gdr/core.py b/dascore/io/gdr/core.py
index 8fe7c90d..d436ec7a 100644
--- a/dascore/io/gdr/core.py
+++ b/dascore/io/gdr/core.py
@@ -17,6 +17,7 @@
     _maybe_trim_data,
 )
 from dascore.utils.hdf5 import H5Reader
+from dascore.utils.misc import get_path


 class GDRPatchAttrs(dc.PatchAttrs):
@@ -36,6 +37,15 @@ class GDR_V1(FiberIO):  # noqa
     preferred_extensions = ("hdf5", "h5")
     version = "1"

+    def _get_attr_coord_data(self, resource, snap=True):
+        """Get the attributes, coordinates, and h5 dataset."""
+        attr_dict, cm, data = _get_attrs_coords_and_data(resource, snap=snap)
+        attr_dict["path"] = get_path(resource)
+        attr_dict["format_name"] = self.name
+        attr_dict["format_version"] = self.version
+        attr = GDRPatchAttrs(**attr_dict)
+        return attr, cm, data
+
     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """Determine if the resource belongs to this format."""
         return _get_version(resource)
@@ -53,18 +63,18 @@ def read(self, resource: H5Reader, snap=True, **kwargs) -> SpoolType:
         **kwargs
             Passed to filtering coordinates.
         """
-        attr_dict, cm, data = _get_attrs_coords_and_data(resource, snap=snap)
+        attr, cm, data = self._get_attr_coord_data(resource, snap=snap)
         if kwargs:
             cm, data = _maybe_trim_data(cm, data, **kwargs)
-        attrs = GDRPatchAttrs(**attr_dict)
-        patch = dc.Patch(coords=cm, data=data[:], attrs=attrs)
+        patch = dc.Patch(coords=cm, data=data[:], attrs=attr)
         return dc.spool([patch])

-    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchSummary]:
         """Get the attributes of a resource belong to this type."""
-        attrs, cm, data = _get_attrs_coords_and_data(resource, snap)
-        attrs["coords"] = cm.to_summary_dict()
-        attrs["path"] = resource.filename
-        attrs["file_format"] = self.name
-        attrs["file_version"] = self.version
-        return [dc.PatchAttrs(**attrs)]
+        attr, cm, data = self._get_attr_coord_data(resource, snap=snap)
+        summary = dc.PatchSummary(
+            coords=cm,
+            data=data,
+            attrs=attr,
+        )
+        return [summary]
diff --git a/dascore/io/h5simple/core.py b/dascore/io/h5simple/core.py
index 5b0c0098..5f59ae9c 100644
--- a/dascore/io/h5simple/core.py
+++ b/dascore/io/h5simple/core.py
@@ -41,9 +41,12 @@ def read(self, resource: H5Reader, snap=True, **kwargs) -> SpoolType:
         patch = dc.Patch(coords=new_cm, data=new_data[:], attrs=attrs)
         return dc.spool([patch])

-    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchSummary]:
         """Get the attributes of a h5simple file."""
         attrs, cm, data = _get_attrs_coords_and_data(resource, snap, self)
-        attrs["coords"] = cm.to_summary_dict()
-        attrs["path"] = resource.filename
-        return [dc.PatchAttrs(**attrs)]
+        summary = dc.PatchSummary(
+            attrs=attrs,
+            coords=cm,
+            data=data,
+        )
+        return [summary]
diff --git a/dascore/io/neubrex/core.py b/dascore/io/neubrex/core.py
index 211699fa..21f46a67 100644
--- a/dascore/io/neubrex/core.py
+++ b/dascore/io/neubrex/core.py
@@ -12,6 +12,7 @@
 from dascore.constants import SpoolType
 from dascore.io import FiberIO
 from dascore.utils.hdf5 import H5Reader
+from dascore.utils.misc import get_path


 class NeubrexRFSPatchAttrs(dc.PatchAttrs):
@@ -49,6 +50,14 @@ class NeubrexRFSV1(FiberIO):
     preferred_extensions = ("hdf5", "h5")
     version = "1"

+    def _get_attrs(self, resource) -> NeubrexRFSPatchAttrs:
+        """Get the patch attributes."""
+        attr = rfs_utils._get_attr_dict(resource)
+        attr["path"] = get_path(resource)
+        attr["format_name"] = self.name
+        attr["format_version"] = self.version
+        return NeubrexRFSPatchAttrs(**attr)
+
     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """Determine if the resource belongs to this format."""
         if rfs_utils._is_neubrex(resource):
@@ -68,21 +77,21 @@ def read(self, resource: H5Reader, snap=True, **kwargs) -> SpoolType:
         **kwargs
             Passed to filtering coordinates.
         """
-        attr_dict, cm, data = rfs_utils._get_attrs_coords_and_data(resource, snap)
+        attrs = self._get_attrs(resource)
+        cm = rfs_utils._get_coord_manager(resource, snap=snap)
+        data = resource["data"]
         if kwargs:
             cm, data = rfs_utils._maybe_trim_data(cm, data, **kwargs)
-        attrs = NeubrexRFSPatchAttrs(**attr_dict)
         patch = dc.Patch(coords=cm, data=data[:], attrs=attrs)
         return dc.spool([patch])

-    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, snap=True, **kwargs) -> list[dc.PatchSummary]:
         """Get the attributes of a resource belong to this type."""
-        attrs, cm, data = rfs_utils._get_attrs_coords_and_data(resource, snap)
-        attrs["coords"] = cm.to_summary_dict()
-        attrs["path"] = resource.filename
-        attrs["file_format"] = self.name
-        attrs["file_version"] = self.version
-        return [dc.PatchAttrs(**attrs)]
+        attrs = self._get_attrs(resource)
+        cm = rfs_utils._get_coord_manager(resource, snap=snap)
+        data = resource["data"]
+        summary = dc.PatchSummary(coords=cm, data=data, attrs=attrs)
+        return [summary]


 class NeubrexDASV1(FiberIO):
@@ -94,6 +103,14 @@ class NeubrexDASV1(FiberIO):
     preferred_extensions = ("hdf5", "h5")
     version = "1"

+    def _get_attr(self, resource) -> NeubrexDASPatchAttrs:
+        """Get the attrs from the file."""
+        attr = das_utils._get_attr_dict(resource["Acoustic"])
+        attr["path"] = get_path(resource)
+        attr["format_name"] = self.name
+        attr["format_version"] = self.version
+        return NeubrexDASPatchAttrs(**attr)
+
     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """Determine if resource belongs to this format."""
         if das_utils._is_neubrex(resource):
@@ -108,23 +125,21 @@ def read(self, resource: H5Reader, **kwargs) -> SpoolType:
         ----------
         resource
             The open h5 object.
-        snap
-            If True, snap each coordinate to be evenly sampled.
         **kwargs
             Passed to filtering coordinates.
         """
-        attr_dict, cm, data = das_utils._get_attrs_coords_and_data(resource)
+        attrs = self._get_attr(resource)
+        cm = das_utils._get_coord_manager(resource["Acoustic"])
+        data = resource["Acoustic"]
        if kwargs:
             cm, data = das_utils._maybe_trim_data(cm, data, **kwargs)
-        attrs = NeubrexRFSPatchAttrs(**attr_dict)
         patch = dc.Patch(coords=cm, data=data[:], attrs=attrs)
         return dc.spool([patch])

-    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchAttrs]:
-        """Get the attributes of this format from File."""
-        attrs, cm, data = das_utils._get_attrs_coords_and_data(resource)
-        attrs["coords"] = cm.to_summary_dict()
-        attrs["path"] = resource.filename
-        attrs["file_format"] = self.name
-        attrs["file_version"] = self.version
-        return [dc.PatchAttrs(**attrs)]
+    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
+        """Get the summary of patches in this file."""
+        attrs = self._get_attr(resource)
+        cm = das_utils._get_coord_manager(resource["Acoustic"])
+        data = resource["Acoustic"]
+        summary = dc.PatchSummary(coords=cm, data=data, attrs=attrs)
+        return [summary]
diff --git a/dascore/io/neubrex/utils_das.py b/dascore/io/neubrex/utils_das.py
index d16f3210..8bc08c09 100644
--- a/dascore/io/neubrex/utils_das.py
+++ b/dascore/io/neubrex/utils_das.py
@@ -27,7 +27,7 @@
 def _get_time_coord(acoustic):
     """Get the time coordinate."""
     attrs = acoustic.attrs
-    # We havent encountered a time decimated file yet; raise over guess
+    # See issue #465 for why this is here.
     assert attrs["TimeDecimationFilter"] in {0, 1}, "not implemented"

     gps = unbyte(attrs["GPSTimeStamp(UTC)"])
diff --git a/dascore/io/optodas/__init__.py b/dascore/io/optodas/__init__.py
deleted file mode 100644
index 2076107f..00000000
--- a/dascore/io/optodas/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""
-Support for OptoDAS format.
-
-This is used by the OptoDAS interrogator made by Alcatel Submarine Networks.
-
-More info here: https://web.asn.com/
-"""
-from __future__ import annotations
-from .core import OptoDASV8
diff --git a/dascore/io/pickle/core.py b/dascore/io/pickle/core.py
index 80f62b51..f71f2b5b 100644
--- a/dascore/io/pickle/core.py
+++ b/dascore/io/pickle/core.py
@@ -4,8 +4,9 @@

 import pickle

-import dascore
+import dascore as dc
 from dascore.io import BinaryReader, BinaryWriter, FiberIO
+from dascore.utils.misc import get_path


 class PickleIO(FiberIO):
@@ -49,8 +50,14 @@ def get_format(self, resource: BinaryReader, **kwargs) -> tuple[str, str] | bool

     def read(self, resource: BinaryReader, **kwargs):
         """Read a Patch/Spool from disk."""
-        out = pickle.load(resource)
-        return dascore.spool(out)
+        patch: dc.Patch = pickle.load(resource)
+        # Add the relevant file info.
+        out = patch.update_attrs(
+            path=get_path(resource),
+            format_name=self.name,
+            format_version=self.version,
+        )
+        return dc.spool([out])

     def write(self, patch, resource: BinaryWriter, **kwargs):
         """Write a Patch/Spool to disk."""
diff --git a/dascore/io/prodml/core.py b/dascore/io/prodml/core.py
index 49b940cb..16355a5a 100644
--- a/dascore/io/prodml/core.py
+++ b/dascore/io/prodml/core.py
@@ -10,7 +10,13 @@
 from dascore.utils.models import UnitQuantity, UTF8Str

 from ...utils.hdf5 import H5Reader
-from .utils import _get_prodml_attrs, _get_prodml_version_str, _read_prodml
+from .utils import (
+    _get_data_coords,
+    _get_prodml_attrs,
+    _get_prodml_version_str,
+    _get_raw_node_dict,
+    _read_prodml,
+)


 class ProdMLPatchAttrs(dc.PatchAttrs):
@@ -43,16 +49,21 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         if version_str:
             return (self.name, version_str)

-    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a prodml file, return summary information about the file's contents."""
-        file_version = _get_prodml_version_str(resource)
-        extras = {
-            "path": resource.filename,
-            "file_format": self.name,
-            "file_version": str(file_version),
-        }
-        attrs = _get_prodml_attrs(resource, extras=extras)
-        return [ProdMLPatchAttrs(**x) for x in attrs]
+        attr_list = _get_prodml_attrs(resource, self.name, self.version)
+        acq = resource["Acquisition"]
+        nodes = list(_get_raw_node_dict(acq).values())
+        out = []
+        for attrs, node in zip(attr_list, nodes):
+            data, coords = _get_data_coords(attrs, node)
+            summary = dc.PatchSummary(
+                data=data,
+                attrs=ProdMLPatchAttrs(**attrs),
+                coords=coords,
+            )
+            out.append(summary)
+        return out

     def read(
         self,
@@ -63,7 +74,12 @@
     ) -> dc.BaseSpool:
         """Read a ProdML file."""
         patches = _read_prodml(
-            resource, time=time, distance=distance, attr_cls=ProdMLPatchAttrs
+            resource,
+            format_name=self.name,
+            format_version=self.version,
+            time=time,
+            distance=distance,
+            attr_cls=ProdMLPatchAttrs,
         )
         return dc.spool(patches)
diff --git a/dascore/io/prodml/utils.py b/dascore/io/prodml/utils.py
index b6f81c87..bac458b2 100644
--- a/dascore/io/prodml/utils.py
+++ b/dascore/io/prodml/utils.py
@@ -6,7 +6,7 @@
 from dascore.constants import VALID_DATA_TYPES
 from dascore.core.coordmanager import get_coord_manager
 from dascore.core.coords import get_coord
-from dascore.utils.misc import iterate, maybe_get_items, unbyte
+from dascore.utils.misc import get_path, iterate, maybe_get_items, unbyte

 # --- Getting format/version

@@ -83,7 +83,7 @@ def _get_data_unit_and_type(node):
     return out


-def _get_prodml_attrs(fi, extras=None) -> list[dict]:
+def _get_prodml_attrs(fi, format_name, format_version) -> list[dict]:
     """Scan a prodML file, return metadata."""
     _root_attrs = {
         "PulseWidth": "pulse_width",
@@ -99,21 +99,18 @@
     }
     acq = fi["Acquisition"]
     base_info = maybe_get_items(acq.attrs, _root_attrs)
     d_coord = _get_distance_coord(acq)
     raw_nodes = _get_raw_node_dict(acq)
+    path = get_path(fi)
     # Iterate each raw data node. I have only ever seen 1 in a file but since
     # it is indexed like Raw[0] there might be more.
     out = []
     for node in raw_nodes.values():
         info = dict(base_info)
         t_coord = _get_time_coord(node)
-        info.update(t_coord.get_attrs_dict("time"))
         info.update(_get_data_unit_and_type(node))
-        info["dims"] = _get_dims(node)
-        if extras is not None:
-            info.update(extras)
         info["coords"] = {"time": t_coord, "distance": d_coord}
+        info["path"] = path
+        info["format_name"] = format_name
+        info["format_version"] = format_version
         out.append(info)
     return out
@@ -133,21 +130,25 @@ def _get_dims(node):
     return dims


-def _get_data_attr(attrs, node, time, distance):
-    """Get a new attributes with adjusted time/distance and data array."""
+def _get_data_coords(attrs, node, time=None, distance=None):
+    """Get the data array with coordinates."""
     dims = _get_dims(node)
     cm = get_coord_manager(attrs["coords"], dims=dims)
-    new_cm, data = cm.select(array=node["RawData"], time=time, distance=distance)
-    return data, new_cm
+    data = node["RawData"]
+    if time is not None or distance is not None:
+        cm, data = cm.select(array=node["RawData"], time=time, distance=distance)
+    return data, cm


-def _read_prodml(fi, distance=None, time=None, attr_cls=dc.PatchAttrs):
+def _read_prodml(
+    fi, format_name, format_version, distance=None, time=None, attr_cls=dc.PatchAttrs
+):
     """Read the prodml values into a patch."""
-    attr_list = _get_prodml_attrs(fi)
+    attr_list = _get_prodml_attrs(fi, format_name, format_version)
     nodes = list(_get_raw_node_dict(fi["Acquisition"]).values())
     out = []
     for attrs, node in zip(attr_list, nodes):
-        data, coords = _get_data_attr(attrs, node, time, distance)
+        data, coords = _get_data_coords(attrs, node, time, distance)
         if data.size:
             pattrs = attr_cls(**attrs)
             out.append(dc.Patch(data=data, attrs=pattrs, coords=coords))
diff --git a/dascore/io/segy/core.py b/dascore/io/segy/core.py
index a44ceab8..a78b66a8 100644
--- a/dascore/io/segy/core.py
+++ b/dascore/io/segy/core.py
@@ -5,11 +5,11 @@
 import dascore as dc
 from dascore.io.core import FiberIO
 from dascore.utils.io import BinaryReader
-from dascore.utils.misc import optional_import
+from dascore.utils.misc import get_path, optional_import

 from .utils import (
-    _get_attrs,
     _get_coords,
+    _get_data_summary,
     _get_filtered_data_and_coords,
     _get_segy_version,
     _write_segy,
@@ -28,6 +28,15 @@ class SegyV1_0(FiberIO):  # noqa
     # subclassed and this changed for debugging reasons.
     _package_name = "segyio"

+    def _get_attrs(self, resource):
+        """Get the basic attributes for a segy file."""
+        info = dict(
+            path=get_path(resource),
+            format_name=self.name,
+            format_version=self.version,
+        )
+        return dc.PatchAttrs.model_validate(info)
+
     def get_format(self, fp: BinaryReader, **kwargs) -> tuple[str, str] | bool:
         """Make sure input is segy."""
         return _get_segy_version(fp)
@@ -41,18 +50,17 @@ def read(self, path, time=None, channel=None, **kwargs):
             be implemented as well.
         """
         segyio = optional_import(self._package_name)
+        attrs = self._get_attrs(path)
         with segyio.open(path, ignore_geometry=True) as fi:
             coords = _get_coords(fi)
-            attrs = _get_attrs(fi, coords, path, self)
             data, coords = _get_filtered_data_and_coords(
                 fi, coords, time=time, channel=channel
             )
-
         patch = dc.Patch(coords=coords, data=data, attrs=attrs)
         patch_trimmed = patch.select(time=time, channel=channel)
         return dc.spool([patch_trimmed])

-    def scan(self, path, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, path, **kwargs) -> list[dc.PatchSummary]:
         """
         Used to get metadata about a file without reading the whole file.
@@ -62,10 +70,12 @@
             format-specific subclass.
         """
         segyio = optional_import(self._package_name)
+        attrs = self._get_attrs(path)
         with segyio.open(path, ignore_geometry=True) as fi:
             coords = _get_coords(fi)
-            attrs = _get_attrs(fi, coords, path, self)
-        return [attrs]
+            data_summary = _get_data_summary(fi)
+            summary = dc.PatchSummary(coords=coords, data=data_summary, attrs=attrs)
+        return [summary]

     def write(self, spool: dc.Patch | dc.BaseSpool, resource, **kwargs):
         """
diff --git a/dascore/io/segy/utils.py b/dascore/io/segy/utils.py
index 0b44ac48..a2f94d80 100644
--- a/dascore/io/segy/utils.py
+++ b/dascore/io/segy/utils.py
@@ -128,17 +128,6 @@ def _get_coords(fi):
     return coords


-def _get_attrs(fi, coords, path, file_io):
-    """Create Patch Attribute from SEGY header contents."""
-    attrs = dc.PatchAttrs(
-        path=path,
-        file_version=file_io.version,
-        file_format=file_io.name,
-        coords=coords,
-    )
-    return attrs
-
-
 def _get_time_from_header(header):
     """Creates a datetime64 object from SEGY header date information."""
     segyio = optional_import("segyio")
@@ -231,6 +220,11 @@ def _make_time_header_dict(time_coord):
     return header


+def _get_data_summary(fi):
+    """Get the data summary of the array contained in the segy file."""
+    import numpy as np
+
+    from dascore.utils.models import ArraySummary
+
+    # Summarize the (trace, time) array shape without loading trace data;
+    # segyio represents trace samples as float32 in memory.
+    shape = (fi.tracecount, len(fi.samples))
+    return ArraySummary(shape=shape, dtype=np.float32, ndim=2)
+
+
 def _write_segy(spool, resource, version, segyio):
     """
     Private function for writing a patch/spool as SEGY.
diff --git a/dascore/io/sentek/core.py b/dascore/io/sentek/core.py
index 1d49e1d1..c336b48b 100644
--- a/dascore/io/sentek/core.py
+++ b/dascore/io/sentek/core.py
@@ -7,6 +7,8 @@
 import dascore as dc
 from dascore.io import BinaryReader
 from dascore.io.core import FiberIO
+from dascore.utils.misc import get_path
+from dascore.utils.models import ArraySummary

 from .utils import _get_patch_attrs, _get_version

@@ -18,6 +20,17 @@ class SentekV5(FiberIO):
     version = "5"
     preferred_extensions = ("das",)

+    def _get_attrs_coords_offsets(self, resource):
+        """Get attributes, coordinates, and data offsets from file."""
+        attrs_dict, coords, offsets = _get_patch_attrs(
+            resource,
+            path=get_path(resource),
+            format_name=self.name,
+            format_version=self.version,
+        )
+        attrs = dc.PatchAttrs(**attrs_dict)
+        return attrs, coords, offsets
+
     def read(
         self,
         resource: BinaryReader,
@@ -26,7 +39,7 @@
         **kwargs,
     ) -> dc.BaseSpool:
         """Read a Sentek das file, return a DataArray."""
-        attrs, coords, offsets = _get_patch_attrs(resource)
+        attrs, coords, offsets = self._get_attrs_coords_offsets(resource)
         resource.seek(offsets[0])
         array = np.fromfile(resource, dtype=np.float32, count=offsets[1] * offsets[2])
         array = np.reshape(array, (offsets[1], offsets[2])).T
@@ -42,10 +55,8 @@ def get_format(self, resource: BinaryReader, **kwargs) -> tuple[str, str] | bool

     def scan(self, resource: BinaryReader, **kwargs):
         """Extract metadata from sentek file."""
-        extras = {
-            "file_format": self.name,
-            "file_version": self.version,
-            "path": resource.name,
-        }
-
-        return [_get_patch_attrs(resource, extras=extras)[0]]
+        attrs, coords, offsets = self._get_attrs_coords_offsets(resource)
+        shape = (offsets[2], offsets[1])
+        data_summary = ArraySummary(shape=shape, dtype=np.float32, ndim=2)
+        summary = dc.PatchSummary(coords=coords, attrs=attrs, data=data_summary)
+        return [summary]
diff --git a/dascore/io/sentek/utils.py b/dascore/io/sentek/utils.py
index a112014a..5cd3fa08 100644
--- a/dascore/io/sentek/utils.py
+++ b/dascore/io/sentek/utils.py
@@ -8,13 +8,14 @@

 import dascore as dc
 from dascore.core import get_coord, get_coord_manager
+from dascore.utils.misc import get_path


 def _get_version(fid):
     """Determine if Sentek file."""
-    name = fid.name
+    path = get_path(fid)
     # Sentek files cannot change the extension, or file name.
-    sw_data = name.endswith(".das")
+    sw_data = path.endswith(".das")
     fid.seek(0)
     # There isn't anything in the header particularly useful for determining
     # if it is a Sentek file, so we do what we can here.
@@ -31,7 +32,7 @@
     strain_rate = int(np.fromfile(fid, dtype=np.float32, count=1)[0])
     proper_strain_rate = strain_rate in {0, 1}
     # Note: We will need to modify this later for different versions of the
-    # sentek data, but for now we only support 5.
+    # Sentek data, but for now we only support 5.
     if sw_data and is_positive and proper_strain_rate and nearly_ints:
         return ("sentek", "5")
     return False
@@ -53,7 +54,7 @@ def _get_time_from_file_name(name) -> np.datetime64:
     return np.datetime64(iso)


-def _get_patch_attrs(fid, extras=None):
+def _get_patch_attrs(fid, path, format_name, format_version):
     """Extracts patch metadata.
     A few important fields in the header and their meaning:
@@ -92,8 +93,8 @@
     coord_manager = get_coord_manager(
         {"time": time, "distance": dist}, dims=("distance", "time")
     )
-    attrs = dc.PatchAttrs(
-        coords=coord_manager, data_type=data_type, **({} if extras is None else extras)
+    attrs = dict(
+        data_type=data_type, path=path, format_name=format_name, format_version=format_version
     )
     offsets = fid.tell(), int(measurement_count), int(sensor_num)
     return attrs, coord_manager, offsets
diff --git a/dascore/io/silixah5/core.py b/dascore/io/silixah5/core.py
index b1fd5256..aece1068 100644
--- a/dascore/io/silixah5/core.py
+++ b/dascore/io/silixah5/core.py
@@ -7,11 +7,11 @@
 import numpy as np

 import dascore as dc
+import dascore.io.silixah5.utils as util
 from dascore.constants import opt_timeable_types
 from dascore.io import FiberIO
 from dascore.utils.hdf5 import H5Reader
-
-from .utils import _get_attr, _get_patch, _get_version_string
+from dascore.utils.misc import get_path


 class SilixaPatchAttrs(dc.PatchAttrs):
@@ -30,6 +30,14 @@ class SilixaH5V1(FiberIO):
     preferred_extensions = ("hdf5", "h5")
     version = "1"

+    def _get_attr_coords(self, resource):
+        """Get attributes and coordinates of patch in file."""
+        info, coords = util._get_attr_coords(resource)
+        info["path"] = get_path(resource)
+        info["format_name"] = self.name
+        info["format_version"] = self.version
+        return SilixaPatchAttrs(**info), coords
+
     def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         """
         Return name and version string if Silixa hdf5 else False.
@@ -39,20 +47,16 @@
         resource
             A path to the file which may contain terra15 data.
         """
-        version_str = _get_version_string(resource, self.version)
+        version_str = util._get_version_string(resource, self.version)
         if version_str:
             return self.name, version_str

-    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a Silixa HDF5 file, return summary information on the contents."""
-        file_version = _get_version_string(resource, self.version)
-        extras = {
-            "path": resource.filename,
-            "file_format": self.name,
-            "file_version": str(file_version),
-        }
-        attrs = _get_attr(resource, SilixaPatchAttrs, extras=extras)
-        return [attrs]
+        attrs, coords = self._get_attr_coords(resource)
+        data = resource["Acoustic"]
+        summary = dc.PatchSummary(data=data, attrs=attrs, coords=coords)
+        return [summary]

     def read(
         self,
@@ -62,7 +66,9 @@
         **kwargs,
     ) -> dc.BaseSpool:
         """Read a single file with Silixa H5 data inside."""
-        patches = _get_patch(
-            resource, time=time, distance=distance, attr_cls=SilixaPatchAttrs
-        )
-        return dc.spool(patches)
+        attrs, coords = self._get_attr_coords(resource)
+        data = resource["Acoustic"]
+        if time is not None or distance is not None:
+            coords, data = coords.select(array=data, time=time, distance=distance)
+        patch = dc.Patch(data=data[:], coords=coords, attrs=attrs)
+        return dc.spool([patch])
diff --git a/dascore/io/silixah5/utils.py b/dascore/io/silixah5/utils.py
index 0dde293e..cdf0564c 100644
--- a/dascore/io/silixah5/utils.py
+++ b/dascore/io/silixah5/utils.py
@@ -83,28 +83,17 @@ def _get_coords(attrs_dict, shape):
     return cm


-def _get_attr_dict(resource):
-    """Get the attribute map."""
+def _get_attr_coords(resource):
+    """Get a dict of attributes and the coordinate manager."""
     ds = resource["Acoustic"]
     attrs_dict = maybe_get_items(ds.attrs, _ATTR_MAP)
-    attrs_dict["coords"] = _get_coords(attrs_dict, ds.shape)
-    return attrs_dict
-
-
-def _get_attr(resource, attr_cls, extras=None):
-    """Get the attribute class"""
-    attrs = _get_attr_dict(resource)
-    expected_fields = set(attr_cls.model_fields)
-    attrs_sub = {i: v for i, v in attrs.items() if i in expected_fields}
-    attrs_sub.update(extras if extras else {})
-    attrs = attr_cls.model_validate(attrs_sub)
-    return attrs
+    coords = _get_coords(attrs_dict, ds.shape)
+    return attrs_dict, coords


 def _get_patch(resource, time=None, distance=None, attr_cls=dc.PatchAttrs):
-    """Get a patch from ap_sensing file."""
-    attrs = _get_attr_dict(resource)
-    coords = attrs["coords"]
+    """Get a patch from a Silixa hdf5 file."""
+    attrs, coords = _get_attr_coords(resource)
     data = resource["Acoustic"]
     if time is not None or distance is not None:
         coords, data = coords.select(array=data, time=time, distance=distance)
diff --git a/dascore/io/tdms/core.py b/dascore/io/tdms/core.py
index b2179916..88d3df9d 100644
--- a/dascore/io/tdms/core.py
+++ b/dascore/io/tdms/core.py
@@ -6,8 +6,9 @@
+import numpy as np
+
 from dascore.constants import timeable_types
 from dascore.core import Patch
 from dascore.io import BinaryReader, FiberIO
+from dascore.utils.misc import get_path
+from dascore.utils.models import ArraySummary

-from .utils import _get_data, _get_default_attrs, _get_version_str
+from .utils import _get_attrs_coords, _get_data, _get_version_str


 class TDMSFormatterV4713(FiberIO):
@@ -18,6 +19,14 @@ class TDMSFormatterV4713(FiberIO):
     preferred_extensions = ("tdms",)
     lead_in_length = 28

+    def _get_attr_coords(self, resource):
+        """Get a PatchAttrs for the file."""
+        out, coords, _ = _get_attrs_coords(resource)
+        out["path"] = get_path(resource)
+        out["format_name"] = self.name
+        out["format_version"] = self.version
+        return dc.PatchAttrs(**out), coords
+
     def get_format(self, stream: BinaryReader, **kwargs) -> tuple[str, str] | bool:
         """
         Return a tuple of (TDMS, version) if TDMS else False.
@@ -36,13 +45,10 @@
         except Exception:
             return False

-    def scan(self, resource: BinaryReader, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: BinaryReader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a tdms file, return summary information about the file's contents."""
-        out = _get_default_attrs(resource)
-        out["path"] = getattr(resource, "name", "")
-        out["file_format"] = self.name
-        out["file_version"] = self.version
-        return [dc.PatchAttrs(**out)]
+        attrs, coords = self._get_attr_coords(resource)
+        # The data array is not read during scan; summarize its expected
+        # shape from the coords (float32 samples are assumed here).
+        data_summary = ArraySummary(shape=coords.shape, dtype=np.float32, ndim=2)
+        return [dc.PatchSummary(data=data_summary, attrs=attrs, coords=coords)]

     def read(
         self,
@@ -54,10 +60,9 @@
         """Read a silixa tdms file, return a DataArray."""
         # get all data, total amount of samples and associated attributes
         data, channel_length, attrs_full = _get_data(resource, lead_in_length=28)
-        attrs = _get_default_attrs(resource, attrs_full)
-        coords = dc.core.get_coord_manager(attrs.pop("coords"))
+        attrs, coords = self._get_attr_coords(resource)
         # trim data if required
         if time is not None or distance is not None:
             coords, data = coords.select(data, time=time, distance=distance)
         patch = Patch(data=data, coords=coords, attrs=attrs)
-        return dc.spool(patch)
+        return dc.spool([patch])
diff --git a/dascore/io/tdms/utils.py b/dascore/io/tdms/utils.py
index 7e3a0f96..ebe59c01 100644
--- a/dascore/io/tdms/utils.py
+++ b/dascore/io/tdms/utils.py
@@ -9,6 +9,7 @@

 import numpy as np

+import dascore as dc
 from dascore.core.attrs import PatchAttrs
 from dascore.core.coords import get_coord
 from dascore.utils.time import to_datetime64, to_timedelta64
@@ -124,18 +125,6 @@ def _get_time_coord(attrs, num_samps):
     return coord


-def _get_default_attrs(tdms_file, attrs=None):
-    """Return the required/default attributes which can be fetched from attributes."""
-    all_attrs = attrs if attrs is not None else _get_all_attrs(tdms_file)[0]
-    # cull attributes to only include defaults (TODO: think about why?)
-    out = {
-        default_attr: all_attrs[default_attr]
-        for default_attr in DEFAULT_ATTRS
-        if default_attr in all_attrs
-    }
-    return out
-
-
 def _read_attr(tdms_file):
     """
     Read a single property from the TDMS file.
@@ -169,7 +158,7 @@ def _get_distance_coord(attr):
     return d_coord


-def _get_all_attrs(tdms_file, lead_in_length=28):
+def _get_attrs_coords(tdms_file, lead_in_length=28):
     """Return all the attributes which can be fetched from attributes."""
     # read leadin infomation into fileinfo
     lead_in = tdms_file.read(lead_in_length)
@@ -227,21 +216,23 @@
         / np.dtype(fileinfo["data_type"]).itemsize
     )
     t_coord = _get_time_coord(out, numofsamples)
-    out["coords"] = {"time": t_coord, "distance": d_coord}
-    out.update(t_coord.get_attrs_dict("time"))
-    out.update(d_coord.get_attrs_dict("distance"))
-    return out, fileinfo
+    coord = dc.core.get_coord_manager({"time": t_coord, "distance": d_coord})
+    return out, coord, fileinfo


 def _get_fileinfo(tdms_file, lead_in_length=28):
     """Get info about file not included in the attributes."""
-    attrs, fileinfo = _get_all_attrs(tdms_file)
+    attrs, _, fileinfo = _get_attrs_coords(tdms_file)
     # Read Dimension of the raw data array (has to be 1):
     _ = struct.unpack(" tuple[str, str] | bool:
         """
         Return True if file contains terra15 version 2 data else False.
@@ -35,15 +62,10 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool:
         if version_str:
             return (self.name, version_str)

-    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchAttrs]:
+    def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]:
         """Scan a terra15 v2 file, return summary information."""
-        version, data_node = _get_version_data_node(resource)
-        extras = {
-            "path": resource.filename,
-            "file_format": self.name,
-            "file_version": str(version),
-        }
-        return _scan_terra15(resource, data_node, extras)
+        attrs, cm, data = self._get_attrs_coords_data_node(resource)
+        return [dc.PatchSummary(attrs=attrs, coords=cm, data=data)]

     def read(
         self,
@@ -70,7 +92,7 @@
         negligible.
         """
         patch = _read_terra15(resource, time, distance, snap_dims=snap_dims)
-        return dc.spool(patch)
+        return dc.spool([patch])


 class Terra15FormatterV5(Terra15FormatterV4):
diff --git a/dascore/io/terra15/utils.py b/dascore/io/terra15/utils.py
index 85e8327c..ddbaa5a9 100644
--- a/dascore/io/terra15/utils.py
+++ b/dascore/io/terra15/utils.py
@@ -2,7 +2,6 @@

 from __future__ import annotations

-import dascore as dc
 from dascore.constants import timeable_types
 from dascore.core import Patch
 from dascore.core.coordmanager import get_coord_manager
@@ -79,18 +78,6 @@ def _get_version_data_node(root):
     return version, data_node


-def _scan_terra15(h5_fi, data_node, extras=None):
-    """Scan a terra15 file, return metadata."""
-    out = extras
-    out.update(_get_default_attrs(h5_fi.attrs))
-    coords = {
-        "time": _get_time_coord(data_node, snap_dims=True),
-        "distance": _get_distance_coord(h5_fi),
-    }
-    out["coords"] = coords
-    return [dc.PatchAttrs(**out)]
-
-
 # --- Reading patch


@@ -100,6 +87,10 @@ def _get_raw_time_coord(data_node):
     return get_coord(data=to_datetime64(time))


+def _get_coord_manager(h5):
+    """Get the coordinate manager from the terra15 file."""
+    # Mirrors the coords built by the removed _scan_terra15; the dims
+    # order (time, distance) is assumed from the terra15 data layout.
+    _, data_node = _get_version_data_node(h5)
+    coords = {
+        "time": _get_time_coord(data_node, snap_dims=True),
+        "distance": _get_distance_coord(h5),
+    }
+    return get_coord_manager(coords, dims=("time", "distance"))
+
+
 def _read_terra15(
     pyfi,
     time: tuple[timeable_types, timeable_types] | None = None,
@@ -147,9 +138,6 @@ def _get_default_attrs(root_node_attrs):
     """
     Return the required/default attributes which can be fetched from attributes.
-
-    Note: missing time, distance absolute ranges. Downstream functions should handle
-    this.
     """
     out = {}
     _root_attrs = {
diff --git a/dascore/utils/misc.py b/dascore/utils/misc.py
index 266fc005..a6d42e4d 100644
--- a/dascore/utils/misc.py
+++ b/dascore/utils/misc.py
@@ -745,3 +745,18 @@ def to_object_array(object_sequence):
     out = np.empty(len(object_sequence), dtype=object)
     out[:] = object_sequence
     return out
+
+
+def get_path(obj) -> str:
+    """
+    Get the path string of an object.
+
+    Parameters
+    ----------
+    obj
+        An object that represents a path to a resource.
+    """
+    # Handle open resources: h5py/pytables files store their path under
+    # filename, while open python file objects store it under name.
+    if hasattr(obj, "filename"):
+        return str(obj.filename)
+    if hasattr(obj, "read") and hasattr(obj, "name"):
+        return str(obj.name)
+    return str(obj)
diff --git a/pyproject.toml b/pyproject.toml
index 5ed0bb88..ccbc857b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,7 +110,7 @@
 FEBUS__V1 = "dascore.io.febus.core:Febus1"
 FEBUS__V2 = "dascore.io.febus.core:Febus2"
 NEUBREXRFS__V1 = "dascore.io.neubrex:NeubrexRFSV1"
 NEUBREXDAS__V1 = "dascore.io.neubrex:NeubrexDASV1"
-OPTODAS__V8 = "dascore.io.optodas.core:OptoDASV8"
+OPTODAS__V8 = "dascore.io.asn.core:OptoDASV8"
 PICKLE = "dascore.io.pickle.core:PickleIO"
 PRODML__V2_0 = "dascore.io.prodml.core:ProdMLV2_0"
 PRODML__V2_1 = "dascore.io.prodml.core:ProdMLV2_1"
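---

A minimal usage sketch (reviewer note, not part of the patch) of how the reworked `scan` API is expected to behave after this change; the file name and printed values below are hypothetical:

```python
import dascore as dc

# scan now yields PatchSummary objects rather than PatchAttrs. Each summary
# bundles the coordinates, the attrs (which carry the new provenance fields
# path/format_name/format_version added in dascore/core/attrs.py), and an
# ArraySummary describing the data array without loading it.
(summary,) = dc.scan("example_das_file.hdf5")  # hypothetical single-patch file
print(summary.attrs.path)            # where the patch data came from
print(summary.attrs.format_name)     # e.g. "DASDAE"
print(summary.attrs.format_version)  # e.g. "1"
print(summary.data.shape)            # shape/dtype metadata only, no data read
```

Because every FiberIO subclass now populates these fields through `get_path` and its own `name`/`version`, downstream indexing no longer needs the per-format `path`/`file_format`/`file_version` extras dictionaries this patch removes.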