diff --git a/dascore/io/ap_sensing/utils.py b/dascore/io/ap_sensing/utils.py index 07d6ecd3..4aff1fa4 100644 --- a/dascore/io/ap_sensing/utils.py +++ b/dascore/io/ap_sensing/utils.py @@ -4,7 +4,8 @@ import dascore as dc from dascore.core import get_coord, get_coord_manager -from dascore.utils.misc import _maybe_unpack, get_path, unbyte +from dascore.utils.misc import _maybe_unpack, unbyte +from dascore.utils.fs import get_uri def _get_version_string(resource): @@ -83,7 +84,7 @@ def _get_attrs_dict(resource, format_name): instrumet_id=unbyte(_maybe_unpack(daq["SerialNumber"])), gauge_length=_maybe_unpack(pserver["GaugeLength"]), radians_to_nano_strain=_maybe_unpack(pserver["RadiansToNanoStrain"]), - path=get_path(resource), + path=get_uri(resource), format_name=format_name, format_version=version, ) diff --git a/dascore/io/asn/utils.py b/dascore/io/asn/utils.py index 5c54168f..3ac493d3 100644 --- a/dascore/io/asn/utils.py +++ b/dascore/io/asn/utils.py @@ -6,7 +6,9 @@ import dascore.core from dascore.core.coords import get_coord from dascore.utils.hdf5 import unpack_scalar_h5_dataset -from dascore.utils.misc import get_path, unbyte +from dascore.utils.misc import unbyte +from dascore.utils.fs import get_uri + # --- Getting format/version @@ -78,7 +80,7 @@ def _get_attr_dict(header, path, format_name, format_version): def _get_opto_das_coords_attrs(fi, format_name) -> tuple[dc.CoordManager, dict]: """Scan a OptoDAS file, return metadata.""" cm = _get_coord_manager(fi) - path = get_path(fi) + path = get_uri(fi) version = _get_opto_das_version_str(fi) attrs = _get_attr_dict(fi["header"], path, format_name, version) return cm, attrs diff --git a/dascore/io/core.py b/dascore/io/core.py index 69c70adc..ca449609 100644 --- a/dascore/io/core.py +++ b/dascore/io/core.py @@ -38,7 +38,8 @@ ) from dascore.utils.io import IOResourceManager, get_handle_from_resource from dascore.utils.mapping import FrozenDict -from dascore.utils.misc import _iter_filesystem, cached_method, iterate, warn_or_raise +from dascore.utils.misc import cached_method, iterate, warn_or_raise +from dascore.utils.fs import _iter_filesystem from dascore.utils.models import ( CommaSeparatedStr, DascoreBaseModel, diff --git a/dascore/io/dasdae/core.py b/dascore/io/dasdae/core.py index c638e8ca..64197028 100644 --- a/dascore/io/dasdae/core.py +++ b/dascore/io/dasdae/core.py @@ -9,7 +9,8 @@ H5Reader, H5Writer, ) -from dascore.utils.misc import get_path, unbyte +from dascore.utils.misc import unbyte +from ...utils.fs import get_uri from dascore.utils.patch import get_patch_names from .utils import ( @@ -88,7 +89,7 @@ def get_format(self, resource: H5Reader, **kwargs) -> tuple[str, str] | bool: def read(self, resource: H5Reader, **kwargs) -> SpoolType: """Read a DASDAE file.""" patches = [] - path = get_path(resource) + path = get_uri(resource) format_version = unbyte(resource.attrs["__DASDAE_version__"]) format_name = self.name try: diff --git a/dascore/io/dasdae/utils.py b/dascore/io/dasdae/utils.py index 7e1bf914..41995ee3 100644 --- a/dascore/io/dasdae/utils.py +++ b/dascore/io/dasdae/utils.py @@ -9,7 +9,8 @@ from dascore.core.coordmanager import get_coord_manager from dascore.core.coords import get_coord from dascore.utils.hdf5 import Empty -from dascore.utils.misc import get_path, suppress_warnings, unbyte +from dascore.utils.misc import suppress_warnings, unbyte +from dascore.utils.fs import get_uri from dascore.utils.time import to_int # --- Functions for writing DASDAE format @@ -189,7 +190,7 @@ def _read_patch(patch_group, path, format_name, format_version, **kwargs): def _get_summary_from_patch_groups(h5, format_name="DASDAE"): """Get the contents from each patch group.""" - path = get_path(h5) + path = get_uri(h5) format_version = h5.attrs["__DASDAE_version__"] out = [] for name, group in h5[("/waveforms")].items(): diff --git a/dascore/io/dashdf5/core.py b/dascore/io/dashdf5/core.py index f2e361d7..a0e86ae5 100644 --- a/dascore/io/dashdf5/core.py +++ b/dascore/io/dashdf5/core.py @@ -6,7 +6,7 @@ from dascore.constants import opt_timeable_types from dascore.io import FiberIO from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from ...utils.fs import get_uri from .utils import _get_cf_attrs, _get_cf_coords, _get_cf_version_str @@ -21,7 +21,7 @@ class DASHDF5(FiberIO): def _get_attr(self, resource: H5Reader): """Get the attrs dict with path and such populated.""" attrs = _get_cf_attrs(resource) - attrs["path"] = get_path(resource) + attrs["path"] = get_uri(resource) attrs["format_name"] = self.name attrs["format_version"] = self.version return dc.PatchAttrs.model_validate(attrs) diff --git a/dascore/io/febus/core.py b/dascore/io/febus/core.py index 8975c15f..b9454e32 100644 --- a/dascore/io/febus/core.py +++ b/dascore/io/febus/core.py @@ -10,7 +10,7 @@ from dascore.constants import opt_timeable_types from dascore.io import FiberIO from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from ...utils.fs import get_uri from dascore.utils.models import UTF8Str from .utils import ( @@ -71,7 +71,7 @@ def scan(self, resource: H5Reader, **kwargs) -> list[dc.PatchSummary]: """Scan a febus file, return summary information about the file's contents.""" return _scan_febus( resource, - path=get_path(resource), + path=get_uri(resource), format_name=self.name, format_version=self.version, attr_cls=FebusPatchAttrs, @@ -87,7 +87,7 @@ def read( """Read a febus spool of patches.""" patches = _read_febus( resource, - path=get_path(resource), + path=get_uri(resource), format_name=self.name, format_version=self.version, time=time, diff --git a/dascore/io/gdr/core.py b/dascore/io/gdr/core.py index d436ec7a..4ceda4fb 100644 --- a/dascore/io/gdr/core.py +++ b/dascore/io/gdr/core.py @@ -17,7 +17,7 @@ _maybe_trim_data, ) from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from dascore.utils.fs import get_uri class GDRPatchAttrs(dc.PatchAttrs): @@ -40,7 +40,7 @@ class GDR_V1(FiberIO): # noqa def _get_attr_coord_data(self, resource, snap=True): """Get the attributes, coordinates, and h5 dataset.""" attr_dict, cm, data = _get_attrs_coords_and_data(resource, snap=snap) - attr_dict["path"] = get_path(resource) + attr_dict["path"] = get_uri(resource) attr_dict["format_name"] = self.name attr_dict["version"] = self.version attr = GDRPatchAttrs(**attr_dict) diff --git a/dascore/io/indexer.py b/dascore/io/indexer.py index 78a17897..90579abe 100644 --- a/dascore/io/indexer.py +++ b/dascore/io/indexer.py @@ -18,7 +18,6 @@ import dascore as dc from dascore.constants import ONE_SECOND_IN_NS, PROGRESS_LEVELS from dascore.exceptions import InvalidIndexVersionError -from dascore.utils.hdf5 import HDFPatchIndexManager from dascore.utils.misc import iterate from dascore.utils.pd import filter_df from dascore.utils.time import get_max_min_times, to_timedelta64 diff --git a/dascore/io/neubrex/core.py b/dascore/io/neubrex/core.py index 21f46a67..50037c7b 100644 --- a/dascore/io/neubrex/core.py +++ b/dascore/io/neubrex/core.py @@ -12,7 +12,7 @@ from dascore.constants import SpoolType from dascore.io import FiberIO from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from dascore.utils.fs import get_uri class NeubrexRFSPatchAttrs(dc.PatchAttrs): @@ -53,7 +53,7 @@ class NeubrexRFSV1(FiberIO): def _get_attrs(self, resource) -> NeubrexRFSPatchAttrs: """Get the patch attributes.""" attr = rfs_utils._get_attr_dict(resource) - attr["path"] = get_path(resource) + attr["path"] = get_uri(resource) attr["format_name"] = self.name attr["format_version"] = self.version return NeubrexRFSPatchAttrs(**attr) @@ -106,7 +106,7 @@ class NeubrexDASV1(FiberIO): def _get_attr(self, resource) -> NeubrexDASPatchAttrs: """Get the attrs for from the file.""" attr = das_utils._get_attr_dict(resource["Acoustic"]) - attr["path"] = get_path(resource) + attr["path"] = get_uri(resource) attr["format_name"] = self.name attr["format_version"] = self.version return NeubrexDASPatchAttrs(**attr) diff --git a/dascore/io/pickle/core.py b/dascore/io/pickle/core.py index f71f2b5b..3ac4edef 100644 --- a/dascore/io/pickle/core.py +++ b/dascore/io/pickle/core.py @@ -6,7 +6,7 @@ import dascore as dc from dascore.io import BinaryReader, BinaryWriter, FiberIO -from dascore.utils.misc import get_path +from dascore.utils.fs import get_uri class PickleIO(FiberIO): @@ -53,7 +53,7 @@ def read(self, resource: BinaryReader, **kwargs): patch: dc.Patch = pickle.load(resource) # Add the relevant file info. out = patch.update_attrs( - path=get_path(resource), + path=get_uri(resource), format_name=self.name, format_version=self.version, ) diff --git a/dascore/io/prodml/utils.py b/dascore/io/prodml/utils.py index bac458b2..5c1b0406 100644 --- a/dascore/io/prodml/utils.py +++ b/dascore/io/prodml/utils.py @@ -6,7 +6,8 @@ from dascore.constants import VALID_DATA_TYPES from dascore.core.coordmanager import get_coord_manager from dascore.core.coords import get_coord -from dascore.utils.misc import get_path, iterate, maybe_get_items, unbyte +from dascore.utils.misc import iterate, maybe_get_items, unbyte +from dascore.utils.fs import get_uri # --- Getting format/version @@ -100,7 +101,7 @@ def _get_prodml_attrs(fi, format_name, format_version) -> list[dict]: acq = fi["Acquisition"] base_info = maybe_get_items(acq.attrs, _root_attrs) raw_nodes = _get_raw_node_dict(acq) - path = get_path(fi) + path = get_uri(fi) # Iterate each raw data node. I have only ever seen 1 in a file but since # it is indexed like Raw[0] there might be more. diff --git a/dascore/io/segy/core.py b/dascore/io/segy/core.py index a78b66a8..ffc6286c 100644 --- a/dascore/io/segy/core.py +++ b/dascore/io/segy/core.py @@ -5,7 +5,8 @@ import dascore as dc from dascore.io.core import FiberIO from dascore.utils.io import BinaryReader -from dascore.utils.misc import get_path, optional_import +from dascore.utils.misc import optional_import +from ...utils.fs import get_uri from .utils import ( _get_coords, @@ -31,7 +32,7 @@ class SegyV1_0(FiberIO): # noqa def _get_attrs(self, resource): """Get the basic attributes for a segy file.""" info = dict( - path=get_path(resource), + path=get_uri(resource), format_name=self.name, format_version=self.version, ) diff --git a/dascore/io/sentek/core.py b/dascore/io/sentek/core.py index c336b48b..9071a55d 100644 --- a/dascore/io/sentek/core.py +++ b/dascore/io/sentek/core.py @@ -7,7 +7,7 @@ import dascore as dc from dascore.io import BinaryReader from dascore.io.core import FiberIO -from dascore.utils.misc import get_path +from ...utils.fs import get_uri from dascore.utils.models import ArraySummary from .utils import _get_patch_attrs, _get_version @@ -24,7 +24,7 @@ def _get_attrs_coords_offsets(self, resource): """Get attributes, coordinates, and data offsets from file.""" attrs_dict, coords, offsets = _get_patch_attrs( resource, - path=get_path(resource), + path=get_uri(resource), format_name=self.name, format_version=self.version, ) diff --git a/dascore/io/sentek/utils.py b/dascore/io/sentek/utils.py index 5cd3fa08..d6ca661c 100644 --- a/dascore/io/sentek/utils.py +++ b/dascore/io/sentek/utils.py @@ -8,12 +8,12 @@ import dascore as dc from dascore.core import get_coord, get_coord_manager -from dascore.utils.misc import get_path +from dascore.utils.fs import get_uri def _get_version(fid): """Determine if Sentek file.""" - path = get_path(fid) + path = get_uri(fid) # Sentek files cannot change the extension, or file name. sw_data = path.endswith(".das") fid.seek(0) diff --git a/dascore/io/silixah5/core.py b/dascore/io/silixah5/core.py index aece1068..33023274 100644 --- a/dascore/io/silixah5/core.py +++ b/dascore/io/silixah5/core.py @@ -11,7 +11,7 @@ from dascore.constants import opt_timeable_types from dascore.io import FiberIO from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from dascore.utils.fs import get_uri class SilixaPatchAttrs(dc.PatchAttrs): @@ -33,7 +33,7 @@ class SilixaH5V1(FiberIO): def _get_attr_coords(self, resource): """Get attributes and coordinates of patch in file.""" info, coords = util._get_attr_dict(resource) - info["path"] = get_path(resource) + info["path"] = get_uri(resource) info["format_name"] = self.name info["format_version"] = self.version return SilixaPatchAttrs(**info), coords diff --git a/dascore/io/tdms/core.py b/dascore/io/tdms/core.py index 88d3df9d..d0049c43 100644 --- a/dascore/io/tdms/core.py +++ b/dascore/io/tdms/core.py @@ -6,7 +6,7 @@ from dascore.constants import timeable_types from dascore.core import Patch from dascore.io import BinaryReader, FiberIO -from dascore.utils.misc import get_path +from ...utils.fs import get_uri from .utils import _get_attrs_coords, _get_data, _get_version_str @@ -22,7 +22,7 @@ class TDMSFormatterV4713(FiberIO): def _get_attr_coords(self, resource): """Get a PatchAttrs for the file.""" out, coords, _ = _get_attrs_coords(resource) - out["path"] = get_path(resource) + out["path"] = get_uri(resource) out["file_format"] = self.name out["file_version"] = self.version return dc.PatchAttrs(**out), coords diff --git a/dascore/io/terra15/core.py b/dascore/io/terra15/core.py index d8b28f35..16940d69 100644 --- a/dascore/io/terra15/core.py +++ b/dascore/io/terra15/core.py @@ -8,7 +8,7 @@ from dascore.constants import timeable_types from dascore.io import FiberIO from dascore.utils.hdf5 import H5Reader -from dascore.utils.misc import get_path +from ...utils.fs import get_uri from .utils import ( _get_default_attrs, @@ -39,7 +39,7 @@ def _get_attrs_coords_data_node(self, resource): """Get attributes, coords, and datanode for this file.""" version, data_node = _get_version_data_node(resource) attrs = _get_default_attrs(resource) - attrs["path"] = get_path(resource) + attrs["path"] = get_uri(resource) attrs["format_name"] = self.name attrs["format_version"] = version coords_dict = { diff --git a/dascore/io/xml_binary/core.py b/dascore/io/xml_binary/core.py index c93be029..c62e522c 100644 --- a/dascore/io/xml_binary/core.py +++ b/dascore/io/xml_binary/core.py @@ -11,6 +11,7 @@ import dascore as dc from dascore.io import FiberIO from dascore.utils.models import UTF8Str +from ...utils.fs import get_uri from .utils import _load_patches, _paths_to_attrs, _read_xml_metadata @@ -36,9 +37,9 @@ class XMLBinaryV1(FiberIO): # File extension for data files. _data_extension = ".raw" - def scan(self, resource, timestamp=None, **kwargs) -> list[dc.PatchAttrs]: + def scan(self, resource, timestamp=None, **kwargs) -> list[dc.PatchSummary]: """Scan the contents of the directory.""" - path = Path(resource) + path = get_uri(resource) metadata = _read_xml_metadata(path / self._metadata_name) data_files = list(path.glob(f"*{self._data_extension}")) extra_attrs = { diff --git a/dascore/utils/fs.py b/dascore/utils/fs.py new file mode 100644 index 00000000..7112940e --- /dev/null +++ b/dascore/utils/fs.py @@ -0,0 +1,182 @@ +""" +Utilities related to working with file systems. + +These include actual file systems or virtual ones. +""" +from __future__ import annotations + +import os +import re +from pathlib import Path +from typing import Iterable, Generator +from typing_extensions import Self + +import fsspec + + +# Detect if the string has an associated protocol. +_PROTOCOL_DETECTION_REGEX = r"^([a-zA-Z][a-zA-Z0-9+.-]*):\/\/" + + + +def get_fspath(obj): + """ + + """ + uri = get_uri(obj) + fs = fsspec.open(uri) + return fs + + +class FSPath: + """ + A class that behaves like a pathlib.Path object. + + This helps smooth out some of the edges of fsspec. + """ + fs: fsspec.AbstractFileSystem + + def __init__(self, obj): + """ + + """ + if isinstance(obj, FSPath): + self.__dict__.update(obj.__dict__) + return + if isinstance(obj, fsspec.core.OpenFile): + self._fs = obj.fs + self._path = Path(obj.path) + else: + fs, path = fsspec.url_to_fs(obj) + self._fs = fs + self._path = Path(path) + + @classmethod + def from_fs_path(cls, fs, path): + out = cls.__new__(cls) + out._fs = fs + out._path = path + return out + + @property + def path(self) -> Path: + """Get the pathlib object representing this item.""" + return self._path + + @property + def parent(self) -> Path: + """Get the pathlib object representing this item.""" + return self.from_fs_path(fs=self._fs, path=self._path.parent) + + def _full_name(self): + """ + Return the full name. + + Ideally, this is a string that can be used to recreate the + filesystem and path. + """ + name = self._fs.unstrip_protocol(self._path) + return name + + def exists(self): + """Determine if the file exists.""" + return self._fs.exists(self._path) + + # --- Dunders + + def __truediv__(self, other: str) -> Self: + """Enables division to add to string to Path.""" + return self.from_fs_path(fs=self._fs, path=self._path / other) + + + def __repr__(self) -> str: + return self._full_name() + + + + + +def get_uri(obj) -> str: + """ + Get the uri string of an object representing a file. + + Parameters + ---------- + obj + An object that represents a path to a resource. + """ + if isinstance(obj, str): + # Assume the string rep a local file. + if not re.match(_PROTOCOL_DETECTION_REGEX, obj): + obj = f"file://{obj}" + elif hasattr(obj, "filename"): + obj = f"file://{obj.filename}" + elif isinstance(obj, Path): + obj = f"file://{obj.absolute()}" + elif hasattr(obj, "name"): + obj = f"file://{obj.name}" + elif isinstance(obj, fsspec.core.OpenFiles): + obj = get_fspath(obj) + if hasattr(obj, "full_name"): + obj = obj.full_name + return obj + + +def _iter_filesystem( + paths: str | Path | Iterable[str | Path], + ext: str | None = None, + timestamp: float | None = None, + skip_hidden: bool = True, + include_directories: bool = False, +) -> Generator[str, str, None]: + """ + Iterate contents of a filesystem like thing. + + Options allow for filtering and terminating early. + + Parameters + ---------- + paths + The path to the base directory to traverse. Can also use a collection + of paths. + ext : str or None + The extensions of files to return. + timestamp : int or float + Time stamp indicating the minimum mtime to scan. + skip_hidden : bool + If True skip files or folders (they begin with a '.') + include_directories + If True, also yield directories. In this case, a "skip" can be + passed back to the generator to indicate the rest of the directory + contents should be skipped. + + Yields + ------ + Paths, as strings, meeting requirements. + """ + # handle returning directories if requested. + if include_directories and os.path.isdir(paths): + if not (skip_hidden and str(paths).startswith(".")): + signal = yield paths + if signal is not None and signal == "skip": + yield None + return + try: # a single path was passed + for entry in os.scandir(paths): + if entry.is_file() and (ext is None or entry.name.endswith(ext)): + if timestamp is None or entry.stat().st_mtime >= timestamp: + if entry.name[0] != "." or not skip_hidden: + yield entry.path + elif entry.is_dir() and not (skip_hidden and entry.name[0] == "."): + yield from _iter_filesystem( + entry.path, + ext=ext, + timestamp=timestamp, + skip_hidden=skip_hidden, + include_directories=include_directories, + ) + except (TypeError, AttributeError): # multiple paths were passed + for path in paths: + yield from _iter_filesystem(path, ext, timestamp, skip_hidden) + except NotADirectoryError: # a file path was passed, just return it + yield paths diff --git a/dascore/utils/misc.py b/dascore/utils/misc.py index a6d42e4d..2599904a 100644 --- a/dascore/utils/misc.py +++ b/dascore/utils/misc.py @@ -9,7 +9,7 @@ import os import re import warnings -from collections.abc import Generator, Iterable, Mapping, Sequence, Sized +from collections.abc import Iterable, Mapping, Sequence, Sized from functools import cache from pathlib import Path from types import ModuleType @@ -192,66 +192,6 @@ def _get_nullish(dtype=np.floating): return np.nan -def _iter_filesystem( - paths: str | Path | Iterable[str | Path], - ext: str | None = None, - timestamp: float | None = None, - skip_hidden: bool = True, - include_directories: bool = False, -) -> Generator[str, str, None]: - """ - Iterate contents of a filesystem like thing. - - Options allow for filtering and terminating early. - - Parameters - ---------- - paths - The path to the base directory to traverse. Can also use a collection - of paths. - ext : str or None - The extensions of files to return. - timestamp : int or float - Time stamp indicating the minimum mtime to scan. - skip_hidden : bool - If True skip files or folders (they begin with a '.') - include_directories - If True, also yield directories. In this case, a "skip" can be - passed back to the generator to indicate the rest of the directory - contents should be skipped. - - Yields - ------ - Paths, as strings, meeting requirements. - """ - # handle returning directories if requested. - if include_directories and os.path.isdir(paths): - if not (skip_hidden and str(paths).startswith(".")): - signal = yield paths - if signal is not None and signal == "skip": - yield None - return - try: # a single path was passed - for entry in os.scandir(paths): - if entry.is_file() and (ext is None or entry.name.endswith(ext)): - if timestamp is None or entry.stat().st_mtime >= timestamp: - if entry.name[0] != "." or not skip_hidden: - yield entry.path - elif entry.is_dir() and not (skip_hidden and entry.name[0] == "."): - yield from _iter_filesystem( - entry.path, - ext=ext, - timestamp=timestamp, - skip_hidden=skip_hidden, - include_directories=include_directories, - ) - except (TypeError, AttributeError): # multiple paths were passed - for path in paths: - yield from _iter_filesystem(path, ext, timestamp, skip_hidden) - except NotADirectoryError: # a file path was passed, just return it - yield paths - - def iterate(obj): """ Return an iterable from any object. @@ -747,16 +687,3 @@ def to_object_array(object_sequence): return out -def get_path(obj) -> str: - """ - Get the path string of an object. - - Parameters - ---------- - obj - An object that represents a path to a resource. - """ - # Handles the case of - if hasattr(obj, "filename"): - return obj.filename - return str(obj) diff --git a/pyproject.toml b/pyproject.toml index ccbc857b..71b121f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,11 +131,19 @@ GDR_DAS__V1 = "dascore.io.gdr.core:GDR_V1" # --- External tool configuration +# -- Pytest config +# pytest marks +[tool.pytest.ini_options] +markers = [ + "network: marks tests as needing the internet", +] + +# -- Linter configuration [tool.ruff] line-length = 88 -# enable certain types of linting +# Controls the categories of ruff rules applied. lint.select = [ "E", "F", @@ -179,25 +187,20 @@ exclude = [ # lowest python version supported target-version = "py310" +# Ensure ruff just fixes stuff it can safely fix. lint.fixable = ["ALL"] -# List of codes to ignore +# List of codes to ignore. lint.ignore = ["D105", "D107", "D401", "D205", "D200", "D400"] [tool.ruff.lint.mccabe] -# Unlike Flake8, default to a complexity level of 10. +# Controls how complex functions/methods can be. max-complexity = 10 # config for docstring parsing [tool.ruff.lint.pydocstyle] convention = "numpy" -[tool.pytest.ini_options] -filterwarnings = [ - # Ignore hdf5 warnings from pytables, See pytables #1035 - 'ignore::Warning:tables:' -] - [tool.ruff.format] # Use `\n` line endings for all files line-ending = "lf" diff --git a/tests/conftest.py b/tests/conftest.py index fb7aad7f..d84f9e4b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,7 +89,6 @@ def swap_index_map_path(tmp_path_factory): tmp_map_path = tmp_path_factory.mktemp("cache_paths") / "cache_paths.json" setattr(DirectoryIndexer, "index_map_path", tmp_map_path) - # --- Coordinate fixtures COORD_MANAGERS = [] diff --git a/tests/test_utils/test_fs.py b/tests/test_utils/test_fs.py new file mode 100644 index 00000000..5c27bb8e --- /dev/null +++ b/tests/test_utils/test_fs.py @@ -0,0 +1,243 @@ +""" +Tests for file system utilities. +""" +import os +import time +from pathlib import Path + +import h5py +import pytest +import fsspec +from dascore.utils.fs import _iter_filesystem, get_uri, FSPath + + +class TestGetUri: + """Tests for getting a path from various objects.""" + + def test_pathlib(self): + """Ensure a pathlib object works with uri generator.""" + my_path = Path(__file__) + path = get_uri(my_path) + assert isinstance(path, str) + assert path == f"file://{str(my_path)}" + + def test_str(self): + """Ensure a string simply returns itself.""" + my_path = str(Path(__file__)) + path = get_uri(my_path) + assert isinstance(path, str) + assert path == f"file://{str(my_path)}" + + def test_fs_spec(self, tmp_path): + """Ensure a fs spec object returns a path string.""" + fs = fsspec.open(Path(tmp_path)) + out = get_uri(fs) + assert out == f"file://{tmp_path}" + + def test_open_file(self, tmp_path): + """Ensure an open file can be used.""" + path = tmp_path / "file.txt" + with open(path, "wb") as f: + uri = get_uri(f) + assert uri == f"file://{path}" + + def test_h5(self, tmp_path): + """Ensure a h5 file returns a path.""" + path = tmp_path / "file.h5" + with h5py.File(path, "w") as f: + uri = get_uri(f) + assert uri == f"file://{path}" + + def test_idempotent(self, tmp_path): + """Ensure the protocol doesn't keep getting appended.""" + my_path = Path(__file__) + path = get_uri(my_path) + path2 = get_uri(path) + path3 = get_uri(path2) + assert path == path2 == path3 + + +class TestIterFS: + """Tests for iterating directories of files.""" + + sub = {"D": {"C": ".mseed"}, "F": ".json", "G": {"H": ".txt"}} # noqa + file_paths = {"A": ".txt", "B": sub} # noqa + + # --- helper functions + def setup_test_directory(self, some_dict: dict, path: Path): + """Build the test directory.""" + for path in self.get_file_paths(some_dict, path): + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w") as fi: + fi.write("useful text") + + def get_file_paths(self, some_dict, path): + """Return expected paths to files.""" + for i, v in some_dict.items(): + if isinstance(v, dict): + yield from self.get_file_paths(v, path / i) + else: + yield path / (i + v) + + # --- fixtures + @pytest.fixture(scope="class") + def simple_dir(self, tmp_path_factory): + """Return a simple directory for iterating.""" + path = Path(tmp_path_factory.mktemp("iterfiles")) + self.setup_test_directory(self.file_paths, path) + return path + + @pytest.fixture(scope="class") + def dir_with_hidden_dir(self, tmp_path_factory): + """Create a directory with a hidden directory inside.""" + path = Path(tmp_path_factory.mktemp("iterfiles_hidden")) + struct = dict(self.file_paths) + # add hidden directory with files in it. + struct[".Hidden"] = {"Another": {"hidden_by_parent": ".txt"}} + self.setup_test_directory(struct, path) + return path + + def test_basic(self, simple_dir): + """Test basic usage of iterfiles.""" + files = set(self.get_file_paths(self.file_paths, simple_dir)) + out = {Path(x) for x in _iter_filesystem(simple_dir)} + assert files == out + + def test_one_subdir(self, simple_dir): + """Test with one sub directory.""" + subdirs = simple_dir / "B" / "D" + out = set(_iter_filesystem(subdirs)) + assert len(out) == 1 + + def test_multiple_subdirs(self, simple_dir): + """Test with multiple sub directories.""" + path1 = simple_dir / "B" / "D" + path2 = simple_dir / "B" / "G" + out = {Path(x) for x in _iter_filesystem([path1, path2])} + files = self.get_file_paths(self.file_paths, simple_dir) + expected = { + x + for x in files + if str(x).startswith(str(path1)) or str(x).startswith(str(path2)) + } + assert out == expected + + def test_extension(self, simple_dir): + """Test filtering based on extention.""" + out = set(_iter_filesystem(simple_dir, ext=".txt")) + for val in out: + assert val.endswith(".txt") + + def test_mtime(self, simple_dir): + """Test filtering based on modified time.""" + files = list(self.get_file_paths(self.file_paths, simple_dir)) + # set the first file mtime in future + now = time.time() + first_file = files[0] + os.utime(first_file, (now + 10, now + 10)) + # get output make sure it only returned first file + out = list(_iter_filesystem(simple_dir, timestamp=now + 5)) + assert len(out) == 1 + assert Path(out[0]) == first_file + + def test_skips_files_in_hidden_directory(self, dir_with_hidden_dir): + """Hidden directory files should be skipped.""" + out1 = list(_iter_filesystem(dir_with_hidden_dir)) + has_hidden_by_parent = ["hidden_by_parent" in x for x in out1] + assert not any(has_hidden_by_parent) + # But if skip_hidden is False it should be there + out2 = list(_iter_filesystem(dir_with_hidden_dir, skip_hidden=False)) + has_hidden_by_parent = ["hidden_by_parent" in x for x in out2] + assert sum(has_hidden_by_parent) == 1 + + def test_pass_file(self, dummy_text_file): + """Just pass a single file and ensure it gets returned.""" + out = list(_iter_filesystem(dummy_text_file)) + assert len(out) == 1 + assert out[0] == dummy_text_file + + def test_no_directories(self, simple_dir): + """Ensure no directories are included when include_directories=False.""" + out = list(_iter_filesystem(simple_dir, include_directories=False)) + has_dirs = [Path(x).is_dir() for x in out] + assert not any(has_dirs) + + def test_include_directories(self, simple_dir): + """Ensure we can get directories back.""" + out = list(_iter_filesystem(simple_dir, include_directories=True)) + returned_dirs = [Path(x) for x in out if Path(x).is_dir()] + assert len(returned_dirs) + # The top level directory should have been included + assert simple_dir in returned_dirs + # Directory names + dir_names = {x.name for x in returned_dirs} + expected_names = {"B", "G", "D"} + assert expected_names.issubset(dir_names) + + def test_skip_signal_directory(self, simple_dir): + """Ensure a skip signal can be sent to stop parsing on directory.""" + out = [] + iterator = _iter_filesystem(simple_dir, include_directories=True) + for path in iterator: + if Path(path).name == "B": + iterator.send("skip") + out.append(path) + names = {Path(x).name.split(".")[0] for x in out} + # Anything after B should have been skipped + assert {"C", "D", "E", "F"}.isdisjoint(names) + + +class TestFSPath: + """Tests for the FS Path abstraction.""" + + @pytest.fixture(scope="class") + def complex_folder(self, tmp_path_factory): + """Make a temp path with several sub folders and such.""" + path = tmp_path_factory.mktemp("complex_fs_folder") + + csv_1 = path / "csv_1.csv" + with csv_1.open("w") as f: + f.write("Name,Age,Occupation,City") + f.write("Alice,30,Engineer,New York") + + text1 = path / "text_1.txt" + with text1.open("w") as f: + f.write("Ground control to major tom!") + + f1 = path / "folder_1" + f1.mkdir(exist_ok=True, parents=True) + text2 = f1 / "text_2.txt" + with text2.open("w") as f: + f.write("Planet Earth is blue and there's nothing I can do") + + f2 = path / "folder_2" + f2.mkdir(exist_ok=True, parents=True) + csv_2 = f2 / "csv_2.csv" + with csv_2.open("w") as f: + f.write("Name,Age,Occupation,City") + f.write("Bob,35,Engineer,Salt Lake City") + return path + + @pytest.fixture(scope="class") + def fspath(self, complex_folder): + """return the fspath object.""" + return FSPath(complex_folder) + + def test_str_and_repr(self, fspath): + """Ensure a valid repr/str exist.""" + out_strs = [str(fspath), repr(fspath)] + for out in out_strs: + assert isinstance(out, str) + assert str(fspath.path) in out + + def test_slash(self, fspath): + """Ensure the slash operator works.""" + out = fspath / "text_1.txt" + assert str(out).endswith("text_1.txt") + + + + + + + diff --git a/tests/test_utils/test_misc.py b/tests/test_utils/test_misc.py index 66a6bec1..e527d15c 100644 --- a/tests/test_utils/test_misc.py +++ b/tests/test_utils/test_misc.py @@ -2,10 +2,7 @@ from __future__ import annotations -import os -import time import warnings -from pathlib import Path import numpy as np import pytest @@ -13,7 +10,6 @@ from dascore.exceptions import MissingOptionalDependencyError from dascore.utils.misc import ( MethodNameSpace, - _iter_filesystem, cached_method, get_stencil_coefs, iterate, @@ -60,135 +56,6 @@ def new_method(self, expected_type): assert pc.namespace.new_method(ParentClass) -class TestIterFS: - """Tests for iterating directories of files.""" - - sub = {"D": {"C": ".mseed"}, "F": ".json", "G": {"H": ".txt"}} # noqa - file_paths = {"A": ".txt", "B": sub} # noqa - - # --- helper functions - def setup_test_directory(self, some_dict: dict, path: Path): - """Build the test directory.""" - for path in self.get_file_paths(some_dict, path): - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w") as fi: - fi.write("useful text") - - def get_file_paths(self, some_dict, path): - """Return expected paths to files.""" - for i, v in some_dict.items(): - if isinstance(v, dict): - yield from self.get_file_paths(v, path / i) - else: - yield path / (i + v) - - # --- fixtures - @pytest.fixture(scope="class") - def simple_dir(self, tmp_path_factory): - """Return a simple directory for iterating.""" - path = Path(tmp_path_factory.mktemp("iterfiles")) - self.setup_test_directory(self.file_paths, path) - return path - - @pytest.fixture(scope="class") - def dir_with_hidden_dir(self, tmp_path_factory): - """Create a directory with a hidden directory inside.""" - path = Path(tmp_path_factory.mktemp("iterfiles_hidden")) - struct = dict(self.file_paths) - # add hidden directory with files in it. - struct[".Hidden"] = {"Another": {"hidden_by_parent": ".txt"}} - self.setup_test_directory(struct, path) - return path - - def test_basic(self, simple_dir): - """Test basic usage of iterfiles.""" - files = set(self.get_file_paths(self.file_paths, simple_dir)) - out = {Path(x) for x in _iter_filesystem(simple_dir)} - assert files == out - - def test_one_subdir(self, simple_dir): - """Test with one sub directory.""" - subdirs = simple_dir / "B" / "D" - out = set(_iter_filesystem(subdirs)) - assert len(out) == 1 - - def test_multiple_subdirs(self, simple_dir): - """Test with multiple sub directories.""" - path1 = simple_dir / "B" / "D" - path2 = simple_dir / "B" / "G" - out = {Path(x) for x in _iter_filesystem([path1, path2])} - files = self.get_file_paths(self.file_paths, simple_dir) - expected = { - x - for x in files - if str(x).startswith(str(path1)) or str(x).startswith(str(path2)) - } - assert out == expected - - def test_extension(self, simple_dir): - """Test filtering based on extention.""" - out = set(_iter_filesystem(simple_dir, ext=".txt")) - for val in out: - assert val.endswith(".txt") - - def test_mtime(self, simple_dir): - """Test filtering based on modified time.""" - files = list(self.get_file_paths(self.file_paths, simple_dir)) - # set the first file mtime in future - now = time.time() - first_file = files[0] - os.utime(first_file, (now + 10, now + 10)) - # get output make sure it only returned first file - out = list(_iter_filesystem(simple_dir, timestamp=now + 5)) - assert len(out) == 1 - assert Path(out[0]) == first_file - - def test_skips_files_in_hidden_directory(self, dir_with_hidden_dir): - """Hidden directory files should be skipped.""" - out1 = list(_iter_filesystem(dir_with_hidden_dir)) - has_hidden_by_parent = ["hidden_by_parent" in x for x in out1] - assert not any(has_hidden_by_parent) - # But if skip_hidden is False it should be there - out2 = list(_iter_filesystem(dir_with_hidden_dir, skip_hidden=False)) - has_hidden_by_parent = ["hidden_by_parent" in x for x in out2] - assert sum(has_hidden_by_parent) == 1 - - def test_pass_file(self, dummy_text_file): - """Just pass a single file and ensure it gets returned.""" - out = list(_iter_filesystem(dummy_text_file)) - assert len(out) == 1 - assert out[0] == dummy_text_file - - def test_no_directories(self, simple_dir): - """Ensure no directories are included when include_directories=False.""" - out = list(_iter_filesystem(simple_dir, include_directories=False)) - has_dirs = [Path(x).is_dir() for x in out] - assert not any(has_dirs) - - def test_include_directories(self, simple_dir): - """Ensure we can get directories back.""" - out = list(_iter_filesystem(simple_dir, include_directories=True)) - returned_dirs = [Path(x) for x in out if Path(x).is_dir()] - assert len(returned_dirs) - # The top level directory should have been included - assert simple_dir in returned_dirs - # Directory names - dir_names = {x.name for x in returned_dirs} - expected_names = {"B", "G", "D"} - assert expected_names.issubset(dir_names) - - def test_skip_signal_directory(self, simple_dir): - """Ensure a skip signal can be sent to stop parsing on directory.""" - out = [] - iterator = _iter_filesystem(simple_dir, include_directories=True) - for path in iterator: - if Path(path).name == "B": - iterator.send("skip") - out.append(path) - names = {Path(x).name.split(".")[0] for x in out} - # Anything after B should have been skipped - assert {"C", "D", "E", "F"}.isdisjoint(names) - class TestIterate: """Test case for iterate."""