diff --git a/doc/devlog/2023-07-07.ipynb b/doc/devlog/2023-07-07.ipynb index 52b3692c..2de434c4 100644 --- a/doc/devlog/2023-07-07.ipynb +++ b/doc/devlog/2023-07-07.ipynb @@ -184,12 +184,11 @@ "metadata": {}, "outputs": [], "source": [ + "geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n", "try:\n", " states_geo = StaticGeo(dataclasses.replace(spec), states_values)\n", " states_geo.validate()\n", - " states_geo.save(\n", - " Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n", - " )\n", + " states_geo.save(geofile)\n", "except GeoValidationException as e:\n", " print(e.pretty())" ] @@ -202,7 +201,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -212,7 +211,7 @@ ], "source": [ "# Verify that we can load the file back.\n", - "F.load_from_archive(Path('epymorph/data/geo/us_states_2015.geo.tar'))" + "F.load_from_archive(geofile)" ] }, { @@ -281,39 +280,38 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ + "geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n", "try:\n", " counties_geo = StaticGeo(dataclasses.replace(spec), counties_values)\n", " counties_geo.validate()\n", - " counties_geo.save(\n", - " Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n", - " )\n", + " counties_geo.save(geofile)\n", "except GeoValidationException as e:\n", " print(e.pretty())" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Verify that we can load the file back.\n", - "F.load_from_archive(Path('epymorph/data/geo/us_counties_2015.geo.tar'))" + "F.load_from_archive(geofile)" ] } ], @@ -333,7 +331,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.9" }, "orig_nbformat": 4 }, diff --git a/doc/devlog/2024-04-25.ipynb b/doc/devlog/2024-04-25.ipynb new file mode 100644 index 00000000..2ec71823 --- /dev/null +++ b/doc/devlog/2024-04-25.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# devlog 2024-04-25\n", + "\n", + "_Author: Tyler Coles_\n", + "\n", + "Testing cache utilities. This script:\n", + "1. tests that reading and writing from archives works,\n", + "2. checks that we can choose to gzip or not, and\n", + "3. measures the impact of gzipping on read/write time and file size." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Save a geo without compression:\n", + "17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "Read a geo without compression:\n", + "3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "\n", + "Save a geo compressed:\n", + "20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "Read a geo with compression:\n", + "4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "\n", + "Bytes as a tar: 153,600\n", + "Bytes as a tgz: 134,722\n", + "Compression ratio: 87.7%\n" + ] + } + ], + "source": [ + "import os\n", + "import shutil\n", + "import tempfile\n", + "\n", + "from epymorph import geo_library\n", + "from epymorph.geo.static import StaticGeoFileOps as F\n", + "\n", + "# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n", + "geo = geo_library['maricopa_cbg_2019']()\n", + "\n", + "tempdir = tempfile.mkdtemp()\n", + "\n", + "print(\"Save a geo without compression:\")\n", + "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n", + "print(\"Read a geo without compression:\")\n", + "%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n", + "\n", + "print()\n", + "\n", + "print(\"Save a geo compressed:\")\n", + "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n", + "print(\"Read a geo with compression:\")\n", + "%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n", + "\n", + "print()\n", + "\n", + "size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n", + "size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n", + "\n", + "print(f\"Bytes as a tar: {size_tar:>9,}\")\n", + "print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n", + "print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n", + "\n", + "shutil.rmtree(tempdir)\n", + "\n", + "# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/epymorph/cache.py b/epymorph/cache.py new file mode 100644 index 00000000..d2bed96a --- /dev/null +++ b/epymorph/cache.py @@ -0,0 +1,191 @@ +"""epymorph's file caching utilities.""" +from hashlib import sha256 +from io import BytesIO +from os import PathLike +from pathlib import Path +from tarfile import TarInfo, is_tarfile +from tarfile import open as open_tarfile + +from platformdirs import user_cache_path + +CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True) + + +class FileError(Exception): + """Error during a file operation.""" + + +class FileMissingError(FileError): + """Error loading a file, as it does not exist.""" + + +class FileWriteError(FileError): + """Error writing a file.""" + + +class FileReadError(FileError): + """Error loading a file.""" + + +class FileVersionError(FileError): + """Error loading a file due to unmet version requirements.""" + + +class CacheMiss(FileError): + """Raised on a cache-miss (for any reason) during a load-from-cache operation.""" + + +def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None: + """ + Save a bundle of files in our tar format with an associated version number. + `to_path` can be absolute or relative; relative paths will be resolved + against the current working directory. Folders in the path which do not exist + will be created automatically. + """ + + if version <= 0: + raise ValueError("version should be greater than zero.") + + try: + # Compute checksums + sha_entries = [] + for name, contents in files.items(): + contents.seek(0) + sha = sha256() + sha.update(contents.read()) + sha_entries.append(f"{sha.hexdigest()} {name}") + + # Create checksums.sha256 file + sha_file = BytesIO() + sha_text = "\n".join(sha_entries) + sha_file.write(bytes(sha_text, encoding='utf-8')) + + # Create cache version file + ver_file = BytesIO() + ver_file.write(bytes(str(version), encoding="utf-8")) + + tarred_files = { + **files, + "checksums.sha256": sha_file, + "version": ver_file, + } + + # Write the tar to disk + tar_path = Path(to_path).resolve() + tar_path.parent.mkdir(parents=True, exist_ok=True) + mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w' + with open_tarfile(name=tar_path, mode=mode) as tar: + for name, contents in tarred_files.items(): + info = TarInfo(name) + info.size = contents.tell() + contents.seek(0) + tar.addfile(info, contents) + + except Exception as e: + msg = f"Unable to write archive at path: {to_path}" + raise FileWriteError(msg) from e + + +def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]: + """ + Load a bundle of files in our tar format, optionally enforcing a minimum version. + An Exception is raised if the file cannot be loaded for any reason, or if its version + is incorrect. On success, returns a dictionary of the contained files, mapping the file + name to the bytes of the file. + """ + try: + tar_path = Path(from_path).resolve() + if not tar_path.is_file(): + raise FileMissingError(f"No file at: {tar_path}") + + # Read the tar file into memory + tar_buffer = BytesIO() + with open(tar_path, 'rb') as f: + tar_buffer.write(f.read()) + tar_buffer.seek(0) + + if not is_tarfile(tar_buffer): + raise FileReadError(f"Not a tar file at: {tar_path}") + + mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r' + tarred_files: dict[str, BytesIO] = {} + with open_tarfile(fileobj=tar_buffer, mode=mode) as tar: + for info in tar.getmembers(): + name = info.name + contents = tar.extractfile(info) + if contents is not None: + tarred_files[name] = BytesIO(contents.read()) + + # Check version + if "version" in tarred_files: + ver_file = tarred_files["version"] + version = int(str(ver_file.readline(), encoding="utf-8")) + else: + version = -1 + if version < version_at_least: + raise FileVersionError("Archive is an unacceptable version.") + + # Verify the checksums + if "checksums.sha256" not in tarred_files: + raise FileReadError("Archive appears to be invalid.") + sha_file = tarred_files["checksums.sha256"] + for line_bytes in sha_file.readlines(): + line = str(line_bytes, encoding='utf-8') + [checksum, filename] = line.strip().split(' ') + + if filename not in tarred_files: + raise FileReadError("Archive appears to be invalid.") + + contents = tarred_files[filename] + contents.seek(0) + sha = sha256() + sha.update(contents.read()) + contents.seek(0) + if checksum != sha.hexdigest(): + msg = f"Archive checksum did not match (for file {filename}). "\ + "It is possible the file is corrupt." + raise FileReadError(msg) + + return { + name: contents + for name, contents in tarred_files.items() + if name not in ("checksums.sha256", "version") + } + + except FileError: + raise + except Exception as e: + raise FileReadError(f"Unable to load archive at: {from_path}") from e + + +def _resolve_cache_path(path: str | PathLike[str]) -> Path: + cache_path = Path(path) + if cache_path.is_absolute(): + msg = "When saving to or loading from the cache, please supply a relative path." + raise ValueError(msg) + return CACHE_PATH.joinpath(cache_path).resolve() + + +def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None: + """ + Save a tar bundle of files to the cache (overwriting the existing file, if any). + The tar includes the sha256 checksums of every content file, + and a version file indicating which application version was + responsible for writing the file (thus allowing the application + to decide if a cached file is still valid when reading it). + """ + save_bundle(_resolve_cache_path(to_path), version, files) + + +def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]: + """ + Load a tar bundle of files from the cache. `from_path` must be a relative path. + `version_at_least` optionally specifies a version number that must be met or beat + by the cached file in order for the file to be considered valid. If the cached file + was written against a version less than this, it will be considered a cache miss + (raises CacheMiss). + """ + try: + return load_bundle(_resolve_cache_path(from_path), version_at_least) + except FileError as e: + raise CacheMiss() from e diff --git a/epymorph/cli/cache.py b/epymorph/cli/cache.py index 5e9817fa..776c30d2 100644 --- a/epymorph/cli/cache.py +++ b/epymorph/cli/cache.py @@ -5,6 +5,7 @@ from argparse import _SubParsersAction from pathlib import Path +from epymorph.cache import CACHE_PATH from epymorph.data import geo_library, geo_library_dynamic from epymorph.geo import cache from epymorph.geo.static import StaticGeoFileOps as F @@ -111,7 +112,7 @@ def fetch(geo_name_or_path: str, force: bool) -> int: raise cache.GeoCacheException("Specified geo not found.") # cache geo according to information passed - file_path = cache.CACHE_PATH / F.to_archive_filename(geo_name) + file_path = CACHE_PATH / F.to_archive_filename(geo_name) if geo_path is not None and geo_name in geo_library: msg = f"A geo named {geo_name} is already present in the library. Please use the existing geo or change the file name." raise cache.GeoCacheException(msg) @@ -131,9 +132,9 @@ def fetch(geo_name_or_path: str, force: bool) -> int: def export(geo_name_or_path: str, out: str | None, rename: str | None, ignore_cache: bool) -> int: """CLI command handler: export compressed geo to a location outside the cache.""" # split geo name and path - if geo_name_or_path in geo_library_dynamic or os.path.exists(cache.CACHE_PATH / F.to_archive_filename(geo_name_or_path)): + if geo_name_or_path in geo_library_dynamic or os.path.exists(CACHE_PATH / F.to_archive_filename(geo_name_or_path)): geo_name = geo_name_or_path - geo_path = cache.CACHE_PATH / F.to_archive_filename(geo_name) + geo_path = CACHE_PATH / F.to_archive_filename(geo_name) elif os.path.exists(Path(geo_name_or_path).expanduser()): geo_path = Path(geo_name_or_path).expanduser() geo_name = geo_path.stem @@ -161,23 +162,26 @@ def remove(geo_name: str) -> int: def print_geos() -> int: """CLI command handler: print geo cache information""" geos = cache.list_geos() - num_geos = len(geos) - if num_geos > 0: + n = len(geos) + if n > 0: print( - f'epymorph geo cache contains {num_geos} geos totaling {cache.get_total_size()} ({cache.CACHE_PATH})') + f"epymorph geo cache contains {n} geo{('s' if n > 1 else '')} " + f"totaling {cache.get_total_size()} ({CACHE_PATH})" + ) for (name, file_size) in geos: print(f"* {name} ({cache.format_size(file_size)})") else: - print(f'epymorph geo cache is empty ({cache.CACHE_PATH})') + print(f'epymorph geo cache ({CACHE_PATH}) is empty') return 0 # exit code: success def clear() -> int: """CLI command handler: clear geo cache""" - if len(os.listdir(cache.CACHE_PATH)) > 0: + geos = cache.list_geos() + if len(geos) > 0: print( - f'The following geos will be removed from the cache ({cache.CACHE_PATH}) and free {cache.get_total_size()} of space:') - for (name, file_size) in cache.list_geos(): + f'The following geos will be removed from the cache ({CACHE_PATH}) and free {cache.get_total_size()} of space:') + for (name, file_size) in geos: print(f"* {name} ({cache.format_size(file_size)})") choice = input('proceed? [y/n] ') if choice == 'y': @@ -188,5 +192,5 @@ def clear() -> int: return 0 # exit code: success else: - print(f'cache ({cache.CACHE_PATH}) is empty, nothing to clear.') + print(f'epymorph geo cache ({CACHE_PATH}) is empty, nothing to clear.') return 2 # exit code: empty cache diff --git a/epymorph/data/geo/maricopa_cbg_2019.geo.tar b/epymorph/data/geo/maricopa_cbg_2019.geo.tar deleted file mode 100644 index 5823eec5..00000000 Binary files a/epymorph/data/geo/maricopa_cbg_2019.geo.tar and /dev/null differ diff --git a/epymorph/data/geo/maricopa_cbg_2019.geo.tgz b/epymorph/data/geo/maricopa_cbg_2019.geo.tgz new file mode 100644 index 00000000..07e4849b Binary files /dev/null and b/epymorph/data/geo/maricopa_cbg_2019.geo.tgz differ diff --git a/epymorph/data/geo/pei.geo.tar b/epymorph/data/geo/pei.geo.tar deleted file mode 100644 index 5ccb4b20..00000000 Binary files a/epymorph/data/geo/pei.geo.tar and /dev/null differ diff --git a/epymorph/data/geo/pei.geo.tgz b/epymorph/data/geo/pei.geo.tgz new file mode 100644 index 00000000..4a063c85 Binary files /dev/null and b/epymorph/data/geo/pei.geo.tgz differ diff --git a/epymorph/data/geo/us_counties_2015.geo.tar b/epymorph/data/geo/us_counties_2015.geo.tar deleted file mode 100644 index 57950d79..00000000 Binary files a/epymorph/data/geo/us_counties_2015.geo.tar and /dev/null differ diff --git a/epymorph/data/geo/us_counties_2015.geo.tgz b/epymorph/data/geo/us_counties_2015.geo.tgz new file mode 100644 index 00000000..bf6bfbb6 Binary files /dev/null and b/epymorph/data/geo/us_counties_2015.geo.tgz differ diff --git a/epymorph/data/geo/us_states_2015.geo.tar b/epymorph/data/geo/us_states_2015.geo.tar deleted file mode 100644 index e887627f..00000000 Binary files a/epymorph/data/geo/us_states_2015.geo.tar and /dev/null differ diff --git a/epymorph/data/geo/us_states_2015.geo.tgz b/epymorph/data/geo/us_states_2015.geo.tgz new file mode 100644 index 00000000..765cad01 Binary files /dev/null and b/epymorph/data/geo/us_states_2015.geo.tgz differ diff --git a/epymorph/geo/cache.py b/epymorph/geo/cache.py index f6bdfe88..096b45bb 100644 --- a/epymorph/geo/cache.py +++ b/epymorph/geo/cache.py @@ -2,8 +2,7 @@ import os from pathlib import Path -from platformdirs import user_cache_path - +from epymorph.cache import CACHE_PATH from epymorph.data import adrio_maker_library, geo_library_dynamic from epymorph.geo.dynamic import DynamicGeo from epymorph.geo.dynamic import DynamicGeoFileOps as DF @@ -13,8 +12,6 @@ from epymorph.geo.util import convert_to_static_geo from epymorph.log.messaging import dynamic_geo_messaging -CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True) - class GeoCacheException(Exception): """An exception raised when a geo cache operation fails.""" @@ -166,9 +163,6 @@ def format_size(size: int) -> str: def get_total_size() -> str: """Returns the total size of all files in the geo cache using 1024-based unit representation.""" - files = os.listdir(CACHE_PATH) - total_size = 0 - for file in files: - total_size += os.path.getsize(CACHE_PATH / file) - + total_size = sum((os.path.getsize(CACHE_PATH / file) + for file, _ in F.iterate_dir_path(CACHE_PATH))) return format_size(total_size) diff --git a/epymorph/geo/static.py b/epymorph/geo/static.py index daebf6ba..227ce4fb 100644 --- a/epymorph/geo/static.py +++ b/epymorph/geo/static.py @@ -2,19 +2,18 @@ A static geo is one that is pre-packaged with all of its data; it doesn't need to fetch any data from outside itself, and all of its data is resident in memory when loaded. """ -import hashlib -import io -import os -import tarfile from importlib.abc import Traversable +from io import BytesIO +from os import PathLike from pathlib import Path from typing import Iterator, Self, cast -import jsonpickle import numpy as np +from jsonpickle import encode as json_encode from numpy.typing import NDArray import epymorph.data_shape as shape +from epymorph.cache import load_bundle, save_bundle from epymorph.error import AttributeException, GeoValidationException from epymorph.geo.geo import Geo from epymorph.geo.spec import (LABEL, AttribDef, StaticGeoSpec, @@ -95,7 +94,7 @@ def select(attrib: AttribDef) -> NDArray: } return self.__class__(self.spec, filtered_values) - def save(self, file: os.PathLike) -> None: + def save(self, file: PathLike) -> None: """Saves this geo to tar format.""" StaticGeoFileOps.save_as_archive(self, file) @@ -106,12 +105,12 @@ class StaticGeoFileOps: @staticmethod def to_archive_filename(geo_id: str) -> str: """Returns the standard filename for a geo archive.""" - return f"{geo_id}.geo.tar" + return f"{geo_id}.geo.tgz" @staticmethod def to_geo_name(filename: str) -> str: """Returns the geo ID from a standard geo archive filename.""" - return filename.removesuffix('.geo.tar') + return filename.removesuffix('.geo.tgz') @staticmethod def iterate_dir(directory: Traversable) -> Iterator[tuple[Traversable, str]]: @@ -123,7 +122,7 @@ def iterate_dir(directory: Traversable) -> Iterator[tuple[Traversable, str]]: """ return ((f, StaticGeoFileOps.to_geo_name(f.name)) for f in directory.iterdir() - if f.is_file() and f.name.endswith('.geo.tar')) + if f.is_file() and f.name.endswith('.geo.tgz')) @staticmethod def iterate_dir_path(directory: Path) -> Iterator[tuple[Path, str]]: @@ -135,100 +134,53 @@ def iterate_dir_path(directory: Path) -> Iterator[tuple[Path, str]]: """ return ((f, StaticGeoFileOps.to_geo_name(f.name)) for f in directory.iterdir() - if f.is_file() and f.name.endswith('.geo.tar')) + if f.is_file() and f.name.endswith('.geo.tgz')) @staticmethod - def save_as_archive(geo: StaticGeo, file: os.PathLike) -> None: + def save_as_archive(geo: StaticGeo, file: PathLike) -> None: """Save a StaticGeo to its tar format.""" - # Write the data file in memory - npz_file = io.BytesIO() - # sorting the geo values makes the sha256 a little more stable + # Write the data file + # (sorting the geo values makes the sha256 a little more stable) + npz_file = BytesIO() np.savez_compressed(npz_file, **as_sorted_dict(geo.values)) - # Data checksum - npz_file.seek(0) - data_sha256 = hashlib.sha256() - data_sha256.update(npz_file.read()) - - # Write the spec file in memory - geo_file = io.BytesIO() - geo_json = cast(str, jsonpickle.encode(geo.spec, unpicklable=True)) + + # Write the spec file + geo_file = BytesIO() + geo_json = cast(str, json_encode(geo.spec, unpicklable=True)) geo_file.write(geo_json.encode('utf-8')) - # Spec checksum - geo_file.seek(0) - spec_sha256 = hashlib.sha256() - spec_sha256.update(geo_file.read()) - - # Write sha256 checksums file in memory - sha_file = io.BytesIO() - sha_text = f"""\ -{data_sha256.hexdigest()} data.npz -{spec_sha256.hexdigest()} spec.geo""" - sha_file.write(bytes(sha_text, encoding='utf-8')) - - # Write the tar to disk - with tarfile.open(file, 'w') as tar: - def add_file(contents: io.BytesIO, name: str) -> None: - info = tarfile.TarInfo(name) - info.size = contents.tell() - contents.seek(0) - tar.addfile(info, contents) - - add_file(npz_file, 'data.npz') - add_file(geo_file, 'spec.geo') - add_file(sha_file, 'checksums.sha256') + + save_bundle( + to_path=file, + version=1, + files={ + "data.npz": npz_file, + "spec.geo": geo_file, + }, + ) @staticmethod - def load_from_archive(file: os.PathLike) -> StaticGeo: + def load_from_archive(file: PathLike) -> StaticGeo: """Load a StaticGeo from its tar format.""" try: - # Read the tar file into memory - tar_buffer = io.BytesIO() - with open(file, 'rb') as f: - tar_buffer.write(f.read()) - tar_buffer.seek(0) - - with tarfile.open(fileobj=tar_buffer, mode='r') as tar: - npz_file = tar.extractfile(tar.getmember('data.npz')) - geo_file = tar.extractfile(tar.getmember('spec.geo')) - sha_file = tar.extractfile(tar.getmember('checksums.sha256')) - - if npz_file is None or geo_file is None or sha_file is None: - msg = 'Archive is incomplete: missing data, spec, and/or checksum files.' - raise GeoValidationException(msg) - - # Verify the checksums - for line_bytes in sha_file.readlines(): - line = str(line_bytes, encoding='utf-8') - [checksum, filename] = line.strip().split(' ') - match filename: - case 'data.npz': - file_to_check = npz_file - case 'spec.geo': - file_to_check = geo_file - case _: - # There shouldn't be any other files listed in the checksum. - msg = f"Unknown file listing in checksums.sha256 ({filename})." - raise GeoValidationException(msg) - file_to_check.seek(0) - sha256 = hashlib.sha256() - sha256.update(file_to_check.read()) - if checksum != sha256.hexdigest(): - msg = f"Archive checksum did not match (for file {filename}). "\ - "It is possible the file has been corrupted." - raise GeoValidationException(msg) - - # Read the spec file in memory - geo_file.seek(0) - spec_json = geo_file.read().decode('utf8') - spec = StaticGeoSpec.deserialize(spec_json) - - # Read the data file in memory - npz_file.seek(0) - with np.load(npz_file) as data: - values = dict(data) - - return StaticGeo(spec, values) - + # allow version -1 so this is backwards compatible with geos that didn't have version + files = load_bundle(file, version_at_least=-1) + if "data.npz" not in files or "spec.geo" not in files: + msg = 'Archive is incomplete: missing data, spec, and/or checksum files.' + raise GeoValidationException(msg) + + # Read the spec file + geo_file = files["spec.geo"] + geo_file.seek(0) + spec_json = geo_file.read().decode('utf8') + spec = StaticGeoSpec.deserialize(spec_json) + + # Read the data file + npz_file = files["data.npz"] + npz_file.seek(0) + with np.load(npz_file) as data: + values = dict(data) + + return StaticGeo(spec, values) except Exception as e: raise GeoValidationException(f"Unable to load '{file}' as a geo.") from e