Add cache utils to standardize our archive format. (#106)

This is the format we have been using for GEOs, but I want to be able to use it for other things as well. I also added versioning support so there's a way to invalidate obsolete cache entries. Also added gzip compression support, and re-generated all the data/geos to use it.
NAU-CCL · Apr 26, 2024 · e6105fd · e6105fd
1 parent 7433936
commit e6105fd
Show file tree

Hide file tree

Showing 14 changed files with 379 additions and 128 deletions.
diff --git a/doc/devlog/2023-07-07.ipynb b/doc/devlog/2023-07-07.ipynb
@@ -184,12 +184,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
     "try:\n",
     "    states_geo = StaticGeo(dataclasses.replace(spec), states_values)\n",
     "    states_geo.validate()\n",
-    "    states_geo.save(\n",
-    "        Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
-    "    )\n",
+    "    states_geo.save(geofile)\n",
     "except GeoValidationException as e:\n",
     "    print(e.pretty())"
    ]
@@ -202,7 +201,7 @@
     {
      "data": {
       "text/plain": [
-       "<epymorph.geo.static.StaticGeo at 0x7fe515ee3450>"
+       "<epymorph.geo.static.StaticGeo at 0x7fc2308a7d50>"
       ]
      },
      "execution_count": 6,
@@ -212,7 +211,7 @@
    ],
    "source": [
     "# Verify that we can load the file back.\n",
-    "F.load_from_archive(Path('epymorph/data/geo/us_states_2015.geo.tar'))"
+    "F.load_from_archive(geofile)"
    ]
   },
   {
@@ -281,39 +280,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
+    "geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
     "try:\n",
     "    counties_geo = StaticGeo(dataclasses.replace(spec), counties_values)\n",
     "    counties_geo.validate()\n",
-    "    counties_geo.save(\n",
-    "        Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
-    "    )\n",
+    "    counties_geo.save(geofile)\n",
     "except GeoValidationException as e:\n",
     "    print(e.pretty())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<epymorph.geo.static.StaticGeo at 0x7fe512419410>"
+       "<epymorph.geo.static.StaticGeo at 0x7fc2307a5fd0>"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# Verify that we can load the file back.\n",
-    "F.load_from_archive(Path('epymorph/data/geo/us_counties_2015.geo.tar'))"
+    "F.load_from_archive(geofile)"
    ]
   }
  ],
@@ -333,7 +331,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.11.9"
   },
   "orig_nbformat": 4
  },

diff --git a/doc/devlog/2024-04-25.ipynb b/doc/devlog/2024-04-25.ipynb
@@ -0,0 +1,112 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# devlog 2024-04-25\n",
+    "\n",
+    "_Author: Tyler Coles_\n",
+    "\n",
+    "Testing cache utilities. This script:\n",
+    "1. tests that reading from and writing to archives works,\n",
+    "2. checks that we can choose to gzip or not, and\n",
+    "3. measures the impact of gzipping on read/write time and file size."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Save a geo without compression:\n",
+      "17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
+      "Read a geo without compression:\n",
+      "3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
+      "\n",
+      "Save a geo compressed:\n",
+      "20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
+      "Read a geo with compression:\n",
+      "4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
+      "\n",
+      "Bytes as a tar:   153,600\n",
+      "Bytes as a tgz:   134,722\n",
+      "Compression ratio: 87.7%\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "import tempfile\n",
+    "\n",
+    "from epymorph import geo_library\n",
+    "from epymorph.geo.static import StaticGeoFileOps as F\n",
+    "\n",
+    "# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n",
+    "geo = geo_library['maricopa_cbg_2019']()\n",
+    "\n",
+    "tempdir = tempfile.mkdtemp()\n",
+    "\n",
+    "print(\"Save a geo without compression:\")\n",
+    "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n",
+    "print(\"Read a geo without compression:\")\n",
+    "%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "print(\"Save a geo compressed:\")\n",
+    "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n",
+    "print(\"Read a geo with compression:\")\n",
+    "%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n",
+    "\n",
+    "print()\n",
+    "\n",
+    "size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n",
+    "size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n",
+    "\n",
+    "print(f\"Bytes as a tar: {size_tar:>9,}\")\n",
+    "print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n",
+    "print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n",
+    "\n",
+    "shutil.rmtree(tempdir)\n",
+    "\n",
+    "# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/epymorph/cache.py b/epymorph/cache.py
@@ -0,0 +1,191 @@
+"""epymorph's file caching utilities."""
+from hashlib import sha256
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from tarfile import TarInfo, is_tarfile
+from tarfile import open as open_tarfile
+
+from platformdirs import user_cache_path
+
+# Root directory for epymorph's cached files, as located by platformdirs.
+# `ensure_exists=True` asks platformdirs to create the directory if it is missing.
+CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True)
+
+
+class FileError(Exception):
+    """Error during a file operation; base class for the more specific file errors below."""
+
+
+class FileMissingError(FileError):
+    """Error loading a file, as it does not exist (`load_bundle` raises this when the path is not a file)."""
+
+
+class FileWriteError(FileError):
+    """Error writing a file (`save_bundle` raises this, chained to the underlying exception)."""
+
+
+class FileReadError(FileError):
+    """
+    Error loading a file. `load_bundle` raises this when the file is not a tar,
+    when the checksum listing is missing or does not match the contents,
+    or when any other unexpected error occurs while reading.
+    """
+
+
+class FileVersionError(FileError):
+    """Error loading a file due to unmet version requirements (its version is below the requested minimum)."""
+
+
+class CacheMiss(FileError):
+    """
+    Raised on a cache-miss (for any reason) during a load-from-cache operation.
+    `load_bundle_from_cache` raises this in place of any FileError,
+    keeping the original error as the chained cause.
+    """
+
+
+def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
+    """
+    Save a bundle of files in our tar format with an associated version number.
+    `to_path` can be absolute or relative; relative paths will be resolved
+    against the current working directory. Folders in the path which do not exist
+    will be created automatically.
+    """
+
+    if version <= 0:
+        raise ValueError("version should be greater than zero.")
+
+    try:
+        # Compute checksums
+        sha_entries = []
+        for name, contents in files.items():
+            contents.seek(0)
+            sha = sha256()
+            sha.update(contents.read())
+            sha_entries.append(f"{sha.hexdigest()}  {name}")
+
+        # Create checksums.sha256 file
+        sha_file = BytesIO()
+        sha_text = "\n".join(sha_entries)
+        sha_file.write(bytes(sha_text, encoding='utf-8'))
+
+        # Create cache version file
+        ver_file = BytesIO()
+        ver_file.write(bytes(str(version), encoding="utf-8"))
+
+        tarred_files = {
+            **files,
+            "checksums.sha256": sha_file,
+            "version": ver_file,
+        }
+
+        # Write the tar to disk
+        tar_path = Path(to_path).resolve()
+        tar_path.parent.mkdir(parents=True, exist_ok=True)
+        mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w'
+        with open_tarfile(name=tar_path, mode=mode) as tar:
+            for name, contents in tarred_files.items():
+                info = TarInfo(name)
+                info.size = contents.tell()
+                contents.seek(0)
+                tar.addfile(info, contents)
+
+    except Exception as e:
+        msg = f"Unable to write archive at path: {to_path}"
+        raise FileWriteError(msg) from e
+
+
+def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
+    """
+    Load a bundle of files in our tar format, optionally enforcing a minimum version.
+    An Exception is raised if the file cannot be loaded for any reason, or if its version
+    is incorrect. On success, returns a dictionary of the contained files, mapping the file
+    name to the bytes of the file.
+    """
+    try:
+        tar_path = Path(from_path).resolve()
+        if not tar_path.is_file():
+            raise FileMissingError(f"No file at: {tar_path}")
+
+        # Read the tar file into memory
+        tar_buffer = BytesIO()
+        with open(tar_path, 'rb') as f:
+            tar_buffer.write(f.read())
+        tar_buffer.seek(0)
+
+        if not is_tarfile(tar_buffer):
+            raise FileReadError(f"Not a tar file at: {tar_path}")
+
+        mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r'
+        tarred_files: dict[str, BytesIO] = {}
+        with open_tarfile(fileobj=tar_buffer, mode=mode) as tar:
+            for info in tar.getmembers():
+                name = info.name
+                contents = tar.extractfile(info)
+                if contents is not None:
+                    tarred_files[name] = BytesIO(contents.read())
+
+        # Check version
+        if "version" in tarred_files:
+            ver_file = tarred_files["version"]
+            version = int(str(ver_file.readline(), encoding="utf-8"))
+        else:
+            version = -1
+        if version < version_at_least:
+            raise FileVersionError("Archive is an unacceptable version.")
+
+        # Verify the checksums
+        if "checksums.sha256" not in tarred_files:
+            raise FileReadError("Archive appears to be invalid.")
+        sha_file = tarred_files["checksums.sha256"]
+        for line_bytes in sha_file.readlines():
+            line = str(line_bytes, encoding='utf-8')
+            [checksum, filename] = line.strip().split('  ')
+
+            if filename not in tarred_files:
+                raise FileReadError("Archive appears to be invalid.")
+
+            contents = tarred_files[filename]
+            contents.seek(0)
+            sha = sha256()
+            sha.update(contents.read())
+            contents.seek(0)
+            if checksum != sha.hexdigest():
+                msg = f"Archive checksum did not match (for file {filename}). "\
+                    "It is possible the file is corrupt."
+                raise FileReadError(msg)
+
+        return {
+            name: contents
+            for name, contents in tarred_files.items()
+            if name not in ("checksums.sha256", "version")
+        }
+
+    except FileError:
+        raise
+    except Exception as e:
+        raise FileReadError(f"Unable to load archive at: {from_path}") from e
+
+
+def _resolve_cache_path(path: str | PathLike[str]) -> Path:
+    cache_path = Path(path)
+    if cache_path.is_absolute():
+        msg = "When saving to or loading from the cache, please supply a relative path."
+        raise ValueError(msg)
+    return CACHE_PATH.joinpath(cache_path).resolve()
+
+
+def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
+    """
+    Save a tar bundle of files to the cache (overwriting the existing file, if any).
+    The tar includes the sha256 checksums of every content file,
+    and a version file indicating which application version was
+    responsible for writing the file (thus allowing the application
+    to decide if a cached file is still valid when reading it).
+    """
+    save_bundle(_resolve_cache_path(to_path), version, files)
+
+
+def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
+    """
+    Load a tar bundle of files from the cache. `from_path` must be a relative path.
+    `version_at_least` optionally specifies a version number that must be met or beat
+    by the cached file in order for the file to be considered valid. If the cached file
+    was written against a version less than this, it will be considered a cache miss
+    (raises CacheMiss).
+    """
+    try:
+        return load_bundle(_resolve_cache_path(from_path), version_at_least)
+    except FileError as e:
+        raise CacheMiss() from e