Skip to content

Commit

Permalink
Add cache utils to standardize our archive format. (#106)
Browse files Browse the repository at this point in the history
This is the format we have been using for GEOs, but I want to be able to use it for other things as well.
I also added versioning support so there's a way to invalidate obsolete cache entries.
Also added gzip compression support, and re-generated all the data/geos to use it.
  • Loading branch information
JavadocMD authored Apr 26, 2024
1 parent 7433936 commit e6105fd
Show file tree
Hide file tree
Showing 14 changed files with 379 additions and 128 deletions.
26 changes: 12 additions & 14 deletions doc/devlog/2023-07-07.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,11 @@
"metadata": {},
"outputs": [],
"source": [
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
"try:\n",
" states_geo = StaticGeo(dataclasses.replace(spec), states_values)\n",
" states_geo.validate()\n",
" states_geo.save(\n",
" Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
" )\n",
" states_geo.save(geofile)\n",
"except GeoValidationException as e:\n",
" print(e.pretty())"
]
Expand All @@ -202,7 +201,7 @@
{
"data": {
"text/plain": [
"<epymorph.geo.static.StaticGeo at 0x7fe515ee3450>"
"<epymorph.geo.static.StaticGeo at 0x7fc2308a7d50>"
]
},
"execution_count": 6,
Expand All @@ -212,7 +211,7 @@
],
"source": [
"# Verify that we can load the file back.\n",
"F.load_from_archive(Path('epymorph/data/geo/us_states_2015.geo.tar'))"
"F.load_from_archive(geofile)"
]
},
{
Expand Down Expand Up @@ -281,39 +280,38 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
"try:\n",
" counties_geo = StaticGeo(dataclasses.replace(spec), counties_values)\n",
" counties_geo.validate()\n",
" counties_geo.save(\n",
" Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
" )\n",
" counties_geo.save(geofile)\n",
"except GeoValidationException as e:\n",
" print(e.pretty())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<epymorph.geo.static.StaticGeo at 0x7fe512419410>"
"<epymorph.geo.static.StaticGeo at 0x7fc2307a5fd0>"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Verify that we can load the file back.\n",
"F.load_from_archive(Path('epymorph/data/geo/us_counties_2015.geo.tar'))"
"F.load_from_archive(geofile)"
]
}
],
Expand All @@ -333,7 +331,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
"version": "3.11.9"
},
"orig_nbformat": 4
},
Expand Down
112 changes: 112 additions & 0 deletions doc/devlog/2024-04-25.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# devlog 2024-04-25\n",
"\n",
"_Author: Tyler Coles_\n",
"\n",
"Testing cache utilities. This script:\n",
"1. tests that reading and writing from archives works,\n",
"2. checks that we can choose to gzip or not, and\n",
"3. measures the impact of gzipping on read/write time and file size."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Save a geo without compression:\n",
"17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"Read a geo without compression:\n",
"3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"\n",
"Save a geo compressed:\n",
"20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"Read a geo with compression:\n",
"4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"\n",
"Bytes as a tar: 153,600\n",
"Bytes as a tgz: 134,722\n",
"Compression ratio: 87.7%\n"
]
}
],
"source": [
"import os\n",
"import shutil\n",
"import tempfile\n",
"\n",
"from epymorph import geo_library\n",
"from epymorph.geo.static import StaticGeoFileOps as F\n",
"\n",
"# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n",
"geo = geo_library['maricopa_cbg_2019']()\n",
"\n",
"tempdir = tempfile.mkdtemp()\n",
"\n",
"print(\"Save a geo without compression:\")\n",
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n",
"print(\"Read a geo without compression:\")\n",
"%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n",
"\n",
"print()\n",
"\n",
"print(\"Save a geo compressed:\")\n",
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n",
"print(\"Read a geo with compression:\")\n",
"%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n",
"\n",
"print()\n",
"\n",
"size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n",
"size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n",
"\n",
"print(f\"Bytes as a tar: {size_tar:>9,}\")\n",
"print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n",
"print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n",
"\n",
"shutil.rmtree(tempdir)\n",
"\n",
"# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
191 changes: 191 additions & 0 deletions epymorph/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""epymorph's file caching utilities."""
from hashlib import sha256
from io import BytesIO
from os import PathLike
from pathlib import Path
from tarfile import TarInfo, is_tarfile
from tarfile import open as open_tarfile

from platformdirs import user_cache_path

# Root directory under which all cache-relative paths in this module are resolved.
# NOTE: `ensure_exists=True` is passed, so importing this module is expected to
# create the directory as a side effect if it is missing.
CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True)


class FileError(Exception):
    """Base class for every error raised by the file and cache utilities."""


class FileMissingError(FileError):
    """The file to be loaded does not exist."""


class FileWriteError(FileError):
    """A file could not be written."""


class FileReadError(FileError):
    """A file exists but could not be read or failed validation."""


class FileVersionError(FileError):
    """A file was readable but its version does not meet the requirement."""


class CacheMiss(FileError):
    """Signals a cache-miss (whatever the cause) when loading from the cache."""


def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
"""
Save a bundle of files in our tar format with an associated version number.
`to_path` can be absolute or relative; relative paths will be resolved
against the current working directory. Folders in the path which do not exist
will be created automatically.
"""

if version <= 0:
raise ValueError("version should be greater than zero.")

try:
# Compute checksums
sha_entries = []
for name, contents in files.items():
contents.seek(0)
sha = sha256()
sha.update(contents.read())
sha_entries.append(f"{sha.hexdigest()} {name}")

# Create checksums.sha256 file
sha_file = BytesIO()
sha_text = "\n".join(sha_entries)
sha_file.write(bytes(sha_text, encoding='utf-8'))

# Create cache version file
ver_file = BytesIO()
ver_file.write(bytes(str(version), encoding="utf-8"))

tarred_files = {
**files,
"checksums.sha256": sha_file,
"version": ver_file,
}

# Write the tar to disk
tar_path = Path(to_path).resolve()
tar_path.parent.mkdir(parents=True, exist_ok=True)
mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w'
with open_tarfile(name=tar_path, mode=mode) as tar:
for name, contents in tarred_files.items():
info = TarInfo(name)
info.size = contents.tell()
contents.seek(0)
tar.addfile(info, contents)

except Exception as e:
msg = f"Unable to write archive at path: {to_path}"
raise FileWriteError(msg) from e


def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
"""
Load a bundle of files in our tar format, optionally enforcing a minimum version.
An Exception is raised if the file cannot be loaded for any reason, or if its version
is incorrect. On success, returns a dictionary of the contained files, mapping the file
name to the bytes of the file.
"""
try:
tar_path = Path(from_path).resolve()
if not tar_path.is_file():
raise FileMissingError(f"No file at: {tar_path}")

# Read the tar file into memory
tar_buffer = BytesIO()
with open(tar_path, 'rb') as f:
tar_buffer.write(f.read())
tar_buffer.seek(0)

if not is_tarfile(tar_buffer):
raise FileReadError(f"Not a tar file at: {tar_path}")

mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r'
tarred_files: dict[str, BytesIO] = {}
with open_tarfile(fileobj=tar_buffer, mode=mode) as tar:
for info in tar.getmembers():
name = info.name
contents = tar.extractfile(info)
if contents is not None:
tarred_files[name] = BytesIO(contents.read())

# Check version
if "version" in tarred_files:
ver_file = tarred_files["version"]
version = int(str(ver_file.readline(), encoding="utf-8"))
else:
version = -1
if version < version_at_least:
raise FileVersionError("Archive is an unacceptable version.")

# Verify the checksums
if "checksums.sha256" not in tarred_files:
raise FileReadError("Archive appears to be invalid.")
sha_file = tarred_files["checksums.sha256"]
for line_bytes in sha_file.readlines():
line = str(line_bytes, encoding='utf-8')
[checksum, filename] = line.strip().split(' ')

if filename not in tarred_files:
raise FileReadError("Archive appears to be invalid.")

contents = tarred_files[filename]
contents.seek(0)
sha = sha256()
sha.update(contents.read())
contents.seek(0)
if checksum != sha.hexdigest():
msg = f"Archive checksum did not match (for file {filename}). "\
"It is possible the file is corrupt."
raise FileReadError(msg)

return {
name: contents
for name, contents in tarred_files.items()
if name not in ("checksums.sha256", "version")
}

except FileError:
raise
except Exception as e:
raise FileReadError(f"Unable to load archive at: {from_path}") from e


def _resolve_cache_path(path: str | PathLike[str]) -> Path:
    """
    Turn a cache-relative path into a fully resolved path inside `CACHE_PATH`.
    Raises ValueError if `path` is absolute.
    """
    relative = Path(path)
    if not relative.is_absolute():
        return (CACHE_PATH / relative).resolve()
    raise ValueError(
        "When saving to or loading from the cache, please supply a relative path."
    )


def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
    """
    Write `files` as a tar bundle at `to_path` within the cache, replacing
    any file already there. `to_path` must be a relative path.
    Alongside the content files, the bundle stores their sha256 checksums
    and a version file recording `version`, so that the application can
    decide on read whether a cached file is still valid.
    """
    destination = _resolve_cache_path(to_path)
    save_bundle(destination, version, files)


def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
    """
    Read a tar bundle of files out of the cache. `from_path` must be a relative path.
    `version_at_least`, when given, is the lowest version the cached file may have
    and still be accepted. Any file error while loading, including a cached file
    written against a lower version, is reported as a cache miss
    (raises CacheMiss).
    """
    try:
        source = _resolve_cache_path(from_path)
        return load_bundle(source, version_at_least)
    except FileError as cause:
        raise CacheMiss() from cause
Loading

0 comments on commit e6105fd

Please sign in to comment.