Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cache utils to standardize our archive format. #106

Merged
merged 1 commit into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 12 additions & 14 deletions doc/devlog/2023-07-07.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,11 @@
"metadata": {},
"outputs": [],
"source": [
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
"try:\n",
" states_geo = StaticGeo(dataclasses.replace(spec), states_values)\n",
" states_geo.validate()\n",
" states_geo.save(\n",
" Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
" )\n",
" states_geo.save(geofile)\n",
"except GeoValidationException as e:\n",
" print(e.pretty())"
]
Expand All @@ -202,7 +201,7 @@
{
"data": {
"text/plain": [
"<epymorph.geo.static.StaticGeo at 0x7fe515ee3450>"
"<epymorph.geo.static.StaticGeo at 0x7fc2308a7d50>"
]
},
"execution_count": 6,
Expand All @@ -212,7 +211,7 @@
],
"source": [
"# Verify that we can load the file back.\n",
"F.load_from_archive(Path('epymorph/data/geo/us_states_2015.geo.tar'))"
"F.load_from_archive(geofile)"
]
},
{
Expand Down Expand Up @@ -281,39 +280,38 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
"try:\n",
" counties_geo = StaticGeo(dataclasses.replace(spec), counties_values)\n",
" counties_geo.validate()\n",
" counties_geo.save(\n",
" Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
" )\n",
" counties_geo.save(geofile)\n",
"except GeoValidationException as e:\n",
" print(e.pretty())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<epymorph.geo.static.StaticGeo at 0x7fe512419410>"
"<epymorph.geo.static.StaticGeo at 0x7fc2307a5fd0>"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Verify that we can load the file back.\n",
"F.load_from_archive(Path('epymorph/data/geo/us_counties_2015.geo.tar'))"
"F.load_from_archive(geofile)"
]
}
],
Expand All @@ -333,7 +331,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
"version": "3.11.9"
},
"orig_nbformat": 4
},
Expand Down
112 changes: 112 additions & 0 deletions doc/devlog/2024-04-25.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# devlog 2024-04-25\n",
"\n",
"_Author: Tyler Coles_\n",
"\n",
"Testing cache utilities. This script:\n",
"1. tests that reading and writing from archives works,\n",
"2. checks that we can choose to gzip or not, and\n",
"3. measures the impact of gzipping on read/write time and file size."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Save a geo without compression:\n",
"17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"Read a geo without compression:\n",
"3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"\n",
"Save a geo compressed:\n",
"20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
"Read a geo with compression:\n",
"4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
"\n",
"Bytes as a tar: 153,600\n",
"Bytes as a tgz: 134,722\n",
"Compression ratio: 87.7%\n"
]
}
],
"source": [
"import os\n",
"import shutil\n",
"import tempfile\n",
"\n",
"from epymorph import geo_library\n",
"from epymorph.geo.static import StaticGeoFileOps as F\n",
"\n",
"# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n",
"geo = geo_library['maricopa_cbg_2019']()\n",
"\n",
"tempdir = tempfile.mkdtemp()\n",
"\n",
"print(\"Save a geo without compression:\")\n",
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n",
"print(\"Read a geo without compression:\")\n",
"%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n",
"\n",
"print()\n",
"\n",
"print(\"Save a geo compressed:\")\n",
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n",
"print(\"Read a geo with compression:\")\n",
"%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n",
"\n",
"print()\n",
"\n",
"size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n",
"size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n",
"\n",
"print(f\"Bytes as a tar: {size_tar:>9,}\")\n",
"print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n",
"print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n",
"\n",
"shutil.rmtree(tempdir)\n",
"\n",
"# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
191 changes: 191 additions & 0 deletions epymorph/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""epymorph's file caching utilities."""
from hashlib import sha256
from io import BytesIO
from os import PathLike
from pathlib import Path
from tarfile import TarInfo, is_tarfile
from tarfile import open as open_tarfile

from platformdirs import user_cache_path

# Root directory of epymorph's per-user file cache; relative cache paths are resolved against it.
# `ensure_exists=True` asks platformdirs to create the directory if it is missing.
CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True)


class FileError(Exception):
    """Base class for every error this module raises while reading or writing files."""


class FileMissingError(FileError):
    """Raised when the file to be loaded does not exist."""


class FileWriteError(FileError):
    """Raised when a file could not be written."""


class FileReadError(FileError):
    """Raised when a file could not be loaded."""


class FileVersionError(FileError):
    """Raised when a file cannot be loaded because it does not meet the version requirement."""


class CacheMiss(FileError):
    """Signals that a load-from-cache operation missed the cache, whatever the reason."""


def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
"""
Save a bundle of files in our tar format with an associated version number.
`to_path` can be absolute or relative; relative paths will be resolved
against the current working directory. Folders in the path which do not exist
will be created automatically.
"""

if version <= 0:
raise ValueError("version should be greater than zero.")

try:
# Compute checksums
sha_entries = []
for name, contents in files.items():
contents.seek(0)
sha = sha256()
sha.update(contents.read())
sha_entries.append(f"{sha.hexdigest()} {name}")

# Create checksums.sha256 file
sha_file = BytesIO()
sha_text = "\n".join(sha_entries)
sha_file.write(bytes(sha_text, encoding='utf-8'))

# Create cache version file
ver_file = BytesIO()
ver_file.write(bytes(str(version), encoding="utf-8"))

tarred_files = {
**files,
"checksums.sha256": sha_file,
"version": ver_file,
}

# Write the tar to disk
tar_path = Path(to_path).resolve()
tar_path.parent.mkdir(parents=True, exist_ok=True)
mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w'
with open_tarfile(name=tar_path, mode=mode) as tar:
for name, contents in tarred_files.items():
info = TarInfo(name)
info.size = contents.tell()
contents.seek(0)
tar.addfile(info, contents)

except Exception as e:
msg = f"Unable to write archive at path: {to_path}"
raise FileWriteError(msg) from e


def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
"""
Load a bundle of files in our tar format, optionally enforcing a minimum version.
An Exception is raised if the file cannot be loaded for any reason, or if its version
is incorrect. On success, returns a dictionary of the contained files, mapping the file
name to the bytes of the file.
"""
try:
tar_path = Path(from_path).resolve()
if not tar_path.is_file():
raise FileMissingError(f"No file at: {tar_path}")

# Read the tar file into memory
tar_buffer = BytesIO()
with open(tar_path, 'rb') as f:
tar_buffer.write(f.read())
tar_buffer.seek(0)

if not is_tarfile(tar_buffer):
raise FileReadError(f"Not a tar file at: {tar_path}")

mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r'
tarred_files: dict[str, BytesIO] = {}
with open_tarfile(fileobj=tar_buffer, mode=mode) as tar:
for info in tar.getmembers():
name = info.name
contents = tar.extractfile(info)
if contents is not None:
tarred_files[name] = BytesIO(contents.read())

# Check version
if "version" in tarred_files:
ver_file = tarred_files["version"]
version = int(str(ver_file.readline(), encoding="utf-8"))
else:
version = -1
if version < version_at_least:
raise FileVersionError("Archive is an unacceptable version.")

# Verify the checksums
if "checksums.sha256" not in tarred_files:
raise FileReadError("Archive appears to be invalid.")
sha_file = tarred_files["checksums.sha256"]
for line_bytes in sha_file.readlines():
line = str(line_bytes, encoding='utf-8')
[checksum, filename] = line.strip().split(' ')

if filename not in tarred_files:
raise FileReadError("Archive appears to be invalid.")

contents = tarred_files[filename]
contents.seek(0)
sha = sha256()
sha.update(contents.read())
contents.seek(0)
if checksum != sha.hexdigest():
msg = f"Archive checksum did not match (for file {filename}). "\
"It is possible the file is corrupt."
raise FileReadError(msg)

return {
name: contents
for name, contents in tarred_files.items()
if name not in ("checksums.sha256", "version")
}

except FileError:
raise
except Exception as e:
raise FileReadError(f"Unable to load archive at: {from_path}") from e


def _resolve_cache_path(path: str | PathLike[str]) -> Path:
    """
    Resolve a relative path against the cache directory (`CACHE_PATH`).
    Raises ValueError if `path` is absolute.
    """
    relative = Path(path)
    if not relative.is_absolute():
        return (CACHE_PATH / relative).resolve()
    raise ValueError(
        "When saving to or loading from the cache, please supply a relative path."
    )


def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
    """
    Write a tar bundle of files into the cache, replacing any file already there.
    `to_path` must be a relative path; it is resolved against the cache directory.
    The bundle carries the sha256 checksum of every content file plus a version
    file recording which application version wrote it, so that the application
    can decide on read whether a cached file is still valid.
    """
    destination = _resolve_cache_path(to_path)
    save_bundle(destination, version, files)


def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
    """
    Read a tar bundle of files out of the cache. `from_path` must be a relative path.
    `version_at_least`, when given, is the lowest version the cached file may have
    been written against and still count as valid. A cached file written against
    an older version, or one that fails to load for any other file-related reason,
    is reported as a cache miss (raises CacheMiss).
    """
    cached_file = _resolve_cache_path(from_path)
    try:
        bundle = load_bundle(cached_file, version_at_least)
    except FileError as e:
        raise CacheMiss() from e
    return bundle
Loading
Loading