Add cache utils to standardize our archive format. (#106)
This is the format we have been using for geos, but I want to be able to use it for other things as well. I also added versioning support so there's a way to invalidate obsolete cache entries, as well as gzip compression support, and re-generated all the data/geos to use it.
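For a sense of the intended usage, here is a minimal sketch. The module path (epymorph.cache) and the file names are illustrative assumptions; save_bundle and load_bundle are defined in the diff below.

# Minimal sketch of the archive format in use. The module path
# (epymorph.cache) and the file names are illustrative assumptions.
from io import BytesIO

from epymorph.cache import load_bundle, save_bundle

files = {
    "data.csv": BytesIO(b"a,b\n1,2\n"),
    "metadata.txt": BytesIO(b"example"),
}

# A .tgz suffix selects gzip compression; plain .tar is uncompressed.
save_bundle("example.tgz", version=1, files=files)

# Enforcing a minimum version lets callers invalidate obsolete archives;
# asking for version_at_least=2 here would raise FileVersionError.
loaded = load_bundle("example.tgz", version_at_least=1)
assert loaded.keys() == files.keys()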
Showing 14 changed files with 379 additions and 128 deletions.
@@ -0,0 +1,112 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# devlog 2024-04-25\n",
    "\n",
    "_Author: Tyler Coles_\n",
    "\n",
    "Testing cache utilities. This script:\n",
    "1. tests that reading and writing from archives works,\n",
    "2. checks that we can choose to gzip or not, and\n",
    "3. measures the impact of gzipping on read/write time and file size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save a geo without compression:\n",
      "17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "Read a geo without compression:\n",
      "3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "\n",
      "Save a geo compressed:\n",
      "20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
      "Read a geo with compression:\n",
      "4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "\n",
      "Bytes as a tar: 153,600\n",
      "Bytes as a tgz: 134,722\n",
      "Compression ratio: 87.7%\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import shutil\n",
    "import tempfile\n",
    "\n",
    "from epymorph import geo_library\n",
    "from epymorph.geo.static import StaticGeoFileOps as F\n",
    "\n",
    "# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n",
    "geo = geo_library['maricopa_cbg_2019']()\n",
    "\n",
    "tempdir = tempfile.mkdtemp()\n",
    "\n",
    "print(\"Save a geo without compression:\")\n",
    "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n",
    "print(\"Read a geo without compression:\")\n",
    "%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n",
    "\n",
    "print()\n",
    "\n",
    "print(\"Save a geo compressed:\")\n",
    "%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n",
    "print(\"Read a geo with compression:\")\n",
    "%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n",
    "\n",
    "print()\n",
    "\n",
    "size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n",
    "size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n",
    "\n",
    "print(f\"Bytes as a tar: {size_tar:>9,}\")\n",
    "print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n",
    "print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n",
    "\n",
    "shutil.rmtree(tempdir)\n",
    "\n",
    "# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "\n",
    "We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
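As a quick sanity check on that conclusion, the notebook's printed measurements work out to roughly 12% size savings in exchange for about 16% slower writes and 30% slower reads:

# Derived from the notebook's printed output above.
size_tar, size_tgz = 153_600, 134_722
print(f"Size savings:   {1 - size_tgz / size_tar:.1%}")  # 12.3%
print(f"Write overhead: {20.6 / 17.8 - 1:.0%}")          # 16%
print(f"Read overhead:  {4.87 / 3.74 - 1:.0%}")          # 30%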
@@ -0,0 +1,191 @@
"""epymorph's file caching utilities."""
from hashlib import sha256
from io import BytesIO
from os import PathLike
from pathlib import Path
from tarfile import TarInfo, is_tarfile
from tarfile import open as open_tarfile

from platformdirs import user_cache_path

CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True)


class FileError(Exception):
    """Error during a file operation."""


class FileMissingError(FileError):
    """Error loading a file, as it does not exist."""


class FileWriteError(FileError):
    """Error writing a file."""


class FileReadError(FileError):
    """Error loading a file."""


class FileVersionError(FileError):
    """Error loading a file due to unmet version requirements."""


class CacheMiss(FileError):
    """Raised on a cache-miss (for any reason) during a load-from-cache operation."""


def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
    """
    Save a bundle of files in our tar format with an associated version number.
    `to_path` can be absolute or relative; relative paths will be resolved
    against the current working directory. Folders in the path which do not exist
    will be created automatically.
    """

    if version <= 0:
        raise ValueError("version should be greater than zero.")

    try:
        # Compute checksums
        sha_entries = []
        for name, contents in files.items():
            contents.seek(0)
            sha = sha256()
            sha.update(contents.read())
            sha_entries.append(f"{sha.hexdigest()} {name}")

        # Create checksums.sha256 file
        sha_file = BytesIO()
        sha_text = "\n".join(sha_entries)
        sha_file.write(bytes(sha_text, encoding='utf-8'))

        # Create cache version file
        ver_file = BytesIO()
        ver_file.write(bytes(str(version), encoding="utf-8"))

        tarred_files = {
            **files,
            "checksums.sha256": sha_file,
            "version": ver_file,
        }

        # Write the tar to disk
        tar_path = Path(to_path).resolve()
        tar_path.parent.mkdir(parents=True, exist_ok=True)
        mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w'
        with open_tarfile(name=tar_path, mode=mode) as tar:
            for name, contents in tarred_files.items():
                info = TarInfo(name)
                info.size = contents.tell()
                contents.seek(0)
                tar.addfile(info, contents)

    except Exception as e:
        msg = f"Unable to write archive at path: {to_path}"
        raise FileWriteError(msg) from e


def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
    """
    Load a bundle of files in our tar format, optionally enforcing a minimum version.
    An Exception is raised if the file cannot be loaded for any reason, or if its version
    is incorrect. On success, returns a dictionary of the contained files, mapping the file
    name to the bytes of the file.
    """
    try:
        tar_path = Path(from_path).resolve()
        if not tar_path.is_file():
            raise FileMissingError(f"No file at: {tar_path}")

        # Read the tar file into memory
        tar_buffer = BytesIO()
        with open(tar_path, 'rb') as f:
            tar_buffer.write(f.read())
        tar_buffer.seek(0)

        if not is_tarfile(tar_buffer):
            raise FileReadError(f"Not a tar file at: {tar_path}")

        mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r'
        tarred_files: dict[str, BytesIO] = {}
        with open_tarfile(fileobj=tar_buffer, mode=mode) as tar:
            for info in tar.getmembers():
                name = info.name
                contents = tar.extractfile(info)
                if contents is not None:
                    tarred_files[name] = BytesIO(contents.read())

        # Check version
        if "version" in tarred_files:
            ver_file = tarred_files["version"]
            version = int(str(ver_file.readline(), encoding="utf-8"))
        else:
            version = -1
        if version < version_at_least:
            raise FileVersionError("Archive is an unacceptable version.")

        # Verify the checksums
        if "checksums.sha256" not in tarred_files:
            raise FileReadError("Archive appears to be invalid.")
        sha_file = tarred_files["checksums.sha256"]
        for line_bytes in sha_file.readlines():
            line = str(line_bytes, encoding='utf-8')
            [checksum, filename] = line.strip().split(' ')

            if filename not in tarred_files:
                raise FileReadError("Archive appears to be invalid.")

            contents = tarred_files[filename]
            contents.seek(0)
            sha = sha256()
            sha.update(contents.read())
            contents.seek(0)
            if checksum != sha.hexdigest():
                msg = f"Archive checksum did not match (for file {filename}). "\
                    "It is possible the file is corrupt."
                raise FileReadError(msg)

        return {
            name: contents
            for name, contents in tarred_files.items()
            if name not in ("checksums.sha256", "version")
        }

    except FileError:
        raise
    except Exception as e:
        raise FileReadError(f"Unable to load archive at: {from_path}") from e


def _resolve_cache_path(path: str | PathLike[str]) -> Path:
    cache_path = Path(path)
    if cache_path.is_absolute():
        msg = "When saving to or loading from the cache, please supply a relative path."
        raise ValueError(msg)
    return CACHE_PATH.joinpath(cache_path).resolve()


def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
    """
    Save a tar bundle of files to the cache (overwriting the existing file, if any).
    The tar includes the sha256 checksums of every content file,
    and a version file indicating which application version was
    responsible for writing the file (thus allowing the application
    to decide if a cached file is still valid when reading it).
    """
    save_bundle(_resolve_cache_path(to_path), version, files)


def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
    """
    Load a tar bundle of files from the cache. `from_path` must be a relative path.
    `version_at_least` optionally specifies a version number that must be met or beat
    by the cached file in order for the file to be considered valid. If the cached file
    was written against a version less than this, it will be considered a cache miss
    (raises CacheMiss).
    """
    try:
        return load_bundle(_resolve_cache_path(from_path), version_at_least)
    except FileError as e:
        raise CacheMiss() from e
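To round this out, a usage sketch of the cache round-trip these helpers enable. The module path (epymorph.cache) and the relative path used are assumptions for illustration.

# Usage sketch for the cache helpers (module path epymorph.cache is assumed).
from io import BytesIO

from epymorph.cache import CacheMiss, load_bundle_from_cache, save_bundle_to_cache

# Paths are relative to CACHE_PATH; absolute paths raise ValueError.
save_bundle_to_cache("geos/example.tgz", version=2,
                     files={"geo.json": BytesIO(b"{}")})

try:
    # Succeeds: the cached bundle was written at version 2.
    files = load_bundle_from_cache("geos/example.tgz", version_at_least=2)
except CacheMiss:
    # Raised if the file is missing, corrupt, or written at a version < 2.
    print("cache entry is missing or obsolete")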