Skip to content

Commit 0dcc5cf

Browse files
author
Tyler Coles
committed
Add cache utils to standardize our archive format.
This is the format we have been using for GEOs, but I want to be able to use it for other things as well. I also added versioning support so there's a way to invalidate obsolete cache entries. Also added gzip compression support, and re-generated all the data/geos to use it.
1 parent 7433936 commit 0dcc5cf

14 files changed

+379
-128
lines changed

doc/devlog/2023-07-07.ipynb

+12-14
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,11 @@
184184
"metadata": {},
185185
"outputs": [],
186186
"source": [
187+
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
187188
"try:\n",
188189
" states_geo = StaticGeo(dataclasses.replace(spec), states_values)\n",
189190
" states_geo.validate()\n",
190-
" states_geo.save(\n",
191-
" Path('epymorph/data/geo') / F.to_archive_filename('us_states_2015')\n",
192-
" )\n",
191+
" states_geo.save(geofile)\n",
193192
"except GeoValidationException as e:\n",
194193
" print(e.pretty())"
195194
]
@@ -202,7 +201,7 @@
202201
{
203202
"data": {
204203
"text/plain": [
205-
"<epymorph.geo.static.StaticGeo at 0x7fe515ee3450>"
204+
"<epymorph.geo.static.StaticGeo at 0x7fc2308a7d50>"
206205
]
207206
},
208207
"execution_count": 6,
@@ -212,7 +211,7 @@
212211
],
213212
"source": [
214213
"# Verify that we can load the file back.\n",
215-
"F.load_from_archive(Path('epymorph/data/geo/us_states_2015.geo.tar'))"
214+
"F.load_from_archive(geofile)"
216215
]
217216
},
218217
{
@@ -281,39 +280,38 @@
281280
},
282281
{
283282
"cell_type": "code",
284-
"execution_count": 10,
283+
"execution_count": 9,
285284
"metadata": {},
286285
"outputs": [],
287286
"source": [
287+
"geofile = Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
288288
"try:\n",
289289
" counties_geo = StaticGeo(dataclasses.replace(spec), counties_values)\n",
290290
" counties_geo.validate()\n",
291-
" counties_geo.save(\n",
292-
" Path('epymorph/data/geo') / F.to_archive_filename('us_counties_2015')\n",
293-
" )\n",
291+
" counties_geo.save(geofile)\n",
294292
"except GeoValidationException as e:\n",
295293
" print(e.pretty())"
296294
]
297295
},
298296
{
299297
"cell_type": "code",
300-
"execution_count": 11,
298+
"execution_count": 10,
301299
"metadata": {},
302300
"outputs": [
303301
{
304302
"data": {
305303
"text/plain": [
306-
"<epymorph.geo.static.StaticGeo at 0x7fe512419410>"
304+
"<epymorph.geo.static.StaticGeo at 0x7fc2307a5fd0>"
307305
]
308306
},
309-
"execution_count": 11,
307+
"execution_count": 10,
310308
"metadata": {},
311309
"output_type": "execute_result"
312310
}
313311
],
314312
"source": [
315313
"# Verify that we can load the file back.\n",
316-
"F.load_from_archive(Path('epymorph/data/geo/us_counties_2015.geo.tar'))"
314+
"F.load_from_archive(geofile)"
317315
]
318316
}
319317
],
@@ -333,7 +331,7 @@
333331
"name": "python",
334332
"nbconvert_exporter": "python",
335333
"pygments_lexer": "ipython3",
336-
"version": "3.11.6"
334+
"version": "3.11.9"
337335
},
338336
"orig_nbformat": 4
339337
},

doc/devlog/2024-04-25.ipynb

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# devlog 2024-04-25\n",
8+
"\n",
9+
"_Author: Tyler Coles_\n",
10+
"\n",
11+
"Testing cache utilities. This script:\n",
12+
"1. tests that reading and writing from archives works,\n",
13+
"2. checks that we can choose to gzip or not, and\n",
14+
"3. measures the impact of gzipping on read/write time and file size."
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 2,
20+
"metadata": {},
21+
"outputs": [
22+
{
23+
"name": "stdout",
24+
"output_type": "stream",
25+
"text": [
26+
"Save a geo without compression:\n",
27+
"17.8 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
28+
"Read a geo without compression:\n",
29+
"3.74 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
30+
"\n",
31+
"Save a geo compressed:\n",
32+
"20.6 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
33+
"Read a geo with compression:\n",
34+
"4.87 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
35+
"\n",
36+
"Bytes as a tar: 153,600\n",
37+
"Bytes as a tgz: 134,722\n",
38+
"Compression ratio: 87.7%\n"
39+
]
40+
}
41+
],
42+
"source": [
43+
"import os\n",
44+
"import shutil\n",
45+
"import tempfile\n",
46+
"\n",
47+
"from epymorph import geo_library\n",
48+
"from epymorph.geo.static import StaticGeoFileOps as F\n",
49+
"\n",
50+
"# Our subject geo can be anything, but this one is a useful demo because it's sizeable.\n",
51+
"geo = geo_library['maricopa_cbg_2019']()\n",
52+
"\n",
53+
"tempdir = tempfile.mkdtemp()\n",
54+
"\n",
55+
"print(\"Save a geo without compression:\")\n",
56+
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tar\")\n",
57+
"print(\"Read a geo without compression:\")\n",
58+
"%timeit F.load_from_archive(f\"{tempdir}/geo.tar\")\n",
59+
"\n",
60+
"print()\n",
61+
"\n",
62+
"print(\"Save a geo compressed:\")\n",
63+
"%timeit F.save_as_archive(geo, f\"{tempdir}/geo.tgz\")\n",
64+
"print(\"Read a geo with compression:\")\n",
65+
"%timeit F.load_from_archive(f\"{tempdir}/geo.tgz\")\n",
66+
"\n",
67+
"print()\n",
68+
"\n",
69+
"size_tar = os.path.getsize(f\"{tempdir}/geo.tar\")\n",
70+
"size_tgz = os.path.getsize(f\"{tempdir}/geo.tgz\")\n",
71+
"\n",
72+
"print(f\"Bytes as a tar: {size_tar:>9,}\")\n",
73+
"print(f\"Bytes as a tgz: {size_tgz:>9,}\")\n",
74+
"print(f\"Compression ratio: {(size_tgz / size_tar):.1%}\")\n",
75+
"\n",
76+
"shutil.rmtree(tempdir)\n",
77+
"\n",
78+
"# NOTE: the %timeit magics break isort and autopep8, so you're on your own for formatting"
79+
]
80+
},
81+
{
82+
"cell_type": "markdown",
83+
"metadata": {},
84+
"source": [
85+
"## Conclusion\n",
86+
"\n",
87+
"We get decent savings in bytes by storing geos gzipped, and it doesn't take much longer to read and write. ✓"
88+
]
89+
}
90+
],
91+
"metadata": {
92+
"kernelspec": {
93+
"display_name": ".venv",
94+
"language": "python",
95+
"name": "python3"
96+
},
97+
"language_info": {
98+
"codemirror_mode": {
99+
"name": "ipython",
100+
"version": 3
101+
},
102+
"file_extension": ".py",
103+
"mimetype": "text/x-python",
104+
"name": "python",
105+
"nbconvert_exporter": "python",
106+
"pygments_lexer": "ipython3",
107+
"version": "3.11.9"
108+
}
109+
},
110+
"nbformat": 4,
111+
"nbformat_minor": 2
112+
}

epymorph/cache.py

+191
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
"""epymorph's file caching utilities."""
2+
from hashlib import sha256
3+
from io import BytesIO
4+
from os import PathLike
5+
from pathlib import Path
6+
from tarfile import TarInfo, is_tarfile
7+
from tarfile import open as open_tarfile
8+
9+
from platformdirs import user_cache_path
10+
11+
# Root directory under which all cache-relative paths in this module are resolved:
# the per-user cache location that platformdirs reports for the app name 'epymorph'.
# NOTE(review): `ensure_exists=True` presumably makes platformdirs create the
# directory, which would mean importing this module touches the filesystem — confirm.
CACHE_PATH = user_cache_path(appname='epymorph', ensure_exists=True)
12+
13+
14+
class FileError(Exception):
    """Common base class for every error raised by the file utilities in this module."""


class FileMissingError(FileError):
    """The file to be loaded does not exist."""


class FileWriteError(FileError):
    """A file could not be written."""


class FileReadError(FileError):
    """A file could not be loaded."""


class FileVersionError(FileError):
    """A file could not be loaded because it does not meet the version requirement."""


class CacheMiss(FileError):
    """A load-from-cache operation found no usable entry, whatever the reason."""
36+
37+
38+
def save_bundle(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
39+
"""
40+
Save a bundle of files in our tar format with an associated version number.
41+
`to_path` can be absolute or relative; relative paths will be resolved
42+
against the current working directory. Folders in the path which do not exist
43+
will be created automatically.
44+
"""
45+
46+
if version <= 0:
47+
raise ValueError("version should be greater than zero.")
48+
49+
try:
50+
# Compute checksums
51+
sha_entries = []
52+
for name, contents in files.items():
53+
contents.seek(0)
54+
sha = sha256()
55+
sha.update(contents.read())
56+
sha_entries.append(f"{sha.hexdigest()} {name}")
57+
58+
# Create checksums.sha256 file
59+
sha_file = BytesIO()
60+
sha_text = "\n".join(sha_entries)
61+
sha_file.write(bytes(sha_text, encoding='utf-8'))
62+
63+
# Create cache version file
64+
ver_file = BytesIO()
65+
ver_file.write(bytes(str(version), encoding="utf-8"))
66+
67+
tarred_files = {
68+
**files,
69+
"checksums.sha256": sha_file,
70+
"version": ver_file,
71+
}
72+
73+
# Write the tar to disk
74+
tar_path = Path(to_path).resolve()
75+
tar_path.parent.mkdir(parents=True, exist_ok=True)
76+
mode = 'w:gz' if tar_path.suffix == '.tgz' else 'w'
77+
with open_tarfile(name=tar_path, mode=mode) as tar:
78+
for name, contents in tarred_files.items():
79+
info = TarInfo(name)
80+
info.size = contents.tell()
81+
contents.seek(0)
82+
tar.addfile(info, contents)
83+
84+
except Exception as e:
85+
msg = f"Unable to write archive at path: {to_path}"
86+
raise FileWriteError(msg) from e
87+
88+
89+
def load_bundle(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
90+
"""
91+
Load a bundle of files in our tar format, optionally enforcing a minimum version.
92+
An Exception is raised if the file cannot be loaded for any reason, or if its version
93+
is incorrect. On success, returns a dictionary of the contained files, mapping the file
94+
name to the bytes of the file.
95+
"""
96+
try:
97+
tar_path = Path(from_path).resolve()
98+
if not tar_path.is_file():
99+
raise FileMissingError(f"No file at: {tar_path}")
100+
101+
# Read the tar file into memory
102+
tar_buffer = BytesIO()
103+
with open(tar_path, 'rb') as f:
104+
tar_buffer.write(f.read())
105+
tar_buffer.seek(0)
106+
107+
if not is_tarfile(tar_buffer):
108+
raise FileReadError(f"Not a tar file at: {tar_path}")
109+
110+
mode = 'r:gz' if tar_path.suffix == '.tgz' else 'r'
111+
tarred_files: dict[str, BytesIO] = {}
112+
with open_tarfile(fileobj=tar_buffer, mode=mode) as tar:
113+
for info in tar.getmembers():
114+
name = info.name
115+
contents = tar.extractfile(info)
116+
if contents is not None:
117+
tarred_files[name] = BytesIO(contents.read())
118+
119+
# Check version
120+
if "version" in tarred_files:
121+
ver_file = tarred_files["version"]
122+
version = int(str(ver_file.readline(), encoding="utf-8"))
123+
else:
124+
version = -1
125+
if version < version_at_least:
126+
raise FileVersionError("Archive is an unacceptable version.")
127+
128+
# Verify the checksums
129+
if "checksums.sha256" not in tarred_files:
130+
raise FileReadError("Archive appears to be invalid.")
131+
sha_file = tarred_files["checksums.sha256"]
132+
for line_bytes in sha_file.readlines():
133+
line = str(line_bytes, encoding='utf-8')
134+
[checksum, filename] = line.strip().split(' ')
135+
136+
if filename not in tarred_files:
137+
raise FileReadError("Archive appears to be invalid.")
138+
139+
contents = tarred_files[filename]
140+
contents.seek(0)
141+
sha = sha256()
142+
sha.update(contents.read())
143+
contents.seek(0)
144+
if checksum != sha.hexdigest():
145+
msg = f"Archive checksum did not match (for file {filename}). "\
146+
"It is possible the file is corrupt."
147+
raise FileReadError(msg)
148+
149+
return {
150+
name: contents
151+
for name, contents in tarred_files.items()
152+
if name not in ("checksums.sha256", "version")
153+
}
154+
155+
except FileError:
156+
raise
157+
except Exception as e:
158+
raise FileReadError(f"Unable to load archive at: {from_path}") from e
159+
160+
161+
def _resolve_cache_path(path: str | PathLike[str]) -> Path:
    """
    Translate a cache-relative path into its resolved location under CACHE_PATH.
    Absolute paths are refused with a ValueError.
    """
    relative = Path(path)
    if not relative.is_absolute():
        return (CACHE_PATH / relative).resolve()
    raise ValueError(
        "When saving to or loading from the cache, please supply a relative path."
    )
167+
168+
169+
def save_bundle_to_cache(to_path: str | PathLike[str], version: int, files: dict[str, BytesIO]) -> None:
    """
    Write a tar bundle of files into the cache, replacing any file already there.
    `to_path` must be a relative path; it is resolved against the cache directory.
    The bundle carries the sha256 checksum of every content file, along with a
    version file recording which application version wrote it, so that a reader
    can decide whether the cached file is still valid.
    """
    destination = _resolve_cache_path(to_path)
    save_bundle(destination, version, files)
178+
179+
180+
def load_bundle_from_cache(from_path: str | PathLike[str], version_at_least: int = -1) -> dict[str, BytesIO]:
    """
    Read a tar bundle of files out of the cache. `from_path` must be a relative path.
    `version_at_least`, when given, is the lowest version the cached file may have
    been written with and still count as valid. Any FileError raised while loading
    (including an unmet version requirement) is reported as a cache miss by raising
    CacheMiss.
    """
    try:
        cached_file = _resolve_cache_path(from_path)
        return load_bundle(cached_file, version_at_least)
    except FileError as error:
        raise CacheMiss() from error

0 commit comments

Comments
 (0)