Skip to content

Commit

Permalink
OCFL storage
Browse files Browse the repository at this point in the history
  • Loading branch information
shsdev committed Nov 25, 2024
1 parent ffb9a93 commit f52e2f2
Show file tree
Hide file tree
Showing 23 changed files with 9,827 additions and 2 deletions.
12 changes: 10 additions & 2 deletions eatb/pairtree_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,19 @@ def _next_version(self, identifier):
:return: next formatted version directory name
"""
if not self.identifier_object_exists(identifier):
return VersionDirFormat % 0
version_num = 1
return VersionDirFormat % 1
version_num = 2
while self.identifier_version_object_exists(identifier, version_num):
version_num += 1
return VersionDirFormat % version_num

def next_version(self, identifier):
    """
    Public accessor for the name of the next version directory.
    :param identifier: identifier
    :return: the value produced by ``_next_version`` for this identifier
    """
    upcoming_version_dir = self._next_version(identifier)
    return upcoming_version_dir

def curr_version(self, identifier):
"""
Expand Down
167 changes: 167 additions & 0 deletions eatb/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# -*- coding: utf-8 -*-
import json
import os
import shutil
import hashlib
from collections import defaultdict
from datetime import datetime
from subprocess import check_output

from eatb.cli import CliCommand, CliCommands
Expand All @@ -27,3 +31,166 @@ def get_hashed_filelist(strip_path_part, directory, commands=CliCommands()):
return result


def compute_sha512(file_path):
    """
    Return the hexadecimal SHA-512 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash in 8 KB blocks, so
    the whole content is never held in memory at once.
    """
    digest = hashlib.sha512()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:  # empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()


def compute_md5(file_path):
    """
    Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode, 8 KB at a time.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()


def compute_file_hashes(file_path):
    """
    Compute the SHA-512 and MD5 digests of a file in a single pass.

    The file is opened once in binary mode and every 8 KB block is fed to
    both hash objects, so the content is read from disk only once and both
    digests are guaranteed to describe the same bytes.

    :param file_path: path of the file to hash
    :return: dict with the keys ``"sha512"`` and ``"md5"`` (in that order)
        mapping to the hexadecimal digests
    """
    digests = {
        "sha512": hashlib.sha512(),
        "md5": hashlib.md5(),
    }
    with open(file_path, "rb") as stream:
        for block in iter(lambda: stream.read(8192), b""):
            for digest in digests.values():
                digest.update(block)
    return {algo: digest.hexdigest() for algo, digest in digests.items()}


def get_sha512_hash(file_path):
    """Return the hex SHA-512 digest of a file, reading it in 8 KB blocks."""
    hasher = hashlib.sha512()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel stops as soon as read() yields b"" (end of file)
        for block in iter(lambda: stream.read(8192), b""):
            hasher.update(block)
    return hasher.hexdigest()


def write_inventory_from_directory(identifier, version, data_dir, action, metadata=None):
    """
    Create or update ``inventory.json`` for the object stored in ``data_dir``.

    Every file found below ``<data_dir>/<version>`` is hashed and registered
    in the inventory's ``fixity`` section (all computed algorithms) and
    ``manifest`` section (SHA-512 only) as ``<version>/<relative path>``, and
    in the new version's ``state`` as the bare relative path. Each path is
    recorded at most once per digest, so calling the function again for the
    same version does not duplicate entries. Afterwards the inventory, the
    object declaration file and the ``inventory.json.sha512`` sidecar are
    (re)written into ``data_dir``.

    :param identifier: object id stored under ``id`` when a new inventory is created
    :param version: version directory name (e.g. ``v00001``); becomes ``head``
    :param data_dir: existing directory holding the version directories
    :param action: text stored as the version's ``message``
    :param metadata: optional mapping with ``added`` / ``removed`` path lists;
        ``None`` is treated as an empty mapping
    :raises ValueError: if ``data_dir`` does not exist
    """
    # Local import: the module-level import only brings in ``datetime`` itself.
    from datetime import timezone

    if not os.path.exists(data_dir):
        raise ValueError(f"Data directory does not exist: {data_dir}")

    # ``metadata`` is optional; without it nothing is reported as added/removed.
    metadata = metadata or {}

    inventory_path = os.path.join(data_dir, "inventory.json")

    # Load or initialize the inventory
    if os.path.exists(inventory_path):
        with open(inventory_path, "r", encoding="utf-8") as f:
            inventory = json.load(f)

        # json.load returns plain dicts; turn them back into defaultdicts so
        # new algorithms/digests can be appended to without key checks.
        inventory["fixity"] = defaultdict(lambda: defaultdict(list), {
            algo: defaultdict(list, hash_dict)
            for algo, hash_dict in inventory.get("fixity", {}).items()
        })
        inventory["manifest"] = defaultdict(list, inventory.get("manifest", {}))
        # Tolerate an existing inventory that has no version section yet.
        inventory.setdefault("versions", {})
    else:
        inventory = {
            "digestAlgorithm": "sha512",
            "fixity": defaultdict(lambda: defaultdict(list)),
            "head": version,
            "id": identifier,
            "manifest": defaultdict(list),
            "type": "https://ocfl.io/1.1/spec/#inventory",
            "versions": {}
        }

    # Prepare version entry (timestamp is UTC, formatted with a trailing "Z")
    version_entry = {
        "created": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        "message": action,
        "state": defaultdict(list),
        "added": metadata.get("added", []),
        "removed": metadata.get("removed", []),
    }

    # Process files in the version directory
    version_dir = os.path.join(data_dir, version)
    for subdir, _, files in os.walk(version_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            # Path relative to the version directory, and the same path
            # prefixed with the version name as used in fixity/manifest.
            relative_path = os.path.relpath(file_path, version_dir)
            content_path = f"{version}/{relative_path}"
            hashes = compute_file_hashes(file_path)

            for algo, hash_val in hashes.items():
                print(f"Adding: {algo}, {hash_val}, {relative_path}")
                fixity_paths = inventory["fixity"][algo][hash_val]
                # Guard against duplicates when the same version is processed twice
                if content_path not in fixity_paths:
                    fixity_paths.append(content_path)

            # The manifest and the version state are keyed by the SHA-512 digest
            manifest_paths = inventory["manifest"][hashes["sha512"]]
            if content_path not in manifest_paths:
                manifest_paths.append(content_path)

            state_paths = version_entry["state"][hashes["sha512"]]
            if relative_path not in state_paths:
                state_paths.append(relative_path)

    # Update inventory
    inventory["versions"][version] = version_entry
    inventory["head"] = version

    # Write updated inventory
    with open(inventory_path, "w", encoding="utf-8") as f:
        json.dump(inventory, f, indent=4)

    # Write OCFL object declaration
    # NOTE(review): the declaration names version 1.0 while the inventory
    # "type" above points at the 1.1 spec -- confirm which one is intended.
    with open(os.path.join(data_dir, "0=ocfl_object_1.0"), "w", encoding="utf-8") as ocfl_file:
        ocfl_file.write("ocfl_object_1.0")

    # The sidecar must describe the inventory exactly as it was written above
    inventory_hash = get_sha512_hash(inventory_path)
    with open(os.path.join(data_dir, "inventory.json.sha512"), "w", encoding="utf-8") as hash_file:
        hash_file.write(f"{inventory_hash} inventory.json")


def update_storage_with_differences(working_dir, new_version_target_dir, previous_version_existing_dir, inventory_path, exclude_files=None):
    """
    Copy only new or modified files to the storage directory and identify deleted files.

    The state of the previous version is read from the inventory (if the
    inventory file exists). Every file below ``working_dir`` whose SHA-512
    digest differs from that state, or which is not part of it, is copied to
    the same relative location below ``new_version_target_dir``. Target
    directories are created only when a file is actually copied, so unchanged
    or excluded files leave no empty directories behind.

    :param working_dir: directory holding the current content
    :param new_version_target_dir: directory receiving new/changed files
    :param previous_version_existing_dir: key of the previous version in the
        inventory's ``versions`` section (e.g. ``v00001``)
    :param inventory_path: path of ``inventory.json``; may not exist yet
    :param exclude_files: optional collection of file names (matched against
        the bare file name, not the relative path) that are never copied;
        ``None`` excludes nothing
    :raises TypeError: if ``exclude_files`` is neither empty nor a
        list/tuple/set/frozenset
    :return: tuple ``(added_or_changed, deleted_files)`` of relative paths
    """
    if exclude_files and not isinstance(exclude_files, (list, tuple, set, frozenset)):
        raise TypeError("param 'exclude_files' must be of type list")
    # ``None`` (the default) and other empty values mean "exclude nothing"
    excluded_names = exclude_files or ()

    added_or_changed = []
    previous_files = {}

    # Load inventory to get the state of the previous version
    if os.path.exists(inventory_path):
        with open(inventory_path, "r", encoding="utf-8") as f:
            inventory = json.load(f)
        previous_version_state = (
            inventory.get("versions", {})
            .get(previous_version_existing_dir, {})
            .get("state", {})
        )
        # Invert digest -> [paths] into path -> digest for direct lookup
        for hash_val, paths in previous_version_state.items():
            for path in paths:
                previous_files[path] = hash_val

    # Compare files in the working directory with the previous version
    current_files = set()
    for subdir, _, files in os.walk(working_dir):
        for file in files:
            source_file = os.path.join(subdir, file)
            relative_path = os.path.relpath(source_file, working_dir)
            # Excluded files still count as present, so they are never
            # reported as deleted.
            current_files.add(relative_path)

            if file in excluded_names:
                continue

            current_hash = compute_sha512(source_file)
            if previous_files.get(relative_path) == current_hash:
                continue  # unchanged since the previous version

            target_file = os.path.join(new_version_target_dir, relative_path)
            # Skip the copy when the target already holds identical content
            if not os.path.exists(target_file) or compute_sha512(target_file) != current_hash:
                os.makedirs(os.path.dirname(target_file), exist_ok=True)
                shutil.copy2(source_file, target_file)
                # NOTE(review): a file is reported only when it was actually
                # copied -- confirm this matches the intended semantics.
                added_or_changed.append(relative_path)

    # Identify deleted files: known to the previous version, gone from the working dir
    deleted_files = [path for path in previous_files if path not in current_files]

    return added_or_changed, deleted_files
131 changes: 131 additions & 0 deletions tests/test_ocfl_storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Test storage module"""
import os
import shutil
import json
import unittest
from eatb import ROOT
from eatb.utils import randomutils
from eatb.storage import update_storage_with_differences
from eatb.storage import write_inventory_from_directory

# Read-only test resources shipped with the repository
OCFL_STORAGE_DIR = os.path.join(ROOT, 'tests/test_resources/ocfl-storage/')
EXAMPLE_WORKING_DIR = os.path.join(OCFL_STORAGE_DIR, 'working-dir')
# Scratch area for one test run; the random suffix avoids clashes between runs
TMP_DIRECTORY = '/tmp/temp-' + randomutils.randomword(10)
# Joined with a path separator so both directories live INSIDE TMP_DIRECTORY
# and disappear when tearDownClass removes TMP_DIRECTORY.
TMP_WORKING_DIRECTORY = os.path.join(TMP_DIRECTORY, 'workingdir')
TMP_AIP_DIRECTORY = os.path.join(TMP_DIRECTORY, 'aipdir')
# Extra file that is added to the working directory to provoke a second version
ADDITIONAL_FILE = 'additionalfile.txt'
ADDITIONAL_FILE_PATH = os.path.join(OCFL_STORAGE_DIR, ADDITIONAL_FILE)

class TestOcflStorage(unittest.TestCase):
    """Test storage functions.

    The test methods build on each other (first version, add a file, second
    version) and rely on unittest running them in alphabetical order of their
    names, hence the numeric prefixes.
    """

    # Identifier and package name shared by all versions written in this test
    OBJECT_IDENTIFIER = "urn:uuid:d695500b-0209-4c06-bea6-8fdd52c6db22"
    PACKAGE_NAME = "example.aip"

    @classmethod
    def setUpClass(cls):
        """Create the temporary directories and copy the example working dir into place."""
        try:
            # Ensure the temporary directories exist
            for directory in (TMP_DIRECTORY, TMP_WORKING_DIRECTORY, TMP_AIP_DIRECTORY):
                os.makedirs(directory, exist_ok=True)

            # Copy the example content into the temporary working directory
            for item in os.listdir(EXAMPLE_WORKING_DIR):
                source_path = os.path.join(EXAMPLE_WORKING_DIR, item)
                destination_path = os.path.join(TMP_WORKING_DIRECTORY, item)

                if os.path.isdir(source_path):
                    print(f"Copy directory from '{source_path}' to '{destination_path}'.")
                    shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
                else:
                    print(f"Copy file from '{source_path}' to '{destination_path}'.")
                    shutil.copy2(source_path, destination_path)

            # Log success
            print(f"Copied contents from '{EXAMPLE_WORKING_DIR}' to '{TMP_WORKING_DIRECTORY}'.")

        except Exception as e:
            raise RuntimeError("Error setting up test") from e

    @classmethod
    def tearDownClass(cls):
        """Remove every temporary directory created for this test class."""
        # The working and AIP directories are removed explicitly so nothing is
        # left behind even if they are not located inside TMP_DIRECTORY;
        # ignore_errors covers directories that are already gone.
        for directory in (TMP_WORKING_DIRECTORY, TMP_AIP_DIRECTORY, TMP_DIRECTORY):
            shutil.rmtree(directory, ignore_errors=True)

    def _store_version(self, previous_version, new_version):
        """Copy differences into a new version directory, update the inventory and return its path."""
        data_dir = os.path.join(TMP_AIP_DIRECTORY, "data")
        new_version_target_dir = os.path.join(data_dir, new_version)
        os.makedirs(new_version_target_dir, exist_ok=True)
        inventory_path = os.path.join(data_dir, "inventory.json")
        excludes = [f"{self.PACKAGE_NAME}.tar", f"{self.PACKAGE_NAME}.xml"]
        changed_files, deleted_files = update_storage_with_differences(
            TMP_WORKING_DIRECTORY, new_version_target_dir, previous_version, inventory_path,
            exclude_files=excludes
        )
        # Update inventory
        print(f"Updating OCFL inventory for version {new_version}")
        write_inventory_from_directory(
            identifier=self.OBJECT_IDENTIFIER,
            version=new_version,
            data_dir=data_dir,
            action="ingest",
            metadata={"added": changed_files, "removed": deleted_files},
        )
        return inventory_path

    def test_1_tmp_working_dir_file_access(self):
        """Test file access in temporary package"""
        self.assertTrue(os.path.exists(os.path.join(TMP_WORKING_DIRECTORY, "METS.xml")))

    def test_2_store_first_version(self):
        """Test write storage"""
        inventory_path = self._store_version(previous_version="v00000", new_version="v00001")

        # The inventory must exist and describe the version just written
        self.assertTrue(os.path.exists(inventory_path))
        with open(inventory_path, "r", encoding="utf-8") as f:
            inventory = json.load(f)
        self.assertEqual("v00001", inventory.get("head"))
        self.assertIn("v00001", inventory.get("versions", {}))

    def test_3_additional_file_to_working_dir(self):
        """Test adding additional file to temporary working dir"""
        # make sure the additional file exists
        self.assertTrue(os.path.exists(ADDITIONAL_FILE_PATH))
        additional_file_target_dir = os.path.join(
            TMP_WORKING_DIRECTORY, "representations/a87c22b3-af5a-4c3c-8463-4d6eab95439c/data/"
        )
        shutil.copy2(ADDITIONAL_FILE_PATH, additional_file_target_dir)
        # make sure the additional file was added to the working directory
        self.assertTrue(os.path.exists(os.path.join(additional_file_target_dir, ADDITIONAL_FILE)))

    def test_4_additional_file_to_working_dir(self):
        """Test write new version storage"""
        inventory_path = self._store_version(previous_version="v00001", new_version="v00002")

        # Test if the specified file is present under 'versions/v00002/added'
        with open(inventory_path, "r", encoding="utf-8") as f:
            inventory = json.load(f)

        # Check if the specific entry is in the 'added' list under 'v00002'
        added_files = inventory.get("versions", {}).get("v00002", {}).get("added", [])
        self.assertIn(
            "representations/a87c22b3-af5a-4c3c-8463-4d6eab95439c/data/additionalfile.txt",
            added_files
        )



# Run the test cases of this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()
1 change: 1 addition & 0 deletions tests/test_resources/ocfl-storage/additionalfile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is an additional file
Loading

0 comments on commit f52e2f2

Please sign in to comment.