Commit
Simplify ocfl storage test resources and improve implementation
shsdev committed Nov 26, 2024
1 parent f28ac30 commit 9635541
Showing 24 changed files with 220 additions and 9,554 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,15 @@
# Changelog

## [0.2.7] - 2024-11-25
### Changed
- **Pairtree** as directory-based storage with differential versioning.

## [0.2.6] - 2024-11-24
### Changed
- **Identifier mapping** based on Pairtree identifier string cleaning.

## [0.2.4] - 2024-06-01
### Changed
- **Identifier encoding/decoding**.
- **Updated** identifier mapping functions.
- **Adapted** safe filename functions.
187 changes: 170 additions & 17 deletions eatb/storage.py
@@ -4,11 +4,39 @@
import os
import shutil
import hashlib
from typing import List, Tuple, Dict
from collections import defaultdict
from datetime import datetime
from subprocess import check_output

from eatb.cli import CliCommand, CliCommands
from eatb import VersionDirFormat


def get_previous_version_series(current_version: str, version_format: str = VersionDirFormat) -> list:
"""
Generate a list of previous version strings up to (excluding) the given current version.
Parameters:
current_version (str): The current version string, e.g., "v00002".
version_format (str): The format string for the version, e.g., 'v%05d'.
Returns:
list: A list of previous version strings in the series.
Example usage:
print(get_previous_version_series("v00001")) # Output: []
print(get_previous_version_series("v00002")) # Output: ['v00001']
print(get_previous_version_series("v00005")) # Output: ['v00001', 'v00002', 'v00003', 'v00004']
"""
# Extract the numeric part of the current version
try:
current_number = int(current_version[1:])
except ValueError as e:
raise ValueError("Invalid version format. Must match the version_format, e.g., 'v%05d'.") from e

# Generate all previous versions
return [version_format % i for i in range(1, current_number)]
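
For context, this new helper produces the `previous_versions` list that `update_storage_with_differences` (further down in this file) now expects in place of a single previous version directory. A minimal, hypothetical call, assuming the default `VersionDirFormat` of `'v%05d'`:

    from eatb.storage import get_previous_version_series

    previous_versions = get_previous_version_series("v00003")
    # -> ['v00001', 'v00002'], ready to be passed as the previous_versions
    #    argument of update_storage_with_differences()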


def get_hashed_filelist(strip_path_part, directory, commands=CliCommands()):
@@ -72,8 +100,66 @@ def get_sha512_hash(file_path):
return h.hexdigest()


def write_inventory_from_directory(identifier, version, data_dir, action, metadata=None):
"""Write inventory"""
def write_inventory_from_directory(
identifier: str,
version: str,
data_dir: str,
action: str,
metadata: Dict = None
) -> bool:
"""
Generates or updates an inventory based on the contents of a specified directory.
Parameters:
identifier (str):
A unique identifier for the inventory or the dataset being processed.
This could be a package ID, dataset name, or other unique identifier.
version (str):
The version label for the inventory. This allows tracking changes
across different versions of the data.
data_dir (str):
The path to the directory containing the files to be inventoried.
This directory is scanned recursively to collect file information.
action (str):
Specifies the action to be performed. Examples might include:
- `"create"`: Create a new inventory for the given directory.
- `"update"`: Update an existing inventory with the latest changes.
- `"validate"`: Validate the inventory against the current directory state.
metadata (Dict, optional):
A dictionary containing additional metadata to be associated with the inventory.
Examples include:
- Creation date
- Author information
- Custom tags
Defaults to `None`, meaning no additional metadata is included.
Returns:
bool:
`True` if the inventory was successfully written or updated,
`False` if the operation failed.
Raises:
ValueError: If invalid parameters (e.g., unsupported `action`) are provided.
FileNotFoundError: If the specified `data_dir` does not exist.
IOError: If there is an issue writing the inventory file.
Example:
success = write_inventory_from_directory(
identifier="dataset123",
version="v1.0",
data_dir="/path/to/data",
action="create",
metadata={"author": "Name of Author", "created_at": "2024-11-26"}
)
if success:
print("Inventory successfully written.")
else:
print("Failed to write inventory.")
"""
if not os.path.exists(data_dir):
raise ValueError(f"Data directory does not exist: {data_dir}")

@@ -109,22 +195,35 @@ def write_inventory_from_directory(identifier, version, data_dir, action, metada
"removed": metadata.get("removed", []),
}

# Collect existing files from all previous versions
existing_files = {}
for prev_version, prev_data in inventory["versions"].items():
for hash_val, paths in prev_data["state"].items():
for path in paths:
existing_files[f"{prev_version}/{path}"] = hash_val

# Process files in the version directory
version_dir = os.path.join(data_dir, version)
for subdir, _, files in os.walk(version_dir):
for file in files:
file_path = os.path.join(subdir, file)
# Compute relative path directly from the version directory
relative_path = os.path.relpath(file_path, version_dir)
full_ocfl_path = f"{version}/{relative_path}" # Full path in the current version
hashes = compute_file_hashes(file_path)

if any(
existing_path.endswith(relative_path) and existing_files[existing_path] == hashes["sha512"]
for existing_path in existing_files
):
print(f"Skipping {file_path} as it already exists")
continue

for algo, hash_val in hashes.items():
print(f"Adding: {algo}, {hash_val}, {relative_path}")
inventory["fixity"][algo][hash_val].append(f"{version}/{relative_path}")
inventory["fixity"][algo][hash_val].append(full_ocfl_path)

# Ensure the relative_path is added only once to manifest
if f"{version}/{relative_path}" not in inventory["manifest"][hashes["sha512"]]:
inventory["manifest"][hashes["sha512"]].append(f"{version}/{relative_path}")
if full_ocfl_path not in inventory["manifest"][hashes["sha512"]]:
inventory["manifest"][hashes["sha512"]].append(full_ocfl_path)

# Ensure the relative_path is added only once to state
if relative_path not in version_entry["state"][hashes["sha512"]]:
@@ -145,31 +244,78 @@ def update_storage_with_differences(working_dir, new_version_target_dir, previou
with open(os.path.join(data_dir, "inventory.json.sha512"), "w", encoding="utf-8") as hash_file:
hash_file.write(f"{inventory_hash} inventory.json")

return os.path.exists(inventory_path) and os.path.exists(os.path.join(data_dir, "inventory.json.sha512"))
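
For orientation, the inventory this function maintains has roughly the shape sketched below. The sketch is inferred solely from the fields the code above reads and writes (per-version state, added and removed, plus manifest and fixity keyed by checksum); the digests are placeholders, the file names are borrowed from the tests further down, and any OCFL inventory fields not touched in this diff are left out:

    inventory = {
        "versions": {
            "v00001": {
                "state": {"<sha512-of-firstfile>": ["firstfile.txt"]},
                "added": ["firstfile.txt"],
                "removed": [],
            },
            "v00002": {
                "state": {"<sha512-of-additionalfile>": ["additionalfile.txt"]},
                "added": ["additionalfile.txt"],
                "removed": [],
            },
        },
        "manifest": {
            "<sha512-of-firstfile>": ["v00001/firstfile.txt"],
            "<sha512-of-additionalfile>": ["v00002/additionalfile.txt"],
        },
        "fixity": {
            "sha512": {
                "<sha512-of-firstfile>": ["v00001/firstfile.txt"],
                "<sha512-of-additionalfile>": ["v00002/additionalfile.txt"],
            },
        },
    }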


def update_storage_with_differences(working_dir, new_version_target_dir, previous_version_existing_dir, inventory_path, exclude_files=None):
def update_storage_with_differences(
working_dir: str, new_version_target_dir: str, previous_versions: List[str],
inventory_path: str, exclude_files: List[str] = None
) -> Tuple[List[str], List[str]]:
"""
Copies only new or modified files to the storage directory and identifies deleted files.
Parameters:
working_dir (str):
The directory containing the current version of the files that need to be compared
with previous versions. This is the source directory for the operation.
new_version_target_dir (str):
The target directory where new or modified files will be copied.
This represents the storage location for the updated version.
previous_versions (List[str]):
A list of version identifiers (version tags, e.g., v00001) that represent
the prior states of the files. These versions are used to determine whether
files have changed, been added, or deleted.
inventory_path (str):
The path to the inventory file (in JSON format), which contains metadata about
the files in all previous versions. This file is used to track file states across
versions (e.g., hashes and paths).
exclude_files (List[str], optional):
A list of filenames to be excluded from the comparison and copying process.
These files will be ignored even if they differ between versions.
Defaults to `None`, meaning no files are excluded.
Returns:
tuple: (list of added/changed files, list of deleted files)
Tuple[List[str], List[str]]:
A tuple containing:
- `added_or_changed` (List[str]): A list of relative paths for files that were
either added or modified in the `working_dir` compared to the `previous_versions`.
- `deleted_files` (List[str]): A list of relative paths for files that were
present in the `previous_versions` but are no longer in the `working_dir`.
Example:
added, deleted = update_storage_with_differences(
working_dir="/path/to/current",
new_version_target_dir="/path/to/storage",
previous_versions=["v00001", "v00002"],
inventory_path="/path/to/inventory.json",
exclude_files=["tempfile.txt", "debug.log"]
)
"""
added_or_changed = []
deleted_files = []
previous_files = {}

assert not exclude_files or type(exclude_files) is list, \
assert isinstance(previous_versions, list), \
"param 'previous_versions' must be of type list"

assert not exclude_files or isinstance(exclude_files, list), \
"param 'exclude_files' must be of type list"

# Load inventory to get the state of the previous version
# Load inventory to get the state of all previous versions
if os.path.exists(inventory_path):
with open(inventory_path, "r", encoding="utf-8") as f:
inventory = json.load(f)
previous_version_state = inventory["versions"].get(previous_version_existing_dir, {}).get("state", {})
for hash_val, paths in previous_version_state.items():
for path in paths:
previous_files[path] = hash_val
for prev_version in previous_versions:
version_state = inventory["versions"].get(prev_version, {}).get("state", {})
for hash_val, paths in version_state.items():
for path in paths:
previous_files[path] = hash_val

# Compare files in the working directory with the previous version
# Compare files in the working directory with all previous versions
for subdir, _, files in os.walk(working_dir):
for file in files:
source_file = os.path.join(subdir, file)
@@ -179,9 +325,16 @@ def update_storage_with_differences(working_dir, new_version_target_dir, previou

# Compute hash for the current file
current_hash = compute_file_hashes(source_file)["sha512"]
# Files that already exist in a previous version with the same hash are not copied
if any(
existing_path.endswith(relative_path) and previous_files[existing_path] == current_hash
for existing_path in previous_files
):
print(f"Skipping {source_file} as it already exists in previous versions")
continue

# Check if the file is new or has changed
if (relative_path not in previous_files or previous_files[relative_path] != current_hash) and file not in exclude_files:
if (relative_path not in previous_files or previous_files[relative_path] != current_hash) and file not in (exclude_files or []):
# Check if the file already exists with the same content
if not os.path.exists(target_file) or compute_file_hashes(target_file)["sha512"] != current_hash:
shutil.copy2(source_file, target_file)
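
Both functions above rely on compute_file_hashes, whose definition sits outside the hunks shown here; the calls only assume it returns a dict of hex digests keyed by algorithm name and containing at least "sha512". A compatible sketch of such a helper (the algorithm set is an assumption, not necessarily what eatb computes):

    import hashlib

    def compute_file_hashes(file_path, algorithms=("md5", "sha512"), chunk_size=65536):
        """Return {algorithm: hex digest} for file_path. Illustrative sketch only."""
        hashers = {algo: hashlib.new(algo) for algo in algorithms}
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                for hasher in hashers.values():
                    hasher.update(chunk)
        return {algo: hasher.hexdigest() for algo, hasher in hashers.items()}
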
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="eatb",
version="0.2.6",
version="0.2.7",
author="E-ARK Foundation",
author_email="admin@e-ark-foundation.eu",
license='MIT',
52 changes: 34 additions & 18 deletions tests/test_ocfl_storage.py
@@ -10,13 +10,17 @@
from eatb.storage import update_storage_with_differences
from eatb.storage import write_inventory_from_directory

OCFL_STORAGE_DIR = os.path.join(ROOT, 'tests/test_resources/ocfl-storage/')
EXAMPLE_WORKING_DIR = os.path.join(OCFL_STORAGE_DIR, 'working-dir')
TMP_DIRECTORY = '/tmp/temp-' + randomutils.randomword(10)
OCFL_TEST_RESOURCES = os.path.join(ROOT, 'tests/test_resources/ocfl-storage/')
EXAMPLE_WORKING_DIR = os.path.join(OCFL_TEST_RESOURCES, 'working-dir')
TMP_DIRECTORY = '/tmp/temp-' + randomutils.randomword(10) + "/"
TMP_WORKING_DIRECTORY = TMP_DIRECTORY + 'workingdir'
TMP_AIP_DIRECTORY = TMP_DIRECTORY + 'aipdir'
AIP_DATA_DIR = os.path.join(TMP_AIP_DIRECTORY, "data")
FIRST_FILE = 'firstfile.txt'
FIRST_FILE_PATH = os.path.join(OCFL_TEST_RESOURCES, 'workingdir', FIRST_FILE)
ADDITIONAL_FILE = 'additionalfile.txt'
ADDITIONAL_FILE_PATH = os.path.join(OCFL_STORAGE_DIR, ADDITIONAL_FILE)
ADDITIONAL_FILE_PATH = os.path.join(OCFL_TEST_RESOURCES, ADDITIONAL_FILE)

class TestOcflStorage(unittest.TestCase):
"""Test storage functions"""
@@ -51,11 +55,12 @@ def setUpClass(cls):

@classmethod
def tearDownClass(cls):
shutil.rmtree(TMP_DIRECTORY)

def test_1_tmp_working_dir_file_access(self):
"""Test file access in temporary package"""
self.assertTrue(os.path.exists(os.path.join(TMP_WORKING_DIRECTORY, "METS.xml")))
self.assertTrue(os.path.exists(os.path.join(TMP_WORKING_DIRECTORY, "firstfile.txt")))

def test_2_store_first_version(self):
"""Test write storage"""
@@ -69,8 +74,9 @@ def test_2_store_first_version(self):
os.makedirs(new_version_target_dir, exist_ok=True)
inventory_path = os.path.join(data_dir, "inventory.json")
excludes = [f"{package_name}.tar", f"{package_name}.xml"]
previous_versions = [previous_version]
changed_files, deleted_files = update_storage_with_differences(
working_dir, new_version_target_dir, previous_version, inventory_path, exclude_files=excludes
working_dir, new_version_target_dir, previous_versions, inventory_path, exclude_files=excludes
)
# Update inventory
print(f"Updating OCFL inventory for version {new_version}")
@@ -81,39 +87,49 @@ def test_2_store_first_version(self):
action="ingest",
metadata={"added": changed_files, "removed": deleted_files},
)
self.assertFalse(os.path.exists(os.path.join(TMP_WORKING_DIRECTORY, ADDITIONAL_FILE)))

def test_3_additional_file_to_working_dir(self):
"""Test adding additional file to temporary working dir"""
# make sure the additional file exists
self.assertTrue(os.path.exists(ADDITIONAL_FILE_PATH))
additional_file_target_dir = os.path.join(TMP_WORKING_DIRECTORY, "representations/a87c22b3-af5a-4c3c-8463-4d6eab95439c/data/")
shutil.copy2(ADDITIONAL_FILE_PATH, additional_file_target_dir)
shutil.copy2(ADDITIONAL_FILE_PATH, TMP_WORKING_DIRECTORY)
# make sure the additional file was added to the working directory
self.assertTrue(os.path.exists(os.path.join(additional_file_target_dir, ADDITIONAL_FILE)))

self.assertTrue(os.path.exists(os.path.join(TMP_WORKING_DIRECTORY, ADDITIONAL_FILE)))

def test_4_additional_file_to_working_dir(self):
"""Test write new version storage"""
# Define version and storage directories
package_name = "example.aip"
working_dir = TMP_WORKING_DIRECTORY
data_dir = os.path.join(TMP_AIP_DIRECTORY, "data")

previous_version = "v00001"
new_version = "v00002"
new_version_target_dir = os.path.join(data_dir, new_version)
new_version_target_dir = os.path.join(AIP_DATA_DIR, new_version)
os.makedirs(new_version_target_dir, exist_ok=True)
inventory_path = os.path.join(data_dir, "inventory.json")
inventory_path = os.path.join(AIP_DATA_DIR, "inventory.json")
excludes = [f"{package_name}.tar", f"{package_name}.xml"]
previous_versions = [previous_version]
changed_files, deleted_files = update_storage_with_differences(
working_dir, new_version_target_dir, previous_version, inventory_path, exclude_files=excludes
working_dir, new_version_target_dir, previous_versions, inventory_path, exclude_files=excludes
)

self.assertTrue(os.path.exists(os.path.join(AIP_DATA_DIR, previous_version, FIRST_FILE)))
self.assertFalse(os.path.exists(os.path.join(AIP_DATA_DIR, previous_version, ADDITIONAL_FILE)))

self.assertFalse(
expr=os.path.exists(os.path.join(AIP_DATA_DIR, new_version, FIRST_FILE)),
msg=f"First file may not be in version {new_version}"
)
self.assertTrue(os.path.exists(os.path.join(AIP_DATA_DIR, new_version, ADDITIONAL_FILE)))

# Update inventory
print(f"Updating OCFL inventory for version {new_version}")
write_inventory_from_directory(
identifier="urn:uuid:d695500b-0209-4c06-bea6-8fdd52c6db22",
version=new_version,
data_dir=data_dir,
action="ingest",
data_dir=AIP_DATA_DIR,
action="update 1",
metadata={"added": changed_files, "removed": deleted_files},
)
"""Test if the specified file is present under 'versions/v00002/added'"""
@@ -123,7 +139,7 @@ def test_4_additional_file_to_working_dir(self):

# Check if the specific entry is in the 'added' list under 'v00002'
added_files = inventory.get("versions", {}).get("v00002", {}).get("added", [])
self.assertIn("representations/a87c22b3-af5a-4c3c-8463-4d6eab95439c/data/additionalfile.txt", added_files)
self.assertIn("additionalfile.txt", added_files)


