From df57aa3dd146be1b1d1bccf1b50e2c7a3c310b58 Mon Sep 17 00:00:00 2001 From: Maaike Date: Thu, 9 Jan 2025 15:43:00 +0100 Subject: [PATCH] Updated storage management (#829) * remove archive, separate api and data retention * update docs * commit cron update * commit test.env updte * commit example update * Update data-retention.rst * Update env.py --------- Co-authored-by: Tom Kralidis --- docs/source/reference/configuration.rst | 2 +- .../reference/running/data-retention.rst | 25 +++--- tests/test.env | 2 +- wis2box-create-config.py | 2 +- wis2box-management/docker/wis2box.cron | 4 +- wis2box-management/wis2box/api/__init__.py | 28 +++++- wis2box-management/wis2box/data/__init__.py | 86 ++++++------------- wis2box-management/wis2box/env.py | 7 +- .../wis2box/pubsub/subscribe.py | 6 +- wis2box.env.example | 2 +- 10 files changed, 81 insertions(+), 83 deletions(-) diff --git a/docs/source/reference/configuration.rst b/docs/source/reference/configuration.rst index c0d1c1f8..c384fbbb 100644 --- a/docs/source/reference/configuration.rst +++ b/docs/source/reference/configuration.rst @@ -70,8 +70,8 @@ The following environment variables can be used to configure `WIS2BOX_STORAGE`. 
WIS2BOX_STORAGE_PASSWORD=minio123 # password for the storage-layer WIS2BOX_STORAGE_INCOMING=wis2box-incoming # name of the storage-bucket/folder for incoming files WIS2BOX_STORAGE_PUBLIC=wis2box-public # name of the storage-bucket/folder for public files - WIS2BOX_STORAGE_ARCHIVE=wis2box-archive # name of the storage-bucket/folder for archived data WIS2BOX_STORAGE_DATA_RETENTION_DAYS=7 # number of days to keep files in incoming and public + WIS2BOX_STORAGE_API_RETENTION_DAYS=7 # number of days to keep records in API backend MinIO diff --git a/docs/source/reference/running/data-retention.rst b/docs/source/reference/running/data-retention.rst index 6e01a93e..d673a091 100644 --- a/docs/source/reference/running/data-retention.rst +++ b/docs/source/reference/running/data-retention.rst @@ -5,12 +5,15 @@ Data retention ============== wis2box is configured to set data retention according to your requirements. Data retention is managed -via the ``WIS2BOX_STORAGE_DATA_RETENTION_DAYS`` environment variable as part of configuring wis2box. +via the ``WIS2BOX_STORAGE_DATA_RETENTION_DAYS`` and ``WIS2BOX_STORAGE_API_RETENTION_DAYS`` environment variables as part of configuring wis2box. -Cleaning --------- +Once a day, wis2box runs the commands ``wis2box data clean`` (at 0Z) and ``wis2box api clean`` (at 1Z) to remove data older than the specified retention period +(cronjob defined in ``wis2box-management/docker/wis2box.cron``). -Cleaning applies to storage defined by ``WIS2BOX_STORAGE_PUBLIC`` and involves the deletion of files after set amount of time. +Cleaning (storage) +------------------ + +Cleaning applies to storage defined by ``WIS2BOX_STORAGE_PUBLIC`` and ``WIS2BOX_STORAGE_INCOMING`` and involves the deletion of files after a set amount of time. 
Cleaning is performed by default daily at 0Z by the system, and can also be run interactively with: @@ -24,15 +27,17 @@ Cleaning is performed by default daily at 0Z by the system, and can also be run wis2box data clean --days=30 -Archiving ---------- +Cleaning (API) +-------------- -Archiving applies to storage defined by ``WIS2BOX_STORAGE_INCOMING`` and involves moving files to the storage defined by ``WIS2BOX_STORAGE_ARCHIVE``. +Cleaning applies to data in the API backend and involves the deletion of records after a set amount of time. -Archive is performed on incoming data by default daily at 1Z by the system, and can also be run interactively with: +Cleaning is performed by default daily at 1Z by the system, and can also be run interactively with: .. code-block:: bash - wis2box data archive + # delete data older than WIS2BOX_STORAGE_API_RETENTION_DAYS by default + wis2box api clean -Only files with a timestamp older than one hour are considered for archiving. + # delete data older than --days (force override) + wis2box api clean --days=30 diff --git a/tests/test.env b/tests/test.env index d1807fa7..b7d8dbc3 100644 --- a/tests/test.env +++ b/tests/test.env @@ -31,9 +31,9 @@ WIS2BOX_BASEMAP_ATTRIBUTION=OpenStreetMap None: fh.write('WIS2BOX_STORAGE_TYPE=S3\n') fh.write('WIS2BOX_STORAGE_SOURCE=http://minio:9000\n') fh.write('WIS2BOX_STORAGE_INCOMING=wis2box-incoming\n') - fh.write('WIS2BOX_STORAGE_ARCHIVE=wis2box-archive\n') fh.write('WIS2BOX_STORAGE_PUBLIC=wis2box-public\n') fh.write('WIS2BOX_STORAGE_DATA_RETENTION_DAYS=30\n') + fh.write('WIS2BOX_STORAGE_API_RETENTION_DAYS=100\n') # use the default username wis2box for WIS2BOX_STORAGE_USERNAME fh.write('WIS2BOX_STORAGE_USERNAME=wis2box\n') # get password for WIS2BOX_STORAGE_PASSWORD and write it to wis2box.env diff --git a/wis2box-management/docker/wis2box.cron b/wis2box-management/docker/wis2box.cron index 393f1d74..919aec29 100644 --- a/wis2box-management/docker/wis2box.cron +++ 
b/wis2box-management/docker/wis2box.cron @@ -1,4 +1,4 @@ -0 0 * * * su wis2box -c "wis2box data clean --days=$WIS2BOX_STORAGE_DATA_RETENTION_DAYS" > /proc/1/fd/1 2>/proc/1/fd/2 -0 1 * * * su wis2box -c "wis2box data archive" > /proc/1/fd/1 2>/proc/1/fd/2 +0 0 * * * su wis2box -c "wis2box data clean" > /proc/1/fd/1 2>/proc/1/fd/2 +0 1 * * * su wis2box -c "wis2box api clean" > /proc/1/fd/1 2>/proc/1/fd/2 0 15 * * * su wis2box -c "wis2box metadata discovery republish" > /proc/1/fd/1 2>/proc/1/fd/2 */10 * * * * su wis2box -c "echo 'wis2box.cron is alive'" > /proc/1/fd/1 2>/proc/1/fd/2 diff --git a/wis2box-management/wis2box/api/__init__.py b/wis2box-management/wis2box/api/__init__.py index 51cf51e2..30fc8fa9 100644 --- a/wis2box-management/wis2box/api/__init__.py +++ b/wis2box-management/wis2box/api/__init__.py @@ -32,7 +32,8 @@ from wis2box.api.config import load_config from wis2box.data_mappings import get_plugins -from wis2box.env import (DOCKER_API_URL, API_URL) +from wis2box.env import (DOCKER_API_URL, API_URL, STORAGE_API_RETENTION_DAYS, + STORAGE_DATA_RETENTION_DAYS) LOGGER = logging.getLogger(__name__) @@ -302,6 +303,31 @@ def delete_collection(ctx, collection, verbosity): click.echo('Collection deleted') +@click.command() +@click.pass_context +@click.option('--days', '-d', help='Number of days of data to keep in API-backend', type=int) # noqa +@cli_helpers.OPTION_VERBOSITY +def clean(ctx, days, verbosity): + """Clean data from API backend older than X days""" + + if days is not None: + click.echo(f'Using days={days}') + days_ = days + elif STORAGE_API_RETENTION_DAYS is not None: + click.echo(f'Using STORAGE_API_RETENTION_DAYS={STORAGE_API_RETENTION_DAYS}') # noqa + days_ = STORAGE_API_RETENTION_DAYS + else: + click.echo(f'Using STORAGE_DATA_RETENTION_DAYS={STORAGE_DATA_RETENTION_DAYS}') # noqa + days_ = STORAGE_DATA_RETENTION_DAYS + + if days_ is None or days_ < 0: + click.echo('No api data retention set. 
Skipping') + else: + LOGGER.debug('Cleaning API data backend') + delete_collections_by_retention(days_) + + api.add_command(setup) api.add_command(add_collection) api.add_command(delete_collection) +api.add_command(clean) diff --git a/wis2box-management/wis2box/data/__init__.py b/wis2box-management/wis2box/data/__init__.py index dec5a7d3..f212ba89 100644 --- a/wis2box-management/wis2box/data/__init__.py +++ b/wis2box-management/wis2box/data/__init__.py @@ -19,53 +19,26 @@ # ############################################################################### -from datetime import datetime, timedelta, timezone import logging +from datetime import datetime, timedelta, timezone from typing import Union import click from wis2box import cli_helpers from wis2box.api import (setup_collection, remove_collection, - delete_collections_by_retention, reindex_collection) from wis2box.data_mappings import get_data_mappings -from wis2box.env import (STORAGE_SOURCE, STORAGE_ARCHIVE, STORAGE_PUBLIC, - STORAGE_DATA_RETENTION_DAYS, STORAGE_INCOMING) +from wis2box.env import (STORAGE_SOURCE, STORAGE_PUBLIC, STORAGE_INCOMING, + STORAGE_DATA_RETENTION_DAYS) from wis2box.handler import Handler from wis2box.metadata.discovery import DiscoveryMetadata -from wis2box.storage import put_data, move_data, list_content, delete_data -from wis2box.util import older_than, walk_path +from wis2box.storage import put_data, list_content, delete_data +from wis2box.util import walk_path LOGGER = logging.getLogger(__name__) -def archive_data(source_path: str, archive_path: str) -> None: - """ - Archive data based on today's date (YYYY-MM-DD) - - :param source_path: `str` of base storage-path for source - :param arcive_path: `str` of base storage-path for archive - - :returns: `None` - """ - - today_dir = f"{archive_path}/{datetime.now().date().strftime('%Y-%m-%d')}" - LOGGER.debug(f'Archive directory={today_dir}') - datetime_now = datetime.now(timezone.utc) - LOGGER.debug(f'datetime_now={datetime_now}') - for 
obj in list_content(source_path): - storage_path = obj['fullpath'] - archive_path = f"{today_dir}/{obj['filename']}" - LOGGER.debug(f"filename={obj['filename']}") - LOGGER.debug(f"last_modified={obj['last_modified']}") - if obj['last_modified'] < datetime_now - timedelta(hours=1): - LOGGER.debug(f'Moving {storage_path} to {archive_path}') - move_data(storage_path, archive_path) - else: - LOGGER.debug(f"{storage_path} created less than 1 h ago, skip") - - def clean_data(source_path: str, days: int) -> None: """ Remove data older than n days from source_path and API indexes') @@ -76,21 +49,24 @@ def clean_data(source_path: str, days: int) -> None: :returns: `None` """ - LOGGER.debug(f'Clean files in {source_path} older than {days} day(s)') + before = datetime.now(timezone.utc) - timedelta(days=days) + LOGGER.info(f'Deleting data older than {before} from {source_path}') + nfiles_deleted = 0 for obj in list_content(source_path): if obj['basedir'] == 'metadata': LOGGER.debug('Skipping metadata') continue + # don't delete files in the base-directory + if obj['basedir'] == '' or obj['basedir'] == obj['filename']: + continue storage_path = obj['fullpath'] - if older_than(obj['basedir'], days): - LOGGER.debug(f"{obj['basedir']} is older than {days} days") - LOGGER.debug(f'Deleting {storage_path}') + LOGGER.debug(f"filename={obj['filename']}") + LOGGER.debug(f"last_modified={obj['last_modified']}") + if obj['last_modified'] < before: + LOGGER.debug(f"Deleting {storage_path}") delete_data(storage_path) - else: - LOGGER.debug(f"{obj['basedir']} less than {days} days old") - - LOGGER.debug('Cleaning API indexes') - delete_collections_by_retention(days) + nfiles_deleted += 1 + LOGGER.info(f'Deleted {nfiles_deleted} files from {source_path}') def gcm(mcf: Union[dict, str]) -> dict: @@ -157,37 +133,30 @@ def data(): pass -@click.command() -@click.pass_context -@cli_helpers.OPTION_VERBOSITY -def archive(ctx, verbosity): - """Move data from incoming storage to archive storage""" - - 
source_path = f'{STORAGE_SOURCE}/{STORAGE_INCOMING}' - archive_path = f'{STORAGE_SOURCE}/{STORAGE_ARCHIVE}' - - click.echo(f'Archiving data from {source_path} to {archive_path}') - archive_data(source_path, archive_path) - - @click.command() @click.pass_context @click.option('--days', '-d', help='Number of days of data to keep', type=int) @cli_helpers.OPTION_VERBOSITY def clean(ctx, days, verbosity): - """Clean data directories and API indexes""" + """Clean data from storage older than X days""" if days is not None: + click.echo(f'Using data retention days: {days}') days_ = days else: + click.echo(f'Using default data retention days: {STORAGE_DATA_RETENTION_DAYS}') # noqa days_ = STORAGE_DATA_RETENTION_DAYS if days_ is None or days_ < 0: click.echo('No data retention set. Skipping') else: - storage_path = f'{STORAGE_SOURCE}/{STORAGE_PUBLIC}' - click.echo(f'Deleting data > {days_} day(s) old from {storage_path}') - clean_data(storage_path, days_) + storage_path_public = f'{STORAGE_SOURCE}/{STORAGE_PUBLIC}' + click.echo(f'Deleting data > {days_} day(s) old from {storage_path_public}') # noqa + clean_data(storage_path_public, days_) + storage_path_incoming = f'{STORAGE_SOURCE}/{STORAGE_INCOMING}' + click.echo(f'Deleting data > {days_} day(s) old from {storage_path_incoming}') # noqa + clean_data(storage_path_incoming, days_) + click.echo('Done') @click.command() @@ -286,7 +255,6 @@ def add_collection_items(ctx, topic_hierarchy, path, recursive, verbosity): click.echo('Done') -data.add_command(archive) data.add_command(clean) data.add_command(ingest) data.add_command(add_collection) diff --git a/wis2box-management/wis2box/env.py b/wis2box-management/wis2box/env.py index 4132cc7f..49fd40a8 100644 --- a/wis2box-management/wis2box/env.py +++ b/wis2box-management/wis2box/env.py @@ -59,7 +59,6 @@ STORAGE_USERNAME = os.environ.get('WIS2BOX_STORAGE_USERNAME', 'wis2box') STORAGE_PASSWORD = os.environ.get('WIS2BOX_STORAGE_PASSWORD', 'minio123') STORAGE_INCOMING = 
os.environ.get('WIS2BOX_STORAGE_INCOMING', 'wis2box-incoming') # noqa -STORAGE_ARCHIVE = os.environ.get('WIS2BOX_STORAGE_ARCHIVE', 'wis2box-archive') STORAGE_PUBLIC = os.environ.get('WIS2BOX_STORAGE_PUBLIC', 'wis2box-public') try: @@ -67,6 +66,11 @@ except TypeError: STORAGE_DATA_RETENTION_DAYS = None +try: + STORAGE_API_RETENTION_DAYS = int(os.environ.get('WIS2BOX_STORAGE_API_RETENTION_DAYS', 100)) # noqa +except (TypeError, ValueError):  # blank or non-integer value: no retention +    STORAGE_API_RETENTION_DAYS = None + LOGLEVEL = os.environ.get('WIS2BOX_LOGGING_LOGLEVEL', 'ERROR') LOGFILE = os.environ.get('WIS2BOX_LOGGING_LOGFILE', 'stdout') @@ -159,7 +163,6 @@ def create(ctx, verbosity): storages = { STORAGE_INCOMING: 'private', - STORAGE_ARCHIVE: 'private', STORAGE_PUBLIC: 'readonly' } for key, value in storages.items(): diff --git a/wis2box-management/wis2box/pubsub/subscribe.py b/wis2box-management/wis2box/pubsub/subscribe.py index cbbc4048..a6e2a47b 100644 --- a/wis2box-management/wis2box/pubsub/subscribe.py +++ b/wis2box-management/wis2box/pubsub/subscribe.py @@ -38,8 +38,7 @@ from wis2box.data.message import MessageData from wis2box.env import (DATADIR, DOCKER_BROKER, - STORAGE_SOURCE, STORAGE_ARCHIVE, - STORAGE_INCOMING) + STORAGE_SOURCE, STORAGE_INCOMING) from wis2box.handler import Handler, NotHandledError import wis2box.metadata.discovery as discovery_metadata from wis2box.plugin import load_plugin, PLUGINS @@ -156,9 +155,6 @@ def on_message_handler(self, client, userdata, msg): LOGGER.info(f'Do not process directories: {key}') return filepath = f'{STORAGE_SOURCE}/{key}' - if key.startswith(STORAGE_ARCHIVE): - LOGGER.info(f'Do not process archived-data: {key}') - return # start a new process to handle the received data while len(mp.active_children()) == mp.cpu_count(): sleep(0.05) diff --git a/wis2box.env.example b/wis2box.env.example index cbfdb6f9..be9614db 100644 --- a/wis2box.env.example +++ b/wis2box.env.example @@ -43,8 +43,8 @@ WIS2BOX_STORAGE_USERNAME=minio WIS2BOX_STORAGE_PASSWORD=minio123 
WIS2BOX_STORAGE_INCOMING=wis2box-incoming WIS2BOX_STORAGE_PUBLIC=wis2box-public -WIS2BOX_STORAGE_ARCHIVE=wis2box-archive WIS2BOX_STORAGE_DATA_RETENTION_DAYS=7 +WIS2BOX_STORAGE_API_RETENTION_DAYS=7 # you should be okay from here