diff --git a/geospaas_harvesting/__init__.py b/geospaas_harvesting/__init__.py index 1a0d64c..8edd594 100644 --- a/geospaas_harvesting/__init__.py +++ b/geospaas_harvesting/__init__.py @@ -1,9 +1,12 @@ """This module provides means to gather metadata about various datasets into the GeoSPaaS catalog """ + +import importlib import logging.config import os import os.path +import pkgutil import sys import yaml @@ -20,3 +23,11 @@ if logging_configuration: logging.config.dictConfig(logging_configuration) logging.captureWarnings(True) + +# import plugins +discovered_plugins = { + name: importlib.import_module(name) + for finder, name, ispkg + in pkgutil.iter_modules() + if name.startswith('geospaas_harvesting_') +} diff --git a/geospaas_harvesting/config.py b/geospaas_harvesting/config.py index e3d45aa..f7743d7 100644 --- a/geospaas_harvesting/config.py +++ b/geospaas_harvesting/config.py @@ -1,28 +1,37 @@ """Configuration management""" +import importlib import logging +import pkgutil -import geospaas_harvesting.providers.aviso as providers_aviso -import geospaas_harvesting.providers.base as providers_base -import geospaas_harvesting.providers.ceda as providers_ceda -import geospaas_harvesting.providers.cmems as providers_cmems -import geospaas_harvesting.providers.copernicus_scihub as providers_copernicus_scihub -import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr -import geospaas_harvesting.providers.erddap as providers_erddap -import geospaas_harvesting.providers.ftp as providers_ftp -import geospaas_harvesting.providers.http as providers_http -import geospaas_harvesting.providers.jaxa as providers_jaxa -import geospaas_harvesting.providers.local as providers_local -import geospaas_harvesting.providers.metno as providers_metno -import geospaas_harvesting.providers.noaa as providers_noaa -import geospaas_harvesting.providers.podaac as providers_podaac -import geospaas_harvesting.providers.resto as providers_resto +import geospaas_harvesting from .arguments import ArgumentParser, BooleanArgument, DictArgument, ListArgument +from .providers.base import Provider from .utils import read_yaml_file +def import_provider_modules(): + """Import provider classes from core modules and plugins""" + imported = [] + for base_module in [geospaas_harvesting, *geospaas_harvesting.discovered_plugins.values()]: + for _, name, ispkg in pkgutil.iter_modules(base_module.__path__): + if name == 'providers': + providers = importlib.import_module(f"{base_module.__name__}.{name}") + imported.append(providers) + if ispkg: + for _, provider_name, _ in pkgutil.iter_modules(providers.__path__): + imported.append( + importlib.import_module(f"{providers.__name__}.{provider_name}")) + return imported + + +import_provider_modules() logger = logging.getLogger(__name__) +class NoProviderFoundError(Exception): + """No provider class was found""" + + class Configuration(): """Base class for configuration objects""" @@ -57,23 +66,19 @@ class ProvidersArgument(DictArgument): 'password': 'pass123' } """ - provider_types = { - 'aviso': providers_aviso.AVISOProvider, - 'ceda': providers_ceda.CEDAProvider, - 'cmems': providers_cmems.CMEMSProvider, - 'copernicus_scihub': providers_copernicus_scihub.CopernicusScihubProvider, - 'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider, - 'ftp': providers_ftp.FTPProvider, - 'gportal_ftp': providers_jaxa.GPortalProvider, - 'http': providers_http.HTTPProvider, - 'metno': providers_metno.METNOProvider, - 'nansat': providers_local.NansatProvider, - 'netcdf': providers_local.NetCDFProvider, - 'noaa': providers_noaa.NOAAProvider, - 'podaac': providers_podaac.PODAACProvider, - 'resto': providers_resto.RestoProvider, - 'tabledap': providers_erddap.ERDDAPTableProvider, - } + provider_classes = Provider.__subclasses__() + + def __init__(self, name, **kwargs): + super().__init__(name, **kwargs) + + def _find_provider(self, provider_type): + """Try to find a provider matching the `provider_type` in the + Provider subclasses + """ + for provider_class in self.provider_classes: + if provider_class.type == provider_type: + return provider_class + raise NoProviderFoundError(f"No provider found of type {provider_type}") def parse(self, value): """Go through the list of provider settings and create the @@ -84,15 +89,16 @@ def parse(self, value): for provider_name, provider_settings in providers_dict.items(): try: _providers[provider_name] = ( - self.provider_types[provider_settings['type']]( + self._find_provider(provider_settings['type'])( name=provider_name, **provider_settings, )) except KeyError as error: logger.error('Missing setting for provider: %s', error.args[0]) + except NoProviderFoundError as error: + logger.error(error.args[0]) return _providers - class ProvidersConfiguration(Configuration): """Configuration manager for providers""" @@ -110,7 +116,7 @@ class SearchConfiguration(Configuration): def __init__(self): self.providers = None - common_argument_parser = providers_base.Provider().search_parameters_parser + common_argument_parser = Provider().search_parameters_parser self.config_arguments_parser = ArgumentParser([ DictArgument( 'common', argument_parser=common_argument_parser), diff --git a/geospaas_harvesting/providers/aviso.py b/geospaas_harvesting/providers/aviso.py index c2a6f3f..89af461 100644 --- a/geospaas_harvesting/providers/aviso.py +++ b/geospaas_harvesting/providers/aviso.py @@ -6,6 +6,9 @@ class AVISOProvider(TimeFilterMixin, Provider): """Provider for AVISO's Thredds""" + + type = 'aviso' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = 'https://tds.aviso.altimetry.fr/thredds' diff --git a/geospaas_harvesting/providers/ceda.py b/geospaas_harvesting/providers/ceda.py index 86d5030..65dc691 100644 --- a/geospaas_harvesting/providers/ceda.py +++ b/geospaas_harvesting/providers/ceda.py @@ -9,6 +9,8 @@ class CEDAProvider(TimeFilterMixin, Provider): """Provider for CEDA FTP server""" + type = 'ceda' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = "ftp://anon-ftp.ceda.ac.uk" diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index ce03113..60f3fee 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -1,5 +1,6 @@ """Code for searching CMEMS data (https://marine.copernicus.eu/)""" import calendar +import logging import re import tempfile from datetime import datetime @@ -21,6 +22,8 @@ class CMEMSProvider(Provider): """Provider for CMEMS using the copernicusmarine package""" + type = 'cmems' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_parameters_parser.add_arguments([ @@ -119,8 +122,9 @@ def make_filter(self): months_regex.append(f"{month:02d}({days_regex})") years_regex.append(f"({year}({'|'.join(months_regex)}))") + full_regex = '|'.join(years_regex) - return f".*_({'|'.join(years_regex)})_.*" + return f"^(.*_({full_regex})_.*)|({full_regex}.*)$" @staticmethod def _find_dict_in_list(dicts_list, key, value): @@ -200,6 +204,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs): class CMEMSMetadataNormalizer(): """Normalizer for CMEMS datasets""" + logger = logging.getLogger(__name__ + '.CMEMSMetadataNormalizer') + def __init__(self, product_info): self._product_info = product_info @@ -305,7 +311,7 @@ def get_time_coverage(self, entry_id): ), # generic 1 day coverage ( - re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}([-_.:T]|$)'), + re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}(\d{{6}})?([-_.:T]|$)'), lambda time: (time, time + relativedelta(days=1)) ), # generic 1 month coverage @@ -371,13 +377,23 @@ def get_dataset_parameters(self, dataset_info): variables = [] variable_dict = None for variable in dataset_info.metadata['variables']: + standard_name = variable.get('standard_name') + short_name = variable.get('short_name') + if standard_name: + search_name = standard_name + elif short_name: + search_name = short_name + else: + self.logger.error('No available name for the following variable, skipping: %s', + variable) + continue + try: - variable_dict = providers_utils.get_cf_or_wkv_standard_name( - variable['standard_name']) + variable_dict = providers_utils.get_cf_or_wkv_standard_name(search_name) except IndexError: try: variable_dict = pythesint.vocabularies['cf_standard_name'].fuzzy_search( - variable['standard_name'])[0] + search_name)[0] except IndexError: continue if variable_dict not in variables: diff --git a/geospaas_harvesting/providers/copernicus_scihub.py b/geospaas_harvesting/providers/copernicus_scihub.py index 569c2b1..af5b509 100644 --- a/geospaas_harvesting/providers/copernicus_scihub.py +++ b/geospaas_harvesting/providers/copernicus_scihub.py @@ -17,6 +17,9 @@ class CopernicusScihubProvider(Provider): """Provider for the Copernicus Scihub APIs""" + + type = 'copernicus_scihub' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_url = 'https://apihub.copernicus.eu/apihub/search' diff --git a/geospaas_harvesting/providers/earthdata_cmr.py b/geospaas_harvesting/providers/earthdata_cmr.py index 6e3402d..1ab57b6 100644 --- a/geospaas_harvesting/providers/earthdata_cmr.py +++ b/geospaas_harvesting/providers/earthdata_cmr.py @@ -16,6 +16,9 @@ class EarthDataCMRProvider(Provider): properly validated because of the massive amount of collections available through this API. This needs to be refined. """ + + type = 'earthdata_cmr' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_url = 'https://cmr.earthdata.nasa.gov/search/granules.umm_json' diff --git a/geospaas_harvesting/providers/erddap.py b/geospaas_harvesting/providers/erddap.py index ab21b37..935258e 100644 --- a/geospaas_harvesting/providers/erddap.py +++ b/geospaas_harvesting/providers/erddap.py @@ -6,6 +6,9 @@ class ERDDAPTableProvider(Provider): """Provider for tabledap APIs""" + + type = 'tabledap' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = kwargs['url'].rstrip('/') diff --git a/geospaas_harvesting/providers/ftp.py b/geospaas_harvesting/providers/ftp.py index f64dcbb..f874b62 100644 --- a/geospaas_harvesting/providers/ftp.py +++ b/geospaas_harvesting/providers/ftp.py @@ -9,6 +9,8 @@ class FTPProvider(TimeFilterMixin, Provider): """Generic FTP provider""" + type = 'ftp' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_parameters_parser.add_arguments([ diff --git a/geospaas_harvesting/providers/http.py b/geospaas_harvesting/providers/http.py index e246eec..31268e1 100644 --- a/geospaas_harvesting/providers/http.py +++ b/geospaas_harvesting/providers/http.py @@ -9,6 +9,8 @@ class HTTPProvider(TimeFilterMixin, Provider): """Generic HTTP directory provider""" + type = 'http' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_parameters_parser.add_arguments([ diff --git a/geospaas_harvesting/providers/jaxa.py b/geospaas_harvesting/providers/jaxa.py index 41cea4b..b7eaa1f 100644 --- a/geospaas_harvesting/providers/jaxa.py +++ b/geospaas_harvesting/providers/jaxa.py @@ -9,6 +9,8 @@ class GPortalProvider(TimeFilterMixin, Provider): """Provider for JAXA GPortal FTP server""" + type = 'gportal_ftp' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = "ftp://ftp.gportal.jaxa.jp" diff --git a/geospaas_harvesting/providers/local.py b/geospaas_harvesting/providers/local.py index b19d04c..7d9d253 100644 --- a/geospaas_harvesting/providers/local.py +++ b/geospaas_harvesting/providers/local.py @@ -26,6 +26,8 @@ class NansatProvider(TimeFilterMixin, Provider): """Provider for local files with metadata provided by Nansat """ + type = 'nansat' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.search_parameters_parser.add_arguments([ @@ -45,6 +47,8 @@ class NetCDFProvider(TimeFilterMixin, Provider): """Provider for local files with metadata extracted directly using """ + type = 'netcdf' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.longitude_attribute = kwargs.get('longitude_attribute', 'LONGITUDE') diff --git a/geospaas_harvesting/providers/metno.py b/geospaas_harvesting/providers/metno.py index f91d3f7..2b0879a 100644 --- a/geospaas_harvesting/providers/metno.py +++ b/geospaas_harvesting/providers/metno.py @@ -6,6 +6,9 @@ class METNOProvider(TimeFilterMixin, Provider): """Provider for MET NO's Thredds""" + + type = 'metno' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = 'https://thredds.met.no/thredds' diff --git a/geospaas_harvesting/providers/noaa.py b/geospaas_harvesting/providers/noaa.py index c6484b9..41d3111 100644 --- a/geospaas_harvesting/providers/noaa.py +++ b/geospaas_harvesting/providers/noaa.py @@ -9,6 +9,8 @@ class NOAAProvider(TimeFilterMixin, Provider): """Provider for NOAA FTP servers""" + type = 'noaa' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = "ftp://{server}.ncep.noaa.gov" diff --git a/geospaas_harvesting/providers/podaac.py b/geospaas_harvesting/providers/podaac.py index 53b97c6..d305d0f 100644 --- a/geospaas_harvesting/providers/podaac.py +++ b/geospaas_harvesting/providers/podaac.py @@ -6,6 +6,9 @@ class PODAACProvider(TimeFilterMixin, Provider): """Provider for PODAAC's OpenDAP""" + + type = 'podaac' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = 'https://opendap.jpl.nasa.gov/opendap' diff --git a/geospaas_harvesting/providers/resto.py b/geospaas_harvesting/providers/resto.py index 7a84164..d164e5c 100644 --- a/geospaas_harvesting/providers/resto.py +++ b/geospaas_harvesting/providers/resto.py @@ -23,6 +23,8 @@ class RestoProvider(Provider): parameters are fetched from the API. """ + type = 'resto' + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.url = kwargs['url'].rstrip('/') diff --git a/tests/providers/test_cmems.py b/tests/providers/test_cmems.py index 190b86b..7e8d5dd 100644 --- a/tests/providers/test_cmems.py +++ b/tests/providers/test_cmems.py @@ -1,5 +1,6 @@ # pylint: disable=protected-access """Tests for the CMEMS provider""" +import logging import unittest import unittest.mock as mock from datetime import datetime, timezone @@ -58,38 +59,43 @@ def test_make_filter(self): """Test making a regular expression matching a time range """ mock_crawler = mock.Mock() + regex_template = "^(.*_({regex})_.*)|({regex}.*)$" mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 9, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02))))_.*') + regex_template.format(regex='(2024(09(01|02)))')) mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 10, 15)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24' - '|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15))))_.*') + regex_template.format(regex=( + '(2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23' + '|24|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15)))' + ))) mock_crawler.time_range = (datetime(2024, 11, 1), datetime(2025, 1, 1)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|' - '18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01))))_.*') + regex_template.format(regex=( + '(202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|' + '16|17|18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2024, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024(01(01|02))))_.*') + regex_template.format(regex=('(2023(12(30|31)))|(2024(01(01|02)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2025, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02))))_.*') + regex_template.format(regex=( + '(2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02)))'))) mock_crawler.time_range = (None, None) self.assertIsNone(CMEMSCrawler.make_filter(mock_crawler)) @@ -549,17 +555,20 @@ def test_get_dataset_parameters(self): '.get_cf_or_wkv_standard_name') as mock_get_cf_wkv, \ mock.patch('pythesint.vocabularies', vocabularies): - mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError) + mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError, 'variable_4') vocabularies['cf_standard_name'].fuzzy_search.side_effect = ( IndexError, ['variable_3', 'varrriable_3']) - self.assertListEqual( - self.normalizer.get_dataset_parameters(DatasetInfo('foo', { - 'variables': ({'standard_name': 'var1'}, - {'standard_name': 'var2'}, - {'standard_name': 'var3'}) - })), - ['variable_1', 'variable_3']) + with self.assertLogs(logger=self.normalizer.logger, level=logging.ERROR): + self.assertListEqual( + self.normalizer.get_dataset_parameters(DatasetInfo('foo', { + 'variables': ({'standard_name': 'var1'}, + {'standard_name': 'var2'}, + {'standard_name': 'var3'}, + {'short_name': 'v4'}, + {'foo': 'bar'}) + })), + ['variable_1', 'variable_3', 'variable_4']) def test_get_service(self): """Test retrieval of the type of repository where the data is diff --git a/tests/test_config.py b/tests/test_config.py index 6e9e441..c800c89 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,11 +3,8 @@ import logging import unittest import unittest.mock as mock -from datetime import datetime, timezone as tz -from pathlib import Path import geospaas_harvesting.config as config -import geospaas_harvesting.providers.base as providers_base import geospaas_harvesting.providers.podaac as providers_podaac import geospaas_harvesting.providers.cmems as providers_cmems import geospaas_harvesting.providers.resto as providers_resto @@ -58,10 +55,17 @@ def test_parse(self): name='cmems', username='user', password='pass'), }) - def test_parse_error(self): + def test_parse_config_error(self): """Test error handling when parsing wrong configuration""" with self.assertLogs(config.logger, level=logging.ERROR): - _ = config.ProvidersArgument('providers').parse({'foo': {}}) + config.ProvidersArgument('providers').parse({'foo': {}}) + + def test_parse_no_provider_found(self): + """Test error handling when no provider matches the requested + type + """ + with self.assertLogs(config.logger, level=logging.ERROR): + config.ProvidersArgument('providers').parse({'foo': {'type': 'foo'}}) class ProvidersConfigurationTestCase(unittest.TestCase):