Skip to content

Commit

Permalink
Merge pull request #150 from nansencenter/refactor_structure
Browse files Browse the repository at this point in the history
Refactor the code structure
  • Loading branch information
aperrin66 authored Feb 5, 2025
2 parents bab3da8 + 249c320 commit 8d544e9
Show file tree
Hide file tree
Showing 18 changed files with 140 additions and 60 deletions.
11 changes: 11 additions & 0 deletions geospaas_harvesting/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""This module provides means to gather metadata about various datasets
into the GeoSPaaS catalog
"""

import importlib
import logging.config
import os
import os.path
import pkgutil
import sys
import yaml

Expand All @@ -20,3 +23,11 @@
if logging_configuration:
logging.config.dictConfig(logging_configuration)
logging.captureWarnings(True)

# Plugin discovery: import every importable top-level module whose name
# carries the 'geospaas_harvesting_' prefix, and keep the imported module
# objects in a dictionary keyed by module name.
discovered_plugins = {
    module_info.name: importlib.import_module(module_info.name)
    for module_info in pkgutil.iter_modules()
    if module_info.name.startswith('geospaas_harvesting_')
}
76 changes: 41 additions & 35 deletions geospaas_harvesting/config.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,37 @@
"""Configuration management"""
import importlib
import logging
import pkgutil

import geospaas_harvesting.providers.aviso as providers_aviso
import geospaas_harvesting.providers.base as providers_base
import geospaas_harvesting.providers.ceda as providers_ceda
import geospaas_harvesting.providers.cmems as providers_cmems
import geospaas_harvesting.providers.copernicus_scihub as providers_copernicus_scihub
import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr
import geospaas_harvesting.providers.erddap as providers_erddap
import geospaas_harvesting.providers.ftp as providers_ftp
import geospaas_harvesting.providers.http as providers_http
import geospaas_harvesting.providers.jaxa as providers_jaxa
import geospaas_harvesting.providers.local as providers_local
import geospaas_harvesting.providers.metno as providers_metno
import geospaas_harvesting.providers.noaa as providers_noaa
import geospaas_harvesting.providers.podaac as providers_podaac
import geospaas_harvesting.providers.resto as providers_resto
import geospaas_harvesting
from .arguments import ArgumentParser, BooleanArgument, DictArgument, ListArgument
from .providers.base import Provider
from .utils import read_yaml_file


def import_provider_modules():
    """Import the provider modules of the core package and of plugins

    For `geospaas_harvesting` and every module in
    `geospaas_harvesting.discovered_plugins`, the direct submodule
    named 'providers' is imported if it exists. When 'providers' is a
    package, each of its direct submodules is imported as well.

    Returns the list of modules imported by this function, in import
    order.
    """
    imported = []
    base_modules = [geospaas_harvesting, *geospaas_harvesting.discovered_plugins.values()]
    for base_module in base_modules:
        # Plugins are discovered by name only, so one of them can be a
        # plain module rather than a package. Such a module has no
        # `__path__` and no submodules to search: skip it instead of
        # failing with an AttributeError.
        search_path = getattr(base_module, '__path__', None)
        if search_path is None:
            continue

        for _, name, ispkg in pkgutil.iter_modules(search_path):
            if name != 'providers':
                continue
            providers = importlib.import_module(f"{base_module.__name__}.{name}")
            imported.append(providers)
            if ispkg:
                imported.extend(
                    importlib.import_module(f"{providers.__name__}.{provider_name}")
                    for _, provider_name, _ in pkgutil.iter_modules(providers.__path__))
    return imported


import_provider_modules()
logger = logging.getLogger(__name__)


class NoProviderFoundError(Exception):
    """Raised when no provider class matches a requested type"""


class Configuration():
"""Base class for configuration objects"""

Expand Down Expand Up @@ -57,23 +66,19 @@ class ProvidersArgument(DictArgument):
'password': 'pass123'
}
"""
provider_types = {
'aviso': providers_aviso.AVISOProvider,
'ceda': providers_ceda.CEDAProvider,
'cmems': providers_cmems.CMEMSProvider,
'copernicus_scihub': providers_copernicus_scihub.CopernicusScihubProvider,
'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider,
'ftp': providers_ftp.FTPProvider,
'gportal_ftp': providers_jaxa.GPortalProvider,
'http': providers_http.HTTPProvider,
'metno': providers_metno.METNOProvider,
'nansat': providers_local.NansatProvider,
'netcdf': providers_local.NetCDFProvider,
'noaa': providers_noaa.NOAAProvider,
'podaac': providers_podaac.PODAACProvider,
'resto': providers_resto.RestoProvider,
'tabledap': providers_erddap.ERDDAPTableProvider,
}
provider_classes = Provider.__subclasses__()

def __init__(self, name, **kwargs):
    """Forward the argument name and any keyword options unchanged to
    the parent class initializer
    """
    super().__init__(name, **kwargs)

def _find_provider(self, provider_type):
    """Return the first class in `self.provider_classes` whose `type`
    attribute equals `provider_type`.

    Classes which do not declare a `type` attribute are skipped, so
    that one incomplete provider class cannot abort the lookup of
    the others with an AttributeError.

    Raises NoProviderFoundError if no class matches.
    """
    # Unique sentinel: it can never compare equal to a configured type,
    # even if that type is None.
    undeclared = object()
    for provider_class in self.provider_classes:
        if getattr(provider_class, 'type', undeclared) == provider_type:
            return provider_class
    raise NoProviderFoundError(f"No provider found of type {provider_type}")

def parse(self, value):
"""Go through the list of provider settings and create the
Expand All @@ -84,15 +89,16 @@ def parse(self, value):
for provider_name, provider_settings in providers_dict.items():
try:
_providers[provider_name] = (
self.provider_types[provider_settings['type']](
self._find_provider(provider_settings['type'])(
name=provider_name,
**provider_settings,
))
except KeyError as error:
logger.error('Missing setting for provider: %s', error.args[0])
except NoProviderFoundError as error:
logger.error(error.args[0])
return _providers


class ProvidersConfiguration(Configuration):
"""Configuration manager for providers"""

Expand All @@ -110,7 +116,7 @@ class SearchConfiguration(Configuration):

def __init__(self):
self.providers = None
common_argument_parser = providers_base.Provider().search_parameters_parser
common_argument_parser = Provider().search_parameters_parser
self.config_arguments_parser = ArgumentParser([
DictArgument(
'common', argument_parser=common_argument_parser),
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/aviso.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

class AVISOProvider(TimeFilterMixin, Provider):
"""Provider for AVISO's Thredds"""

type = 'aviso'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = 'https://tds.aviso.altimetry.fr/thredds'
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/ceda.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class CEDAProvider(TimeFilterMixin, Provider):
"""Provider for CEDA FTP server"""

type = 'ceda'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "ftp://anon-ftp.ceda.ac.uk"
Expand Down
26 changes: 21 additions & 5 deletions geospaas_harvesting/providers/cmems.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Code for searching CMEMS data (https://marine.copernicus.eu/)"""
import calendar
import logging
import re
import tempfile
from datetime import datetime
Expand All @@ -21,6 +22,8 @@
class CMEMSProvider(Provider):
"""Provider for CMEMS using the copernicusmarine package"""

type = 'cmems'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_parameters_parser.add_arguments([
Expand Down Expand Up @@ -119,8 +122,9 @@ def make_filter(self):
months_regex.append(f"{month:02d}({days_regex})")

years_regex.append(f"({year}({'|'.join(months_regex)}))")
full_regex = '|'.join(years_regex)

return f".*_({'|'.join(years_regex)})_.*"
return f"^(.*_({full_regex})_.*)|({full_regex}.*)$"

@staticmethod
def _find_dict_in_list(dicts_list, key, value):
Expand Down Expand Up @@ -200,6 +204,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
class CMEMSMetadataNormalizer():
"""Normalizer for CMEMS datasets"""

logger = logging.getLogger(__name__ + '.CMEMSMetadataNormalizer')

def __init__(self, product_info):
self._product_info = product_info

Expand Down Expand Up @@ -305,7 +311,7 @@ def get_time_coverage(self, entry_id):
),
# generic 1 day coverage
(
re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}([-_.:T]|$)'),
re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}(\d{{6}})?([-_.:T]|$)'),
lambda time: (time, time + relativedelta(days=1))
),
# generic 1 month coverage
Expand Down Expand Up @@ -371,13 +377,23 @@ def get_dataset_parameters(self, dataset_info):
variables = []
variable_dict = None
for variable in dataset_info.metadata['variables']:
standard_name = variable.get('standard_name')
short_name = variable.get('short_name')
if standard_name:
search_name = standard_name
elif short_name:
search_name = short_name
else:
self.logger.error('No available name for the following variable, skipping: %s',
variable)
continue

try:
variable_dict = providers_utils.get_cf_or_wkv_standard_name(
variable['standard_name'])
variable_dict = providers_utils.get_cf_or_wkv_standard_name(search_name)
except IndexError:
try:
variable_dict = pythesint.vocabularies['cf_standard_name'].fuzzy_search(
variable['standard_name'])[0]
search_name)[0]
except IndexError:
continue
if variable_dict not in variables:
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/copernicus_scihub.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@

class CopernicusScihubProvider(Provider):
"""Provider for the Copernicus Scihub APIs"""

type = 'copernicus_scihub'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_url = 'https://apihub.copernicus.eu/apihub/search'
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/earthdata_cmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class EarthDataCMRProvider(Provider):
properly validated because of the massive amount of collections
available through this API. This needs to be refined.
"""

type = 'earthdata_cmr'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_url = 'https://cmr.earthdata.nasa.gov/search/granules.umm_json'
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/erddap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

class ERDDAPTableProvider(Provider):
"""Provider for tabledap APIs"""

type = 'tabledap'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = kwargs['url'].rstrip('/')
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/ftp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class FTPProvider(TimeFilterMixin, Provider):
"""Generic FTP provider"""

type = 'ftp'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_parameters_parser.add_arguments([
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class HTTPProvider(TimeFilterMixin, Provider):
"""Generic HTTP directory provider"""

type = 'http'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_parameters_parser.add_arguments([
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/jaxa.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class GPortalProvider(TimeFilterMixin, Provider):
"""Provider for JAXA GPortal FTP server"""

type = 'gportal_ftp'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "ftp://ftp.gportal.jaxa.jp"
Expand Down
4 changes: 4 additions & 0 deletions geospaas_harvesting/providers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class NansatProvider(TimeFilterMixin, Provider):
"""Provider for local files with metadata provided by Nansat
"""

type = 'nansat'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_parameters_parser.add_arguments([
Expand All @@ -45,6 +47,8 @@ class NetCDFProvider(TimeFilterMixin, Provider):
"""Provider for local files with metadata extracted directly using
"""

type = 'netcdf'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.longitude_attribute = kwargs.get('longitude_attribute', 'LONGITUDE')
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/metno.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

class METNOProvider(TimeFilterMixin, Provider):
"""Provider for MET NO's Thredds"""

type = 'metno'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = 'https://thredds.met.no/thredds'
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/noaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class NOAAProvider(TimeFilterMixin, Provider):
"""Provider for NOAA FTP servers"""

type = 'noaa'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "ftp://{server}.ncep.noaa.gov"
Expand Down
3 changes: 3 additions & 0 deletions geospaas_harvesting/providers/podaac.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

class PODAACProvider(TimeFilterMixin, Provider):
"""Provider for PODAAC's OpenDAP"""

type = 'podaac'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = 'https://opendap.jpl.nasa.gov/opendap'
Expand Down
2 changes: 2 additions & 0 deletions geospaas_harvesting/providers/resto.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class RestoProvider(Provider):
parameters are fetched from the API.
"""

type = 'resto'

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = kwargs['url'].rstrip('/')
Expand Down
Loading

0 comments on commit 8d544e9

Please sign in to comment.