From 0f14f76f02e8732e03fa364e13ad4cf385dc5578 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 08:44:15 +0000 Subject: [PATCH 1/6] update cmems normalizer time coverage regex --- geospaas_harvesting/providers/cmems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index 3047ab9..6fdd939 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -307,7 +307,7 @@ def get_time_coverage(self, entry_id): ), # generic 1 day coverage ( - re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}([-_.:T]|$)'), + re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}(\d{{6}})?([-_.:T]|$)'), lambda time: (time, time + relativedelta(days=1)) ), # generic 1 month coverage From 3ad93cec73fb89ecff94131fc8e82da84676fb3f Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 08:45:25 +0000 Subject: [PATCH 2/6] cmems normalizer: support variables with no standard name --- geospaas_harvesting/providers/cmems.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index 6fdd939..4265d8e 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -1,5 +1,6 @@ """Code for searching CMEMS data (https://marine.copernicus.eu/)""" import calendar +import logging import re import tempfile from datetime import datetime @@ -202,6 +203,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs): class CMEMSMetadataNormalizer(): """Normalizer for CMEMS datasets""" + logger = logging.getLogger(__name__ + '.CMEMSMetadataNormalizer') + def __init__(self, product_info): self._product_info = product_info @@ -373,13 +376,21 @@ def get_dataset_parameters(self, dataset_info): variables = [] variable_dict = None for variable in dataset_info.metadata['variables']: + if variable['standard_name']: + search_name = variable['standard_name'] + elif variable['short_name']: + search_name = variable['short_name'] + else: + self.logger.error('No available name for the following variable, skipping: %s', + variable) + continue + try: - variable_dict = providers_utils.get_cf_or_wkv_standard_name( - variable['standard_name']) + variable_dict = providers_utils.get_cf_or_wkv_standard_name(search_name) except IndexError: try: variable_dict = pythesint.vocabularies['cf_standard_name'].fuzzy_search( - variable['standard_name'])[0] + search_name)[0] except IndexError: continue if variable_dict not in variables: From 16a71aded8ffd1ffb1b42423352244b469799c2a Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 08:46:00 +0000 Subject: [PATCH 3/6] cmems crawler: update date filter regex --- geospaas_harvesting/providers/cmems.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index 4265d8e..3ba3116 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -122,8 +122,9 @@ def make_filter(self): months_regex.append(f"{month:02d}({days_regex})") years_regex.append(f"({year}({'|'.join(months_regex)}))") + full_regex = '|'.join(years_regex) - return f".*_({'|'.join(years_regex)})_.*" + return f"^(.*_({full_regex})_.*)|({full_regex}.*)$" @staticmethod def _find_dict_in_list(dicts_list, key, value): From b0152881060f4c80a95f0ba5d02b64977cb9e845 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 08:46:19 +0000 Subject: [PATCH 4/6] update cmems make_filter tests --- tests/providers/test_cmems.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/providers/test_cmems.py b/tests/providers/test_cmems.py index 190b86b..18ae678 100644 --- a/tests/providers/test_cmems.py +++ b/tests/providers/test_cmems.py @@ -58,38 +58,43 @@ def test_make_filter(self): """Test making a regular expression matching a time range """ mock_crawler = mock.Mock() + regex_template = "^(.*_({regex})_.*)|({regex}.*)$" mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 9, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02))))_.*') + regex_template.format(regex='(2024(09(01|02)))')) mock_crawler.time_range = (datetime(2024, 9, 1), datetime(2024, 10, 15)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24' - '|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15))))_.*') + regex_template.format(regex=( + '(2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23' + '|24|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15)))' + ))) mock_crawler.time_range = (datetime(2024, 11, 1), datetime(2025, 1, 1)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|' - '18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01))))_.*') + regex_template.format(regex=( + '(202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|' + '16|17|18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2024, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024(01(01|02))))_.*') + regex_template.format(regex=('(2023(12(30|31)))|(2024(01(01|02)))'))) mock_crawler.time_range = (datetime(2023, 12, 30), datetime(2025, 1, 2)) self.assertEqual( CMEMSCrawler.make_filter(mock_crawler), - '.*_((2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02))))_.*') + regex_template.format(regex=( + '(2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02)))'))) mock_crawler.time_range = (None, None) self.assertIsNone(CMEMSCrawler.make_filter(mock_crawler)) From 716ce0e42234e270614fb589f6a0ccf828fffbde Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 09:07:46 +0000 Subject: [PATCH 5/6] update dataset variables retrieval for cmems normalizer --- geospaas_harvesting/providers/cmems.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/geospaas_harvesting/providers/cmems.py b/geospaas_harvesting/providers/cmems.py index 3ba3116..60f3fee 100644 --- a/geospaas_harvesting/providers/cmems.py +++ b/geospaas_harvesting/providers/cmems.py @@ -377,10 +377,12 @@ def get_dataset_parameters(self, dataset_info): variables = [] variable_dict = None for variable in dataset_info.metadata['variables']: - if variable['standard_name']: - search_name = variable['standard_name'] - elif variable['short_name']: - search_name = variable['short_name'] + standard_name = variable.get('standard_name') + short_name = variable.get('short_name') + if standard_name: + search_name = standard_name + elif short_name: + search_name = short_name else: self.logger.error('No available name for the following variable, skipping: %s', variable) From 68e1d8e424de9e9d929b310602352f7e9d9ad909 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Fri, 24 Jan 2025 09:07:53 +0000 Subject: [PATCH 6/6] update cmems tests --- tests/providers/test_cmems.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/providers/test_cmems.py b/tests/providers/test_cmems.py index 18ae678..7e8d5dd 100644 --- a/tests/providers/test_cmems.py +++ b/tests/providers/test_cmems.py @@ -1,5 +1,6 @@ # pylint: disable=protected-access """Tests for the CMEMS provider""" +import logging import unittest import unittest.mock as mock from datetime import datetime, timezone @@ -554,17 +555,20 @@ def test_get_dataset_parameters(self): '.get_cf_or_wkv_standard_name') as mock_get_cf_wkv, \ mock.patch('pythesint.vocabularies', vocabularies): - mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError) + mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError, 'variable_4') vocabularies['cf_standard_name'].fuzzy_search.side_effect = ( IndexError, ['variable_3', 'varrriable_3']) - self.assertListEqual( - self.normalizer.get_dataset_parameters(DatasetInfo('foo', { - 'variables': ({'standard_name': 'var1'}, - {'standard_name': 'var2'}, - {'standard_name': 'var3'}) - })), - ['variable_1', 'variable_3']) + with self.assertLogs(logger=self.normalizer.logger, level=logging.ERROR): + self.assertListEqual( + self.normalizer.get_dataset_parameters(DatasetInfo('foo', { + 'variables': ({'standard_name': 'var1'}, + {'standard_name': 'var2'}, + {'standard_name': 'var3'}, + {'short_name': 'v4'}, + {'foo': 'bar'}) + })), + ['variable_1', 'variable_3', 'variable_4']) def test_get_service(self): """Test retrieval of the type of repository where the data is