Skip to content

Commit

Permalink
Merge pull request #151 from nansencenter/fix_cmems_010_043
Browse files (browse the repository at this point in the history)
CMEMS provider fixes for compatibility with SST_GLO_PHY_L4_NRT_010_043
  • Loading branch information
aperrin66 authored Feb 3, 2025
2 parents b2236ca + 68e1d8e commit 249c320
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 20 deletions.
24 changes: 19 additions & 5 deletions geospaas_harvesting/providers/cmems.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
"""Code for searching CMEMS data (https://marine.copernicus.eu/)"""
import calendar
import logging
import re
import tempfile
from datetime import datetime
Expand Down Expand Up @@ -121,8 +122,9 @@ def make_filter(self):
months_regex.append(f"{month:02d}({days_regex})")

years_regex.append(f"({year}({'|'.join(months_regex)}))")
full_regex = '|'.join(years_regex)

return f".*_({'|'.join(years_regex)})_.*"
return f"^(.*_({full_regex})_.*)|({full_regex}.*)$"

@staticmethod
def _find_dict_in_list(dicts_list, key, value):
Expand Down Expand Up @@ -202,6 +204,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
class CMEMSMetadataNormalizer():
"""Normalizer for CMEMS datasets"""

logger = logging.getLogger(__name__ + '.CMEMSMetadataNormalizer')

def __init__(self, product_info):
self._product_info = product_info

Expand Down Expand Up @@ -307,7 +311,7 @@ def get_time_coverage(self, entry_id):
),
# generic 1 day coverage
(
re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}([-_.:T]|$)'),
re.compile(rf'(^|[-_.:]){providers_utils.YEARMONTHDAY_REGEX}(\d{{6}})?([-_.:T]|$)'),
lambda time: (time, time + relativedelta(days=1))
),
# generic 1 month coverage
Expand Down Expand Up @@ -373,13 +377,23 @@ def get_dataset_parameters(self, dataset_info):
variables = []
variable_dict = None
for variable in dataset_info.metadata['variables']:
standard_name = variable.get('standard_name')
short_name = variable.get('short_name')
if standard_name:
search_name = standard_name
elif short_name:
search_name = short_name
else:
self.logger.error('No available name for the following variable, skipping: %s',
variable)
continue

try:
variable_dict = providers_utils.get_cf_or_wkv_standard_name(
variable['standard_name'])
variable_dict = providers_utils.get_cf_or_wkv_standard_name(search_name)
except IndexError:
try:
variable_dict = pythesint.vocabularies['cf_standard_name'].fuzzy_search(
variable['standard_name'])[0]
search_name)[0]
except IndexError:
continue
if variable_dict not in variables:
Expand Down
39 changes: 24 additions & 15 deletions tests/providers/test_cmems.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
# pylint: disable=protected-access
"""Tests for the CMEMS provider"""
import logging
import unittest
import unittest.mock as mock
from datetime import datetime, timezone
Expand Down Expand Up @@ -58,38 +59,43 @@ def test_make_filter(self):
"""Test making a regular expression matching a time range
"""
mock_crawler = mock.Mock()
regex_template = "^(.*_({regex})_.*)|({regex}.*)$"

mock_crawler.time_range = (datetime(2024, 9, 1),
datetime(2024, 9, 2))
self.assertEqual(
CMEMSCrawler.make_filter(mock_crawler),
'.*_((2024(09(01|02))))_.*')
regex_template.format(regex='(2024(09(01|02)))'))

mock_crawler.time_range = (datetime(2024, 9, 1),
datetime(2024, 10, 15))
self.assertEqual(
CMEMSCrawler.make_filter(mock_crawler),
'.*_((2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24'
'|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15))))_.*')
regex_template.format(regex=(
'(2024(09(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23'
'|24|25|26|27|28|29|30)|10(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15)))'
)))

mock_crawler.time_range = (datetime(2024, 11, 1),
datetime(2025, 1, 1))
self.assertEqual(
CMEMSCrawler.make_filter(mock_crawler),
'.*_((202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|'
'18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01))))_.*')
regex_template.format(regex=(
'(202412[0-3][0-9])|(2024(11(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|'
'16|17|18|19|20|21|22|23|24|25|26|27|28|29|30)))|(2025(01(01)))')))

mock_crawler.time_range = (datetime(2023, 12, 30),
datetime(2024, 1, 2))
self.assertEqual(
CMEMSCrawler.make_filter(mock_crawler),
'.*_((2023(12(30|31)))|(2024(01(01|02))))_.*')
regex_template.format(regex=('(2023(12(30|31)))|(2024(01(01|02)))')))

mock_crawler.time_range = (datetime(2023, 12, 30),
datetime(2025, 1, 2))
self.assertEqual(
CMEMSCrawler.make_filter(mock_crawler),
'.*_((2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02))))_.*')
regex_template.format(regex=(
'(2023(12(30|31)))|(2024[0-9]{4})|(2025(01(01|02)))')))

mock_crawler.time_range = (None, None)
self.assertIsNone(CMEMSCrawler.make_filter(mock_crawler))
Expand Down Expand Up @@ -549,17 +555,20 @@ def test_get_dataset_parameters(self):
'.get_cf_or_wkv_standard_name') as mock_get_cf_wkv, \
mock.patch('pythesint.vocabularies', vocabularies):

mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError)
mock_get_cf_wkv.side_effect = ('variable_1', IndexError, IndexError, 'variable_4')
vocabularies['cf_standard_name'].fuzzy_search.side_effect = (
IndexError, ['variable_3', 'varrriable_3'])

self.assertListEqual(
self.normalizer.get_dataset_parameters(DatasetInfo('foo', {
'variables': ({'standard_name': 'var1'},
{'standard_name': 'var2'},
{'standard_name': 'var3'})
})),
['variable_1', 'variable_3'])
with self.assertLogs(logger=self.normalizer.logger, level=logging.ERROR):
self.assertListEqual(
self.normalizer.get_dataset_parameters(DatasetInfo('foo', {
'variables': ({'standard_name': 'var1'},
{'standard_name': 'var2'},
{'standard_name': 'var3'},
{'short_name': 'v4'},
{'foo': 'bar'})
})),
['variable_1', 'variable_3', 'variable_4'])

def test_get_service(self):
"""Test retrieval of the type of repository where the data is
Expand Down

0 comments on commit 249c320

Please sign in to comment.