Skip to content

Commit 8f8172b

Browse files
committed
#20 - change CoreOrExtType to enum for type hinting (review feedback)
1 parent 9d9daca commit 8f8172b

File tree

6 files changed

+59
-55
lines changed

6 files changed

+59
-55
lines changed

src/dwcahandler/dwca/__init__.py

+13-13
Original file line number | Diff line number | Diff line change
@@ -25,17 +25,13 @@
2525
from enum import Enum
2626
from functools import wraps
2727
from typing import Optional, Union
28-
28+
import logging
2929
import pandas as pd
3030

3131

3232
class CoreOrExtType(Enum):
3333
CORE = "core"
3434
EXTENSION = "extension"
35-
@dataclass(frozen=True)
36-
class MetaDefaultFields:
37-
ID: str = "id"
38-
CORE_ID: str = "coreid"
3935

4036
# Default keys for content when creating dwca
4137
DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])(
@@ -184,8 +180,17 @@ class Defaults:
184180
# Translation csv encoding values
185181
translate_table: dict = field(init=False,
186182
default_factory=lambda: {'LF': '\r\n', '\\t': '\t', '\\n': '\n'})
183+
MetaDefaultFields: namedtuple = namedtuple("MetaDefaultFields", ["ID", "CORE_ID"])(
184+
ID="id",
185+
CORE_ID="coreid"
186+
)
187+
187188

188189

190+
# Imports at end of file to allow classes to be used
191+
from dwcahandler.dwca.terms import Terms, NsPrefix
192+
from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA,
193+
MetaElementAttributes, get_meta_class_row_type)
189194
@dataclass
190195
class ContentData:
191196
"""A class describing the content data used for core and extension.
@@ -220,13 +225,8 @@ def add_data(self, other_csv_file_type: ContentData):
220225
self.type = other_csv_file_type.type
221226
return False
222227

223-
228+
from dwcahandler.dwca.eml import Eml
224229
from dwcahandler.dwca.base_dwca import BaseDwca
225-
from dwcahandler.dwca.core_dwca import DfContent, Dwca
230+
from dwcahandler.dwca.core_dwca import Dwca, DfContent
226231
from dwcahandler.dwca.dwca_factory import DwcaHandler
227-
from dwcahandler.dwca.dwca_meta import (MetaDwCA, MetaElementAttributes,
228-
MetaElementInfo, MetaElementTypes,
229-
get_meta_class_row_type)
230-
from dwcahandler.dwca.eml import Eml
231-
# Imports at end of file to allow classes to be used
232-
from dwcahandler.dwca.terms import NsPrefix, Terms
232+

src/dwcahandler/dwca/core_dwca.py

+22-20
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
from numpy import nan
2222
from pandas.errors import EmptyDataError
2323
from pandas.io import parsers
24-
from dwcahandler.dwca import (BaseDwca, CoreOrExtType, MetaDefaultFields, CSVEncoding,
24+
from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding,
2525
ContentData, Defaults, Eml, Terms, get_keys,
2626
MetaDwCA, MetaElementInfo, MetaElementTypes,
2727
MetaElementAttributes, Stat, record_diff_stat)
@@ -93,9 +93,9 @@ def _update_core_ids(self, core_df) -> str:
9393
:param core_df: The data frame to generate identifiers for
9494
return id field
9595
"""
96-
if MetaDefaultFields.ID not in core_df.columns.to_list():
97-
core_df.insert(0, MetaDefaultFields.ID, core_df.apply(lambda _: uuid.uuid4(), axis=1), False)
98-
return MetaDefaultFields.ID
96+
if self.defaults_prop.MetaDefaultFields.ID not in core_df.columns.to_list():
97+
core_df.insert(0, self.defaults_prop.MetaDefaultFields.ID, core_df.apply(lambda _: uuid.uuid4(), axis=1), False)
98+
return self.defaults_prop.MetaDefaultFields.ID
9999
else:
100100
raise ValueError("core df should not contain id column")
101101

@@ -140,23 +140,23 @@ def _update_extension_ids(self, csv_content: pd.DataFrame, core_df_content: pd.D
140140
set(link_col).issubset(set(csv_content.index.names))):
141141
csv_content.reset_index(inplace=True, drop=True)
142142

143-
csv_content = csv_content.merge(core_df_content.loc[:, MetaDefaultFields.ID],
143+
csv_content = csv_content.merge(core_df_content.loc[:, self.defaults_prop.MetaDefaultFields.ID],
144144
left_on=link_col,
145145
right_on=link_col, how='outer')
146146

147-
if MetaDefaultFields.ID in csv_content.columns.to_list():
148-
unmatched_content = csv_content[csv_content[MetaDefaultFields.ID].isnull()]
149-
unmatched_content = unmatched_content.drop(columns=[MetaDefaultFields.ID])
147+
if self.defaults_prop.MetaDefaultFields.ID in csv_content.columns.to_list():
148+
unmatched_content = csv_content[csv_content[self.defaults_prop.MetaDefaultFields.ID].isnull()]
149+
unmatched_content = unmatched_content.drop(columns=[self.defaults_prop.MetaDefaultFields.ID])
150150
if len(unmatched_content) > 0:
151151
log.info("There are orphaned keys in extension file")
152152
pd.set_option("display.max_columns", 7)
153153
pd.set_option('display.max_colwidth', 15)
154154
pd.set_option('display.max_rows', 10)
155155
log.info("\n%s", unmatched_content)
156-
csv_content = csv_content[~csv_content[MetaDefaultFields.ID].isnull()]
157-
col = csv_content.pop(MetaDefaultFields.ID)
156+
csv_content = csv_content[~csv_content[self.defaults_prop.MetaDefaultFields.ID].isnull()]
157+
col = csv_content.pop(self.defaults_prop.MetaDefaultFields.ID)
158158
csv_content.insert(0, col.name, col)
159-
csv_content.rename(columns={MetaDefaultFields.ID: ext_core_id_field}, inplace=True)
159+
csv_content.rename(columns={self.defaults_prop.MetaDefaultFields.ID: ext_core_id_field}, inplace=True)
160160
return csv_content, ext_core_id_field
161161
else:
162162
raise ValueError("Something is not right. The core id failed to be created")
@@ -207,8 +207,9 @@ def _find_fields_with_zero_idx(meta_element_fields: list):
207207
def _add_first_id_field_if_exists(meta_element: MetaElementAttributes):
208208
zero_index_exist = _find_fields_with_zero_idx(meta_element.fields)
209209
if meta_element.core_id and meta_element.core_id.index and not zero_index_exist:
210-
return [MetaDefaultFields.ID] if meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE \
211-
else [MetaDefaultFields.CORE_ID]
210+
return [self.defaults_prop.MetaDefaultFields.ID] if (
211+
meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE) \
212+
else [self.defaults_prop.MetaDefaultFields.CORE_ID]
212213
else:
213214
return []
214215

@@ -287,7 +288,7 @@ def _update_values(self, df_content, delta_df_content, keys, stat):
287288
:return: The updated content
288289
"""
289290
# Extract columns that need updating, excluding self.keys and id
290-
non_update_column = list(MetaDefaultFields)
291+
non_update_column = list(self.defaults_prop.MetaDefaultFields)
291292
non_update_column.extend(keys)
292293
update_columns = [i for i in delta_df_content.columns.to_list()
293294
if i not in non_update_column]
@@ -429,12 +430,13 @@ def _extract_core_keys(self, core_content, keys):
429430
:return: A data frame indexed by the `id` column that contains the
430431
key elements for each record
431432
"""
432-
columns = [MetaDefaultFields.ID] if MetaDefaultFields.ID in core_content.columns.tolist() else []
433+
columns = [self.defaults_prop.MetaDefaultFields.ID] \
434+
if self.defaults_prop.MetaDefaultFields.ID in core_content.columns.tolist() else []
433435
if all(key in core_content.columns for key in keys):
434436
columns.extend(keys)
435437
df = core_content[columns]
436-
if MetaDefaultFields.ID in core_content.columns.tolist():
437-
df.set_index(MetaDefaultFields.ID, drop=True, inplace=True)
438+
if self.defaults_prop.MetaDefaultFields.ID in core_content.columns.tolist():
439+
df.set_index(self.defaults_prop.MetaDefaultFields.ID, drop=True, inplace=True)
438440
else:
439441
raise ValueError(f"Keys does not exist in core content {''.join(keys)}")
440442
return df
@@ -869,17 +871,17 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N
869871

870872
if not self.check_duplicates(keys_df, content.keys, error_file):
871873
log.error("Validation failed for %s %s content for duplicates keys %s",
872-
content.meta_info.core_or_ext_type, content.meta_info.type, content.keys)
874+
content.meta_info.core_or_ext_type.value, content.meta_info.type, content.keys)
873875
validation_content_success = False
874876

875877
if not self._validate_columns(content):
876878
log.error("Validation failed for %s %s content for duplicate columns",
877-
content.meta_info.core_or_ext_type, content.meta_info.type)
879+
content.meta_info.core_or_ext_type.value, content.meta_info.type)
878880
validation_content_success = False
879881

880882
if validation_content_success:
881883
log.info("Validation successful for %s %s content for unique keys %s",
882-
content.meta_info.core_or_ext_type, content.meta_info.type, content.keys)
884+
content.meta_info.core_or_ext_type.value, content.meta_info.type, content.keys)
883885
else:
884886
validation_success = False
885887

src/dwcahandler/dwca/dwca_meta.py

+12-12
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import re
1212
from dataclasses import dataclass, field
1313
from typing import Optional
14-
from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms, MetaDefaultFields
14+
from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms, Defaults
1515
from enum import Enum
1616

1717

@@ -72,7 +72,7 @@ class MetaElementAttributes:
7272
@dataclass
7373
class MetaDwCA:
7474
"""Complete Metadata for a DwCA including dataset metadata and schema information"""
75-
eml_xml_filename: str = field(default='eml.xml')
75+
eml_xml_filename: str = field(default=Defaults.eml_xml_filename)
7676
dwca_meta: ET.Element = field(init=False)
7777
meta_elements: list[MetaElementAttributes] = field(default_factory=list, init=False)
7878

@@ -89,10 +89,10 @@ def extract_field_attr_value(field_elm, attrib):
8989

9090
fields = node_elm.findall(f'{ns}field')
9191
id_field = []
92-
if core_or_ext_type == 'core':
93-
id_field = node_elm.findall(f'{ns}id')
92+
if core_or_ext_type == CoreOrExtType.CORE:
93+
id_field = node_elm.findall(f'{ns}{Defaults.MetaDefaultFields.ID}')
9494
else:
95-
id_field = node_elm.findall(f'{ns}coreid')
95+
id_field = node_elm.findall(f'{ns}{Defaults.MetaDefaultFields.CORE_ID}')
9696
file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
9797
meta_element_info = MetaElementInfo(
9898
core_or_ext_type=core_or_ext_type,
@@ -138,11 +138,11 @@ def read_meta_file(self, meta_file):
138138
tree = ET.parse(meta_file)
139139
root = tree.getroot()
140140
ns = self._get_namespace(root)
141-
node_elm = root.find(f'{ns}{CoreOrExtType.CORE}')
141+
node_elm = root.find(f"{ns}{CoreOrExtType.CORE.value}")
142142
self.meta_elements = [self.__extract_meta_info(ns, node_elm, CoreOrExtType.CORE)]
143143
self.meta_elements.extend(
144144
[self.__extract_meta_info(ns, ne, CoreOrExtType.EXTENSION)
145-
for ne in root.findall(f'{ns}{CoreOrExtType.EXTENSION}')])
145+
for ne in root.findall(f"{ns}{CoreOrExtType.EXTENSION.value}")])
146146

147147
def remove_meta_elements(self, exts_to_remove):
148148
"""Remove extension files from the metadata
@@ -211,7 +211,7 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes):
211211
212212
:param meta_elem_attrib: The meta information for the row
213213
"""
214-
elem = ET.SubElement(self.dwca_meta, meta_elem_attrib.meta_element_type.core_or_ext_type)
214+
elem = ET.SubElement(self.dwca_meta, meta_elem_attrib.meta_element_type.core_or_ext_type.value)
215215
elem.attrib['encoding'] = meta_elem_attrib.meta_element_type.charset_encoding
216216
elem.attrib['rowType'] = meta_elem_attrib.meta_element_type.type.value
217217
elem.attrib['fieldsTerminatedBy'] = meta_elem_attrib.meta_element_type.csv_encoding.csv_delimiter
@@ -225,13 +225,13 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes):
225225
location = ET.SubElement(files, 'location')
226226
location.text = meta_elem_attrib.meta_element_type.file_name
227227
if meta_elem_attrib.core_id:
228-
id_field = ET.SubElement(elem, MetaDefaultFields.ID) \
229-
if meta_elem_attrib.meta_element_type.core_or_ext_type == 'core' \
230-
else ET.SubElement(elem, MetaDefaultFields.CORE_ID)
228+
id_field = ET.SubElement(elem, Defaults.MetaDefaultFields.ID) \
229+
if meta_elem_attrib.meta_element_type.core_or_ext_type == CoreOrExtType.CORE \
230+
else ET.SubElement(elem, Defaults.MetaDefaultFields.CORE_ID)
231231
id_field.attrib['index'] = meta_elem_attrib.core_id.index
232232

233233
for _, f in enumerate(meta_elem_attrib.fields):
234-
if f.field_name not in list(MetaDefaultFields):
234+
if f.field_name not in list(Defaults.MetaDefaultFields):
235235
field_elem = ET.SubElement(elem, "field")
236236
if f.index is not None:
237237
field_elem.attrib['index'] = f.index

src/dwcahandler/dwca/terms.py

+6-4
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717

1818
def absolute_file_paths(directory):
19-
"""Convert files in a directory into absolute paths and return
19+
"""Convert data in a directory into absolute paths and return
2020
as a generator
2121
2222
:param directory: The directory to scan.
@@ -71,7 +71,7 @@ class Terms:
7171

7272
GBIF_EXT = "https://rs.gbif.org/extensions.json"
7373

74-
GBIF_REGISTERED_EXTENSION = [e for e in GbifRegisteredExt]
74+
GBIF_REGISTERED_EXTENSION = pd.DataFrame(columns=["prefix", "identifier", "namespace", "issued_date"])#[e for e in GbifRegisteredExt]
7575

7676
DWC_SOURCE_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv"
7777

@@ -267,7 +267,7 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame
267267

268268
log.info("Current class and terms")
269269

270-
exclude_update_prefixes = [NsPrefix.DC.value]
270+
exclude_update_prefixes = [NsPrefix.DC.value, NsPrefix.DWC.value]
271271
terms = Terms()
272272
print(terms.class_df.groupby(["prefix"]).agg(
273273
class_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count")
@@ -277,7 +277,7 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame
277277
))
278278
terms.class_df = terms.class_df[terms.class_df.prefix.isin(exclude_update_prefixes)]
279279
terms.terms_df = terms.terms_df[terms.terms_df.prefix.isin(exclude_update_prefixes)]
280-
terms.update_dwc_terms()
280+
#terms.update_dwc_terms()
281281
terms.update_gbif_ext()
282282
terms.class_df = __sort_values(terms.class_df, "class")
283283
terms.terms_df = __sort_values(terms.terms_df, "term")
@@ -291,3 +291,5 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame
291291
term_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count")
292292
))
293293
return terms.terms_df, terms.class_df
294+
295+
#Terms.update_terms()

tests/__init__.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import csv
66
from dwcahandler import Eml
77
from xml.dom.minidom import parseString
8-
from dwcahandler import MetaDwCA, MetaDefaultFields
8+
from dwcahandler import MetaDwCA, Defaults
99

1010

1111
def get_eml_content():
@@ -25,7 +25,7 @@ def make_fields(columns: list, term_uri: str, field_start: int = 0, core_id: str
2525
idx_start = field_start if field_start != -2 else 0
2626

2727
for idx, col in enumerate(columns):
28-
if not (col in list(MetaDefaultFields)):
28+
if not (col in list(Defaults.MetaDefaultFields)):
2929
dwc_term_uri = "http://rs.tdwg.org/dwc/terms" if col == 'occurrenceID' else term_uri
3030
fields = fields + '\n' + f'<field index="{str(idx + idx_start)}" term="{dwc_term_uri}/{col}"/>'
3131

tests/test_write_dwca.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ def test_generate_dwca_without_ext(self):
5151
root = tree.getroot()
5252
ns = _get_namespace(root)
5353
assert ns == "{http://rs.tdwg.org/dwc/text/}"
54-
core_node = root.find(f'{ns}{CoreOrExtType.CORE}')
54+
core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}')
5555
assert core_node is not None
5656
fields = core_node.findall(f'{ns}field')
5757
term_fields = [f.attrib.get('term') for f in fields]
@@ -90,7 +90,7 @@ def test_generate_dwca_with_ext(self):
9090
root = tree.getroot()
9191
ns = _get_namespace(root)
9292
assert ns == "{http://rs.tdwg.org/dwc/text/}"
93-
core_node = root.find(f'{ns}{CoreOrExtType.CORE}')
93+
core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}')
9494
assert core_node is not None
9595
fields = core_node.findall(f'{ns}field')
9696
term_fields = [f.attrib.get('term') for f in fields]
@@ -99,7 +99,7 @@ def test_generate_dwca_with_ext(self):
9999
assert any(sample_col in f for f in term_fields)
100100
core_file = core_node.find(f'{ns}files').find(f'{ns}location').text
101101

102-
ext_node = root.find(f'{ns}{CoreOrExtType.EXTENSION}')
102+
ext_node = root.find(f'{ns}{CoreOrExtType.EXTENSION.value}')
103103
assert ext_node is not None
104104
fields = ext_node.findall(f'{ns}field')
105105
term_fields = [f.attrib.get('term') for f in fields]
@@ -150,7 +150,7 @@ def test_generate_dwca_in_memory(self):
150150
root = tree.getroot()
151151
ns = _get_namespace(root)
152152
assert ns == "{http://rs.tdwg.org/dwc/text/}"
153-
core_node = root.find(f'{ns}{CoreOrExtType.CORE}')
153+
core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}')
154154
assert core_node is not None
155155
fields = core_node.findall(f'{ns}field')
156156
term_fields = [f.attrib.get('term') for f in fields]

0 commit comments

Comments
 (0)