From cbe3c1fe08ab8cfe8144c49b6f735b7cc55b5e49 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 19 Feb 2025 11:51:36 +1100 Subject: [PATCH 01/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Changes to support event types data --- .github/workflows/run-tests.yml | 2 +- src/dwcahandler/dwca/__init__.py | 37 +- src/dwcahandler/dwca/base_dwca.py | 45 +-- src/dwcahandler/dwca/core_dwca.py | 335 ++++++++---------- src/dwcahandler/dwca/dwca_factory.py | 10 +- src/dwcahandler/dwca/dwca_meta.py | 168 ++++----- src/dwcahandler/dwca/terms.py | 232 ++++++++++-- src/dwcahandler/dwca/terms/class-rowtype.csv | 14 + .../dwca/terms/darwin-core-terms.csv | 215 ----------- .../dwca/terms/dublin-core-terms.csv | 56 --- src/dwcahandler/dwca/terms/terms.csv | 273 ++++++++++++++ tests/__init__.py | 52 ++- .../event/cameratrap-sample1/event.txt | 18 + .../measurement_or_fact.txt | 21 ++ .../event/cameratrap-sample1/meta.xml | 60 ++++ .../event/cameratrap-sample1/occurrence.txt | 12 + .../event/cameratrap-sample2/event.txt | 18 + .../extended_measurement_or_fact.txt | 21 ++ .../event/cameratrap-sample2/meta.xml | 62 ++++ .../event/cameratrap-sample2/occurrence.txt | 12 + tests/input_files/occurrence/sample1/meta.xml | 28 ++ .../sample1/multimedia.txt} | 2 +- .../sample1/occurrence.txt} | 0 tests/input_files/occurrence/sample2/meta.xml | 28 ++ .../occurrence/sample2/multimedia.txt | 9 + .../occurrence/sample2/occurrence.txt | 7 + tests/input_files/occurrence/sample3/meta.xml | 28 ++ .../occurrence/sample3/multimedia.txt | 12 + .../occurrence/sample3/occurrence.txt | 7 + .../multimedia/multimedia_file.csv | 0 .../multimedia/multimedia_file.tsv | 0 .../{ => sample}/occurrence/occ_file1.csv | 0 .../{ => sample}/occurrence/occ_file1.tsv | 0 .../occ_file2_additional_column.csv | 0 .../occ_file2_additional_column.tsv | 0 tests/test_create_core_and_ext_content.py | 62 ++-- tests/test_create_dwca.py | 148 ++++++++ tests/test_delete_dwca_content.py | 50 +-- 
tests/test_listterms.py | 30 +- tests/test_merge_dwca.py | 222 +++++++++++- tests/test_multimedia_content.py | 42 +-- tests/test_validate_dwca.py | 12 +- tests/test_write_dwca.py | 38 +- 43 files changed, 1610 insertions(+), 778 deletions(-) create mode 100644 src/dwcahandler/dwca/terms/class-rowtype.csv delete mode 100644 src/dwcahandler/dwca/terms/darwin-core-terms.csv delete mode 100644 src/dwcahandler/dwca/terms/dublin-core-terms.csv create mode 100644 src/dwcahandler/dwca/terms/terms.csv create mode 100644 tests/input_files/event/cameratrap-sample1/event.txt create mode 100644 tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt create mode 100644 tests/input_files/event/cameratrap-sample1/meta.xml create mode 100644 tests/input_files/event/cameratrap-sample1/occurrence.txt create mode 100644 tests/input_files/event/cameratrap-sample2/event.txt create mode 100644 tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt create mode 100644 tests/input_files/event/cameratrap-sample2/meta.xml create mode 100644 tests/input_files/event/cameratrap-sample2/occurrence.txt create mode 100755 tests/input_files/occurrence/sample1/meta.xml rename tests/input_files/{sample/multimedia.csv => occurrence/sample1/multimedia.txt} (92%) rename tests/input_files/{sample/occurrence.csv => occurrence/sample1/occurrence.txt} (100%) create mode 100755 tests/input_files/occurrence/sample2/meta.xml create mode 100644 tests/input_files/occurrence/sample2/multimedia.txt create mode 100644 tests/input_files/occurrence/sample2/occurrence.txt create mode 100755 tests/input_files/occurrence/sample3/meta.xml create mode 100644 tests/input_files/occurrence/sample3/multimedia.txt create mode 100644 tests/input_files/occurrence/sample3/occurrence.txt rename tests/input_files/{ => sample}/multimedia/multimedia_file.csv (100%) rename tests/input_files/{ => sample}/multimedia/multimedia_file.tsv (100%) rename tests/input_files/{ => sample}/occurrence/occ_file1.csv 
(100%) rename tests/input_files/{ => sample}/occurrence/occ_file1.tsv (100%) rename tests/input_files/{ => sample}/occurrence/occ_file2_additional_column.csv (100%) rename tests/input_files/{ => sample}/occurrence/occ_file2_additional_column.tsv (100%) create mode 100644 tests/test_create_dwca.py diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8759921..df6d906 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -2,7 +2,7 @@ name: Run test on: push: - branches: [ "main", "develop" ] + branches: [ "main", "develop", "feature/events" ] pull_request: branches: [ "main", "develop" ] diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index d5bc84a..8c3281b 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -60,23 +60,6 @@ def __convert_values(self, v): return translate_table[v] if v in translate_table.keys() else v -@dataclass -class CsvFileType: - """A description of a CSV file in a DwCA - """ - files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe - type: str # 'occurrence', 'taxon', 'event', multimedia,... - keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record - # when creating dwca. for core other than occurrence, this neeeds to be supplied as key. - # column keys lookup in core or extension for delete records - associated_files_loc: Optional[str] = None # in case there are associated media that need to be packaged in dwca - csv_encoding: CSVEncoding = field( - default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"', - csv_escape_char='"')) - # delimiter: Optional[str] = None - # file delimiter type when reading the csv. 
if not supplied, the collectory setting delimiter is read in for the dr - - class Stat: """Record statistics for a DwCA""" start_record_count: int = 0 @@ -176,8 +159,24 @@ class Defaults: # Imports at end of file to allow classes to be used -from dwcahandler.dwca.terms import Terms -from dwcahandler.dwca.dwca_meta import Element, MetaElementTypes, MetaElementInfo, MetaDwCA +from dwcahandler.dwca.terms import Terms, NsPrefix +from dwcahandler.dwca.dwca_meta import MetaElementTypes, MetaElementInfo, MetaDwCA, MetaElementAttributes +@dataclass +class CsvFileType: + """A description of a CSV file in a DwCA + """ + files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe + type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,... + keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record + # when creating dwca. for core other than occurrence, this neeeds to be supplied as key. + # column keys lookup in core or extension for delete records + associated_files_loc: Optional[str] = None # in case there are associated media that need to be packaged in dwca + csv_encoding: CSVEncoding = field( + default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"', + csv_escape_char='"')) + # delimiter: Optional[str] = None + # file delimiter type when reading the csv. 
if not supplied, the collectory setting delimiter is read in for the dr + from dwcahandler.dwca.eml import Eml from dwcahandler.dwca.base_dwca import BaseDwca from dwcahandler.dwca.core_dwca import Dwca, DfContent diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index ead8a88..679b676 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -15,11 +15,13 @@ class BaseDwca(metaclass=ABCMeta): """An abstract DwCA that provides basic operations""" @abstractmethod - def extract_csv_content(self, csv_info: CsvFileType, core_ext_type: CoreOrExtType): + def extract_csv_content(self, csv_info: CsvFileType, core_ext_type: CoreOrExtType, + build_coreid_for_ext: bool = False): """Get the content from a single file in the DwCA :param csv_info: The CSV file to extract :param core_ext_type: Is this a core or extension CSV file + :param build_coreid_for_ext: indicator to add id and core id """ pass @@ -45,7 +47,7 @@ def generate_meta(self): pass @abstractmethod - def write_dwca(self, output_dwca_path: str): + def write_dwca(self, output_dwca_path: Union[str | BytesIO]): """Write the content of the DwCA to a directory. Writes all CSV files, as well as a meta-file and EML file for the archive. 
@@ -80,20 +82,21 @@ def set_keys(self, keys: dict): def convert_associated_media_to_extension(self): pass + """ @abstractmethod def merge_df_dwc_columns(self): pass - + """ @abstractmethod def delete_records(self, records_to_delete: CsvFileType): pass @abstractmethod - def validate_content(self, content_type_to_validate: list[str] = None, error_file: str = None): + def validate_content(self, content_to_validate: list[MetaElementTypes] = None, error_file: str = None): pass @abstractmethod - def get_content(self, ext_type: str): + def get_content(self, class_type: MetaElementTypes = None, name_space: str = None): pass @abstractmethod @@ -107,16 +110,10 @@ def fill_additional_info(self): """ Adds extra info based on the information in the content, mainly used by ingestion process """ - multimedia_content, _ = self.get_content(MetaElementTypes.get_element('multimedia').row_type_ns) - if multimedia_content: + contents = self.get_content(class_type=MetaElementTypes.MULTIMEDIA) + for multimedia_content, _ in contents: self.add_multimedia_info_to_content(multimedia_content) - def remove_extensions(self, exclude_ext_files: list, output_dwca_path: str): - self.extract_dwca(exclude_ext_files=exclude_ext_files) - self.generate_eml() - self.generate_meta() - self.write_dwca(output_dwca_path) - def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: Union[str, BytesIO]): self.extract_dwca() self.delete_records(records_to_delete) @@ -124,32 +121,33 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_pat self.generate_meta() self.write_dwca(output_dwca_path) - def create_dwca(self, core_csv: CsvFileType, output_dwca_path: str, + def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str | BytesIO], ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, eml_content: Union[str, Eml] = ''): if ext_csv_list is None: ext_csv_list = [] - self.extract_csv_content(core_csv, CoreOrExtType.CORE) + 
self.extract_csv_content(csv_info=core_csv, core_ext_type=CoreOrExtType.CORE, + build_coreid_for_ext=True if len(ext_csv_list) > 0 else False) # Only validate core content if validate_content and not self.validate_content(): raise SystemExit(Exception("Some validations error found. Dwca is not created.")) # if multimedia files is supplied, do not attempt to convert associated media to multimedia - if not any(ext.type == 'multimedia' for ext in ext_csv_list): + if not any(ext.type == MetaElementTypes.MULTIMEDIA for ext in ext_csv_list): image_ext = self.convert_associated_media_to_extension() if image_ext: ext_csv_list.append(image_ext) for ext in ext_csv_list: - self.extract_csv_content(ext, CoreOrExtType.EXTENSION) + self.extract_csv_content(ext, CoreOrExtType.EXTENSION, True) self.fill_additional_info() self.generate_eml(eml_content) self.generate_meta() - self.write_dwca(output_dwca_path) + self.write_dwca(output_dwca) # Key lookup: For merging to update content and also used as lookup to link extensions to core records. 
# keys_lookup keys used for merging 2 dwcas @@ -173,16 +171,9 @@ def merge_dwca(self, delta_dwca: BaseDwca, output_dwca_path: Union[str, BytesIO] def validate_dwca(self, content_keys: dict, error_file: str): self.extract_dwca() set_keys = self.set_keys(content_keys) - content_type_to_validate = list(set_keys.keys()) - return self.validate_content(content_type_to_validate=content_type_to_validate, error_file=error_file) + #content_type_to_validate = list(set_keys.keys()) + return self.validate_content(content_to_validate=set_keys, error_file=error_file) def validate_file(self, csv: CsvFileType, error_file: str): self.extract_csv_content(csv, CoreOrExtType.CORE) return self.validate_content(error_file=error_file) - - def sanitize_dwca(self, output_dwca_path: str): - self.extract_dwca() - self.merge_df_dwc_columns() - self.generate_eml() - self.generate_meta() - self.write_dwca(output_dwca_path) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 08b6a87..e61b367 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -20,12 +20,13 @@ import pandas as pd from numpy import nan +from pandas import isnull from pandas.errors import EmptyDataError from pandas.io import parsers from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, CsvFileType, Defaults, Eml, MetaDwCA, MetaElementInfo, MetaElementTypes, - Stat, record_diff_stat) + MetaElementAttributes, Stat, record_diff_stat) logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.DEBUG) @@ -84,7 +85,7 @@ def count_stat(self, content): """ return len(content) - def _update_core_ids(self, core_df): + def _update_core_ids(self, core_df, keys: list) -> str: """Generate core identifiers for a core data frame. UUID identifiers are generated for each row in the core data frame. @@ -92,11 +93,16 @@ def _update_core_ids(self, core_df): useful identifier is available in the source data. 
:param core_df: The data frame to generate identifiers for + :param keys: The keys to use for the id + return id field """ - if 'id' not in core_df.columns.to_list(): - core_df.insert(0, 'id', core_df.apply(lambda _: uuid.uuid4(), axis=1), False) + id_field = "id" + if id_field not in core_df.columns.to_list(): + core_df.insert(0, id_field, core_df.apply(lambda _: uuid.uuid4(), axis=1), False) + return id_field else: - core_df['id'] = core_df['id'].map(lambda _: uuid.uuid4()) + raise ValueError("core df should not contain id column") + # core_df['id'] = core_df['id'].map(lambda _: uuid.uuid4()) def _update_df(self, to_update_df, lookup_df, update_field, from_update_field): """Update a data frame via lookup @@ -115,7 +121,8 @@ def _update_df(self, to_update_df, lookup_df, update_field, from_update_field): return len(to_update_df.loc[exist]) - def _update_extension_ids(self, csv_content, core_df_content, link_col: list): + def _update_extension_ids(self, csv_content: pd.DataFrame, core_df_content: pd.DataFrame, + link_col: list) -> (pd.DataFrame, str): """Update the extension tables with (usually generated) identifiers from a core data frame. 
@@ -126,9 +133,12 @@ def _update_extension_ids(self, csv_content, core_df_content, link_col: list): :param csv_content: The extension to update :param core_df_content: The core data frame :param link_col: The columns that link the extension to the core + :return a tuple containing extension data frame containing the core id and the core id field """ - if 'coreid' in csv_content: - csv_content.pop('coreid') + ext_core_id_field: str = 'coreid' + + if ext_core_id_field in csv_content: + csv_content.pop(ext_core_id_field) # Having link_col as index and column raises ambiguous error in merge if (set(link_col).issubset(set(csv_content.columns.to_list())) and @@ -137,14 +147,24 @@ def _update_extension_ids(self, csv_content, core_df_content, link_col: list): csv_content = csv_content.merge(core_df_content.loc[:, 'id'], left_on=link_col, - right_on=link_col, how='inner') + right_on=link_col, how='outer') if 'id' in csv_content.columns.to_list(): + unmatched_content = csv_content[csv_content["id"].isnull()] + unmatched_content = unmatched_content.drop(columns=["id"]) + if len(unmatched_content) > 0: + log.info("There are orphaned keys in extension file") + pd.set_option("display.max_columns", 7) + pd.set_option('display.max_colwidth', 15) + pd.set_option('display.max_rows', 10) + log.info("\n%s", unmatched_content) + csv_content = csv_content[~csv_content["id"].isnull()] col = csv_content.pop('id') csv_content.insert(0, col.name, col) - csv_content.rename(columns={"id": "coreid"}, inplace=True) - - return csv_content + csv_content.rename(columns={"id": ext_core_id_field}, inplace=True) + return csv_content, ext_core_id_field + else: + raise ValueError("Something is not right. The core id failed to be created") def _update_associated_files(self, assoc_files: list[str]): """Update the internal list of additional files. 
@@ -183,6 +203,19 @@ def convert_values(v): invalid_values = self.defaults_prop.translate_table.keys() return self.defaults_prop.translate_table[v] if v in invalid_values else v + def _find_fields_with_zero_idx(meta_element_fields: list): + for field in meta_element_fields: + if field.index == "0": + return field + return None + + def _add_first_id_field_if_exists(meta_element: MetaElementAttributes): + zero_index_exist = _find_fields_with_zero_idx(meta_element.fields) + if meta_element.core_id and meta_element.core_id.index and not zero_index_exist: + return ["id"] if meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE else ["coreid"] + else: + return [] + with ZipFile(self.dwca_file_loc, 'r') as zf: files = zf.namelist() @@ -206,7 +239,8 @@ def convert_values(v): for meta_elm in self.meta_content.meta_elements: csv_file_name = meta_elm.meta_element_type.file_name with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file: - dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None] + dwc_headers = _add_first_id_field_if_exists(meta_elm) + dwc_headers.extend([f.field_name for f in meta_elm.fields if f.index is not None]) duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1] if len(duplicates) > 0: raise ValueError(f"Duplicate columns {duplicates} specified in the " @@ -313,18 +347,18 @@ def set_keys(self, keys: dict = None): set_keys = {} if keys and len(keys) > 0: for k, v in keys.items(): - dwca_content, _ = self.get_content(MetaElementTypes.get_element(k).row_type_ns) + contents = self.get_content(class_type=k) # If found then set the key for the content - if dwca_content: + for dwca_content, _ in contents: dwca_content.keys = [v] if isinstance(v, str) else v set_keys[k] = v return set_keys - def _update_meta_fields(self, content): + def _update_meta_fields(self, content: DfContent, key_field: str=None): """Update meta content fields by reading the content frame""" fields = 
self._read_header(content.df_content) - self.meta_content.update_meta_element(meta_element_info=content.meta_info, fields=fields) + self.meta_content.update_meta_element(meta_element_info=content.meta_info, fields=fields, index_field=key_field) def _filter_content(self, df_content, delta_df_content): """Filter delta content that is not already in the existing content @@ -374,95 +408,6 @@ def _merge_df_content(self, content, delta_content, keys, update=True): # return the merged content return self._add_new_rows(content.df_content, new_rows) - def _find_duplicate_columns(self, content): - """Find any duplicated columns in a content frame - - :param content: The content frame - :return: A list of duplicated fields - """ - all_columns = self._read_header(content.df_content) - sanitized_fields = self.meta_content.map_headers(all_columns) - list_fields = [f.field_name for f in sanitized_fields] - dup_fields = [item for item in set(list_fields) if list_fields.count(item) > 1] - if len(dup_fields) > 0: - log.error("Duplicate fields found: %s", ','.join(dup_fields)) - return dup_fields - - def merge_df_dwc_columns(self): - """Merge any duplicated columns in the core content. - - Note that only yhr core content is merged. - """ - content = self.core_content - dup_fields = self._find_duplicate_columns(content) - updated = False - - if len(dup_fields) > 0: - all_columns = self._read_header(content.df_content) - for dup in dup_fields: - columns = list(filter( - lambda term, dp_term=dup: re.fullmatch(pattern=f".*?{dp_term}", string=term), - all_columns)) - if dup in columns: - other_col = columns[1] - df = self.core_content.df_content - self.core_content.stat.set_update_stat(0) - self._update_column(df, dup, other_col, self.core_content.stat) - log.info("columns %s updated with values from %s. 
Here is the stat: %s", - dup, other_col, str(self.core_content.stat)) - log.info("column %s dropped", other_col) - updated = True - - if updated: - self._update_meta_fields(content) - - def _update_column(self, df, col, other_col, stat): - """Update a data frame, copying values from another column with non-null entries. - - Updates come from two sources: - - - A direct value from other_col - - An entry in `dynamicProperties` keyed to `other_col` - - :param df: The data frame to update - :param col: The column to update - :param other_col: The column to copy values from - :param stat: A statistics object to record updates - :return: - """ - # Step 1: if dcterms_xxx is not null, replace the dcterms_xxx into xxx field, - # also clean up the dynamic properties for the rows - to_update = df[other_col].notnull() # df[dup].isnull() & - df.loc[to_update, col] = df.loc[to_update, other_col] - stat.add_update_stat(len(df[to_update])) - log.info(df.loc[to_update, col]) - # Also cleanup the dynamicProperties - df.loc[to_update, 'dynamicProperties'] = df.loc[to_update, 'dynamicProperties'].str.replace( - f'(,?)("dcterms_{col}":".*?")(?=,?)', '', regex=True).str.replace('{,', '{', regex=True) - - # Step 2: Check if col value is still null. If null, extract from the dynamic properties - to_update = df[col].isnull() - df.loc[to_update, col] = df.loc[to_update, 'dynamicProperties'].\ - str.extract(rf'.*?"dcterms_{col}":"(.*?)"')[0] - df.loc[to_update, 'dynamicProperties'] = ( - df.loc[to_update, 'dynamicProperties'].str. - replace(f'(,?)("dcterms_{col}":".*?")(?=,?)', '', regex=True).str. - replace('{,', '{', regex=True)) - stat.add_update_stat(len(df[to_update])) - df.drop(columns=[other_col], inplace=True) - - def _regenerate_coreids(self): - """Rebuild core identifiers. - - Adds an `id` column if one does not exist and generates a UUID (4) for each core row. - Corresponding extension rows are updated to match. 
- """ - self.core_content.df_content['id'] = (self.core_content.df_content['id']. - map(lambda _: uuid.uuid4())) - for content in self.ext_content: - content.df_content = self._update_extension_ids( - content.df_content, self.core_content.df_content, self.core_content.keys) - def _build_index_for_content(self, df_content: pd.DataFrame, keys: list): """Update a data frame index with values from a list of key columns. @@ -479,10 +424,14 @@ def _extract_core_keys(self, core_content, keys): :return: A data frame indexed by the `id` column that contains the key elements for each record """ - columns = ['id'] - columns.extend(keys) - df = core_content[columns] - df.set_index('id', drop=True, inplace=True) + columns = ['id'] if "id" in core_content.columns.tolist() else [] + if all(key in core_content.columns for key in keys): + columns.extend(keys) + df = core_content[columns] + if "id" in core_content.columns.tolist(): + df.set_index('id', drop=True, inplace=True) + else: + raise ValueError(f"Keys does not exist in core content {''.join(keys)}") return df def _cleanup_keys(self, core_index_keys): @@ -496,13 +445,14 @@ def _cleanup_keys(self, core_index_keys): def build_indexes(self): """Build unique indexes, using the key terms for both core and extensions """ - core_index_keys = self._extract_core_keys(self.core_content.df_content, - self.core_content.keys) - for content in self.ext_content: - self._add_ext_lookup_key(content.df_content, core_index_keys, - self.core_content.keys, content.keys) + if len(self.ext_content) > 0: + core_index_keys = self._extract_core_keys(self.core_content.df_content, + self.core_content.keys) + for content in self.ext_content: + self._add_ext_lookup_key(content.df_content, core_index_keys, + self.core_content.keys, content.keys) - self._cleanup_keys(core_index_keys) + self._cleanup_keys(core_index_keys) self._build_index_for_content(self.core_content.df_content, self.core_content.keys) @@ -551,30 +501,31 @@ def delete_records(self, 
records_to_delete: CsvFileType): return self._build_index_for_content(delete_content, records_to_delete.keys) - dwca_content, core_or_ext = self.get_content( - MetaElementTypes.get_element(records_to_delete.type).row_type_ns) - log.info("Removing records from %s", core_or_ext) - if core_or_ext == CoreOrExtType.CORE: - self.core_content.keys = records_to_delete.keys - for ext in self.ext_content: - ext.keys = records_to_delete.keys - self.build_indexes() - else: - self._build_index_for_content(df_content=dwca_content.df_content, - keys=records_to_delete.keys) + contents = self.get_content(class_type=records_to_delete.type) + + for dwca_content, core_or_ext in contents: + log.info("Removing records from %s", core_or_ext) + if core_or_ext == CoreOrExtType.CORE: + self.core_content.keys = records_to_delete.keys + for ext in self.ext_content: + ext.keys = records_to_delete.keys + self.build_indexes() + else: + self._build_index_for_content(df_content=dwca_content.df_content, + keys=records_to_delete.keys) - log.info("Index built in %s. Starting deletion in core %s", - core_or_ext, records_to_delete.type) + log.info("Index built in %s. 
Starting deletion in core %s", + core_or_ext, records_to_delete.type) - self.core_content.df_content = self._delete_content(content=dwca_content, - delete_content=delete_content) + self.core_content.df_content = self._delete_content(content=dwca_content, + delete_content=delete_content) - # Remove the extension records that are related to the core records that have been removed - if core_or_ext == CoreOrExtType.CORE: - for ext in self.ext_content: - log.info("Removing records from ext: %s", ext.meta_info.type.name) - ext.df_content = self._delete_content(content=ext, - delete_content=delete_content) + # Remove the extension records that are related to the core records that have been removed + if core_or_ext == CoreOrExtType.CORE: + for ext in self.ext_content: + log.info("Removing records from ext: %s", ext.meta_info.type.name) + ext.df_content = self._delete_content(content=ext, + delete_content=delete_content) def _add_ext_lookup_key(self, df_content, core_df_content, core_keys, keys): """Add a lookup key to a data frame @@ -599,37 +550,32 @@ def _add_ext_lookup_key(self, df_content, core_df_content, core_keys, keys): # Extension Sync def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, - regen_ids: bool = False): + regen_ids: bool = False, match_by_filename: bool = False): """Merge the contents of this DwCA with a delta DwCA :param delta_dwca: The delta DwCA to apply :param extension_sync: refresh the extensions from delta dwca if the occurrences exist in both :param regen_ids: Regenerate unique identifiers for the records + :param match_by_filename: Match by filename of contents too """ self.build_indexes() delta_dwca.build_indexes() for _, delta_content in enumerate(delta_dwca.ext_content): - content, _ = self.get_content(delta_content.meta_info.type.row_type_ns) - if content: + contents = self.get_content(class_type=delta_content.meta_info.type, + file_name=delta_content.meta_info.file_name if match_by_filename else "") + for content, _ in 
contents: if extension_sync: self._delete_old_ext_records(content, self.core_content.df_content, delta_dwca.core_content.df_content, self.core_content.keys) - # create a copy of list - # Use keys other than coreid. Coreid should not be used as update keys if possible - ext_keys = [] - ext_keys.extend(self.core_content.keys) - update = False - if len(content.keys) > 0: - ext_keys.extend(content.keys) - update = True + content.df_content = self._merge_df_content(content, delta_content, - ext_keys, update) + self.core_content.keys) - else: - # Copy delta ext content into self ext content + if len (contents) == 0: + # Copy delta ext content into self ext content self.ext_content.append(delta_content) self._update_meta_fields(delta_content) @@ -643,21 +589,32 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, content.df_content = self._update_extension_ids( content.df_content, self.core_content.df_content, self.core_content.keys) - def get_content(self, name_space): + def get_content(self, class_type: MetaElementTypes = None, name_space: str = None, file_name: str = None): """Get the content based on the row type namespace. 
:param name_space: The row type (a namespace URI) :return: A tuple of the content data frame and whether it is a core or extension (None, None) if not found """ - if self.core_content.meta_info.type.row_type_ns == name_space: - return self.core_content, CoreOrExtType.CORE + def check_content(content, class_type, name_space): + if file_name and content.meta_info.file_name != file_name: + return False + + if ((class_type and content.meta_info.type == class_type) or + (name_space and content.meta_info.type.value == name_space)): + return True + return False + + contents = [] + + if check_content(self.core_content, class_type=class_type, name_space=name_space): + contents.append((self.core_content, CoreOrExtType.CORE)) for content in self.ext_content: - if content.meta_info.type.row_type_ns == name_space: - return content, CoreOrExtType.EXTENSION + if check_content(content, class_type=class_type, name_space=name_space): + contents.append((content, CoreOrExtType.EXTENSION)) - return None, None + return contents def add_multimedia_info_to_content(self, multimedia_content: DfContent): """ @@ -745,7 +702,11 @@ def _extract_media(self, content, assoc_media_col: str): :param assoc_media_col: The column that contains the associated media :return: The images data frame """ - image_df = pd.DataFrame(content[assoc_media_col]) + cols = [] + if not self.core_content.df_content.index.names[0]: + cols = self.core_content.keys.copy() + cols.append(assoc_media_col) + image_df = pd.DataFrame(content[cols]) # filter off empty rows with empty value image_df = image_df[~image_df[assoc_media_col].isna()] if len(image_df) > 0: @@ -770,9 +731,11 @@ def convert_associated_media_to_extension(self): assoc_media_col = filtered_column[0] image_df = self._extract_media(self.core_content.df_content, assoc_media_col) if len(image_df) > 0: - self._update_meta_fields(self.core_content) + self._update_meta_fields(content=self.core_content, key_field=self.core_content.keys[0]) log.info("%s associated 
media extracted", str(len(image_df))) - return CsvFileType(files=image_df, type='multimedia', keys=image_df.index.names) + return CsvFileType(files=image_df, type=MetaElementTypes.MULTIMEDIA, + keys=self.core_content.keys) + #keys=image_df.index.names) log.info("Nothing to extract from associated media") @@ -891,37 +854,43 @@ def _validate_columns(self, content): return True - def validate_content(self, content_type_to_validate: list[str] = None, error_file: str = None): - """Validate the content of the DwCA + def validate_content(self, content_to_validate: dict = None, error_file: str = None): + """Validate the content of the DwCA. Validates core content by default - No duplicate record keys - Valid columns + :param content_to_validate: content to validate :param error_file: A file to record errors :return: True if the DwCA is value, False otherwise """ - if not content_type_to_validate: - content_type_to_validate = [self.core_content.meta_info.type.name] + content_set_to_validate = {self.core_content.meta_info.type: self.core_content.keys} + if content_to_validate: + for class_type, content_key in content_to_validate.items(): + if type != self.core_content.meta_info.type: + content_set_to_validate[class_type] = content_key - for content_type in content_type_to_validate: - content, _ = self.get_content(MetaElementTypes.get_element(content_type).row_type_ns) - keys_df = self._extract_keys(content.df_content, content.keys) + for class_type, key in content_set_to_validate.items(): + contents = self.get_content(class_type=class_type) + for content, _ in contents: + keys_df = self._extract_keys(content.df_content, content.keys) - if not self.check_duplicates(keys_df, content.keys, error_file): - return False + if not self.check_duplicates(keys_df, content.keys, error_file): + return False - if not self._validate_columns(content): - return False + if not self._validate_columns(content): + return False return True def extract_csv_content(self, csv_info: CsvFileType, - 
core_ext_type: CoreOrExtType): + core_ext_type: CoreOrExtType, build_coreid_for_ext: bool = False): """Read the files from a CSV description into a content frame and include it in the Dwca. :param csv_info: The CSV file(s) :param core_ext_type: Whether this is a core or extension content frame + :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension """ if isinstance(csv_info.files, pd.DataFrame): csv_content = csv_info.files.copy(deep=True) @@ -930,22 +899,26 @@ def extract_csv_content(self, csv_info: CsvFileType, # Use default occurrenceID if not provided keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else 'occurrenceID' - if core_ext_type == CoreOrExtType.CORE: - self._update_core_ids(csv_content) - self._build_index_for_content(csv_content, keys) - else: - csv_content = self._update_extension_ids( - csv_content, self.core_content.df_content, keys) + core_id_field: str = None + if build_coreid_for_ext: + if len(keys) > 1: + if core_ext_type == CoreOrExtType.CORE: + core_id_field = self._update_core_ids(csv_content, keys) + self._build_index_for_content(csv_content, keys) + elif core_ext_type == CoreOrExtType.EXTENSION: + csv_content, core_id_field = self._update_extension_ids( + csv_content, self.core_content.df_content, keys) + elif len(keys) > 0: + core_id_field = keys[0] if csv_info.associated_files_loc: self._update_associated_files([csv_info.associated_files_loc]) - meta_type = MetaElementTypes.get_element(csv_info.type) meta_element_info = MetaElementInfo( - core_or_ext_type=core_ext_type, type=meta_type, + core_or_ext_type=core_ext_type, type=csv_info.type, csv_encoding=self.defaults_prop.csv_encoding, ignore_header_lines='1') content = DfContent(df_content=csv_content, meta_info=meta_element_info) - self._update_meta_fields(content) + self._update_meta_fields(content, core_id_field) if core_ext_type == CoreOrExtType.CORE: content.keys = keys diff --git a/src/dwcahandler/dwca/dwca_factory.py 
b/src/dwcahandler/dwca/dwca_factory.py index 0082b2a..14512ab 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -16,14 +16,14 @@ class DwcaHandler: @staticmethod - def list_dwc_terms() -> pd.DataFrame: - return Terms().dwc_terms_df + def list_dwc_terms() -> (pd.DataFrame, pd.DataFrame): + return Terms().terms_df, Terms().class_df """Perform various DwCA operations""" @staticmethod def create_dwca(core_csv: CsvFileType, - output_dwca_path: Union[str, BytesIO], + output_dwca: Union[str, BytesIO], ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, eml_content: Union[str, Eml] = ''): @@ -31,11 +31,11 @@ def create_dwca(core_csv: CsvFileType, :param core_csv: The core source :param ext_csv_list: A list of extension sources - :param output_dwca_path: Where to place the resulting Dwca + :param output_dwca: Where to place the resulting Dwca :param validate_content: Validate the DwCA before processing :param eml_content: eml content in string or Eml class """ - Dwca().create_dwca(core_csv=core_csv, ext_csv_list=ext_csv_list, output_dwca_path=output_dwca_path, + Dwca().create_dwca(core_csv=core_csv, ext_csv_list=ext_csv_list, output_dwca=output_dwca, validate_content=validate_content, eml_content=eml_content) @staticmethod diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 772a5b5..b1cc69e 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -9,85 +9,30 @@ import xml.etree.ElementTree as ET from xml.dom import minidom import re -from urllib.parse import urlparse + from dataclasses import dataclass, field, asdict -from typing import ClassVar from typing import Optional from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms +from enum import Enum -@dataclass -class Element: - """A mapping of a name to a URI, giving the class of a row type""" - name: str - row_type_ns: str - - -# noinspection SpellCheckingInspection -@dataclass 
-class MetaElementTypes: - """Named row types that map common DwCA row types onto URIs""" - occurrence: ClassVar[Element] = \ - Element("occurrence", "http://rs.tdwg.org/dwc/terms/Occurrence") - multimedia: ClassVar[Element] = \ - Element("multimedia", "http://rs.gbif.org/terms/1.0/Multimedia") - organism: ClassVar[Element] = \ - Element("organism", "http://rs.tdwg.org/dwc/terms/Organism") - materialsample: ClassVar[Element] = \ - Element("materialsample", "http://rs.tdwg.org/dwc/terms/MaterialSample") - location: ClassVar[Element] = \ - Element("location", "http://rs.tdwg.org/dwc/terms/Location") - event: ClassVar[Element] = \ - Element("event", "http://rs.tdwg.org/dwc/terms/Event") - taxon: ClassVar[Element] = \ - Element("taxon", "http://rs.tdwg.org/dwc/terms/Taxon") - measurementorfact: ClassVar[Element] = \ - Element("measurementorfact", "http://rs.tdwg.org/dwc/terms/MeasurementOrFact") - resourcerelationship: ClassVar[Element] = \ - Element("resourcerelationship", "http://rs.tdwg.org/dwc/terms/ResourceRelationship") - chronometricage: ClassVar[Element] = \ - Element("chronometricage", "http://rs.tdwg.org/dwc/terms/ChronometricAge") - - @staticmethod - def get_element(name: str): - """Find a row type by name +DwcClassRowTypes = Terms.get_class_row_types() - :param name: The row name - :return: The element corresponding to the row name - """ - try: - return MetaElementTypes.__dict__[name.lower()] - except KeyError: - return MetaElementTypes.get_element_by_row_type(name) +MetaElementTypes = Enum ("MetaElementTypes", dict(DwcClassRowTypes)) +class MetaElementTypes1: @staticmethod def get_element_by_row_type(row_type: str): - """Find a row type by URI + """ + Find a row type by URI :param row_type: The row type URI :return: The corresponding element """ - for elm in asdict(MetaElementTypes()).values(): - if elm['row_type_ns'] == row_type: - return MetaElementTypes.get_element(elm['name']) - - # For custom namespace - return 
Element(MetaElementTypes.extract_term(row_type), row_type) - - @staticmethod - def extract_term(term_string): - """Find a term name based on a term or a URI - - :param term_string: The term or URI - :return: The term name - """ - path_entity = urlparse(term_string) - path_str = path_entity.path - match = re.search(r'/([^/]*)$', path_str) - if match is not None: - return match[1] - - return term_string + for name, member in MetaElementTypes.__members__.items(): + if member.value == row_type: + return member + return None @dataclass @@ -96,7 +41,7 @@ class MetaElementInfo: the file is core or extension, the row type, the CSV encoding and the local file name for the table.""" core_or_ext_type: CoreOrExtType - type: Element + type: MetaElementTypes csv_encoding: CSVEncoding ignore_header_lines: str = '1' charset_encoding: str = 'UTF-8' @@ -104,15 +49,15 @@ class MetaElementInfo: def __post_init__(self): if not self.file_name: - self.file_name = f'{self.type.name}.csv' + self.file_name = f'{self.type.name.lower()}.txt' @dataclass class Field: """A field for a CSV file in a DwCA, mapping the CSV column onto a name or URI, with an optional default and vocabulary.""" - index: int - field_name: str + index: str = None + field_name: str = None term: Optional[str] = None default: Optional[str] = None vocabulary: Optional[str] = None @@ -122,6 +67,7 @@ class Field: class MetaElementAttributes: """A meta-description of a DwCA file""" meta_element_type: MetaElementInfo + core_id: Field fields: list[Field] = field(default_factory=list) @@ -143,9 +89,17 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type): def extract_field_attr_value(field_elm, attrib): return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None - def __find_id_in_fields(local_fields, id_field): - index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0" - return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), 
None) + def __get_element_by_row_type(row_type: str): + """ + Find a row type by URI + + :param row_type: The row type URI + :return: The corresponding element + """ + for name, member in MetaElementTypes.__members__.items(): + if member.value == row_type: + return member + return None fields = node_elm.findall(f'{ns}field') id_field = [] @@ -156,7 +110,7 @@ def __find_id_in_fields(local_fields, id_field): file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text meta_element_info = MetaElementInfo( core_or_ext_type=core_or_ext_type, - type=MetaElementTypes.get_element_by_row_type(node_elm.attrib['rowType']), + type=__get_element_by_row_type(node_elm.attrib['rowType']), csv_encoding=CSVEncoding( csv_delimiter=node_elm.attrib['fieldsTerminatedBy'], csv_eol=node_elm.attrib['linesTerminatedBy'], @@ -165,25 +119,20 @@ def __find_id_in_fields(local_fields, id_field): ignore_header_lines=node_elm.attrib['ignoreHeaderLines'], charset_encoding=node_elm.attrib['encoding'], file_name=file_name) - # set first field with index 0 if it's not present in list of fields - field_elm = __find_id_in_fields(fields, id_field) - if field_elm is None and len(id_field) > 0: - if CoreOrExtType.CORE == core_or_ext_type: - field_list = [Field(index=0, field_name="id")] - else: - field_list = [Field(index=0, field_name="coreid")] - else: - field_list = [] + + field_list = [] field_list.extend( - [Field(index=int(extract_field_attr_value(f, 'index')) + [Field(index=extract_field_attr_value(f, 'index') if extract_field_attr_value(f, 'index') else None, - field_name=MetaElementTypes.extract_term(extract_field_attr_value(f, 'term')), + field_name=Terms.extract_term(extract_field_attr_value(f, 'term')), term=extract_field_attr_value(f, 'term'), default=extract_field_attr_value(f, 'default'), vocabulary=extract_field_attr_value(f, 'vocabulary')) for i, f in enumerate(fields)]) + index_number = id_field[0].attrib["index"] if len(id_field) > 0 else None meta_element_attributes = \ - 
MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list) + MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list, + core_id=Field(index=index_number) if index_number else None) return meta_element_attributes def _get_namespace(self, element): @@ -230,26 +179,24 @@ def __get_terms(self, field_elm): return col_name if len(self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]) <= 0 \ else self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]['uri'].values[0] - def map_headers(self, headers: list[str], start_index: int = -1) -> list[Field]: + def map_headers(self, headers: list[str], index_field: str=None) -> (list[Field], Field): """Map header column names onto a list of fields. Column names are mapped onto fields based on name, URI or qualified name :param headers: The header list - :param start_index: The start index for the field index, as an offset from the header index + :param index_field: The id or coreid if any :return: The corresponding field list """ field_list: list[Field] = [] - index = 0 + id_index: Field = None for i, col in enumerate(headers): col_name = self.__remove_prefix(col) - if i == 0: - index = start_index if start_index > -1 else i - else: - index += 1 - field_elm = Field(index=index, field_name=col_name, term=self.__get_terms(col_name)) + field_elm = Field(index=str(i), field_name=col_name, term=self.__get_terms(col_name)) + if index_field and self.__remove_prefix(index_field) == col_name: + id_index = field_elm field_list.append(field_elm) - return field_list + return field_list, id_index def _extract_meta_element(self, file_name): for _, elm in enumerate(self.meta_elements): @@ -257,7 +204,7 @@ def _extract_meta_element(self, file_name): return elm return None - def update_meta_element(self, meta_element_info: MetaElementInfo, fields: list[str]): + def update_meta_element(self, meta_element_info: MetaElementInfo, fields: list[str], index_field: str = None): """Replace 
or append meta information (based on file name) :param meta_element_info: The info @@ -266,15 +213,17 @@ def update_meta_element(self, meta_element_info: MetaElementInfo, fields: list[s replace = False for i, elm in enumerate(self.meta_elements): if elm.meta_element_type.file_name == meta_element_info.file_name: - field_list: list[Field] = self.map_headers(fields, elm.fields[0].index) + (field_list, core_id) = self.map_headers(fields, index_field) + if not core_id: + core_id = elm.core_id self.meta_elements[i] = \ - MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list) + MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list, core_id=core_id) replace = True if not replace: - field_list: list[Field] = self.map_headers(fields) + (field_list, core_id) = self.map_headers(fields, index_field) self.meta_elements.append( - MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list)) + MetaElementAttributes(meta_element_type=meta_element_info, fields=field_list, core_id=core_id)) def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): """Build a core/extension row for `meta.xml` @@ -283,7 +232,7 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): """ elem = ET.SubElement(self.dwca_meta, meta_elem_attrib.meta_element_type.core_or_ext_type) elem.attrib['encoding'] = meta_elem_attrib.meta_element_type.charset_encoding - elem.attrib['rowType'] = meta_elem_attrib.meta_element_type.type.row_type_ns + elem.attrib['rowType'] = meta_elem_attrib.meta_element_type.type.value elem.attrib['fieldsTerminatedBy'] = meta_elem_attrib.meta_element_type.csv_encoding.csv_delimiter elem.attrib['linesTerminatedBy'] = \ "\\r\\n" if (meta_elem_attrib.meta_element_type.csv_encoding.csv_eol in ['\r\n', '\n', '\\n']) \ @@ -294,17 +243,24 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): files = ET.SubElement(elem, 'files') location = ET.SubElement(files, 'location') location.text = 
meta_elem_attrib.meta_element_type.file_name - id_field = ET.SubElement(elem, 'id') \ - if meta_elem_attrib.meta_element_type.core_or_ext_type == 'core' \ - else ET.SubElement(elem, 'coreid') - id_field.attrib['index'] = '0' + if meta_elem_attrib.core_id: + id_field = ET.SubElement(elem, 'id') \ + if meta_elem_attrib.meta_element_type.core_or_ext_type == 'core' \ + else ET.SubElement(elem, 'coreid') + id_field.attrib['index'] = meta_elem_attrib.core_id.index for _, f in enumerate(meta_elem_attrib.fields): if f.field_name not in ('id', 'coreid'): field_elem = ET.SubElement(elem, "field") - field_elem.attrib['index'] = str(f.index) + # Note: f.index is str type + if f.index is not None: + field_elem.attrib['index'] = f.index if f.term: field_elem.attrib['term'] = f.term + if f.vocabulary: + field_elem.attrib['vocabulary'] = f.vocabulary + if f.default: + field_elem.attrib['default'] = f.default def create(self): """Create a `meta.xml` file for this meta-infomation diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index c3d882c..2c4be39 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -4,6 +4,11 @@ import re import pandas as pd import logging as log +from urllib.parse import urlparse +from urllib.request import urlopen +from enum import Enum +from typing import NamedTuple +import requests this_dir, this_filename = os.path.split(__file__) @@ -18,49 +23,230 @@ def absolute_file_paths(directory): for dirpath, _, filenames in os.walk(directory): for f in filenames: if re.fullmatch(r'.+\..*', f): - yield os.path.abspath(os.path.join(dirpath, f)) + yield os.path.abspath(str(os.path.join(dirpath, f))) + + +class NsPrefix(Enum): + """ + Enumeration of class or terms prefix + """ + DWC = "dwc" + DC = "dc" + GBIF = "gbif" + OBIS = "obis" + +class ExtInfo(NamedTuple): + """ + Extension info + """ + uri: str + prefix: NsPrefix + namespace: str + +class GbifRegisteredExt(ExtInfo, Enum): + """ + Supported Gbif extensions. 
Add more extensions to expand the class row type and terms + """ + EXTENDED_MEASUREMENT_OR_FACT = ExtInfo(uri="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact", + prefix=NsPrefix.OBIS, + namespace="http://rs.iobis.org/obis/terms/") + SIMPLE_MULTIMEDIA = ExtInfo(uri="http://rs.gbif.org/terms/1.0/Multimedia", + prefix=NsPrefix.GBIF, + namespace="http://rs.gbif.org/terms/1.0/") @dataclass class Terms: - TERMS_DWC_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv" - DWC_FILENAME = 'darwin-core-terms.csv' - DUBLIN_CORE_FILENAME = 'dublin-core-terms.csv' + """ + Terms class to manage the terms and class row types used in the dwca + """ + + GBIF_EXT = "https://rs.gbif.org/extensions.json" + + GBIF_REGISTERED_EXTENSION = [e for e in GbifRegisteredExt] + + DWC_SOURCE_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv" + + TERMS_FILENAME = "terms.csv" + CLASS_ROW_TYPE = "class-rowtype.csv" + TERMS_DIR = os.path.join(this_dir, "terms") - DWC_FILE_PATH = os.path.join(TERMS_DIR, DWC_FILENAME) - DUBLIN_CORE_PATH = os.path.join(TERMS_DIR, DUBLIN_CORE_FILENAME) + TERMS_FILE_PATH = os.path.join(TERMS_DIR, TERMS_FILENAME) + CLASS_ROW_TYPE_PATH = os.path.join(TERMS_DIR, CLASS_ROW_TYPE) - terms_path: list[Path] = field(default_factory=lambda: [c for c in absolute_file_paths(Terms.TERMS_DIR)], - init=False) terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) - dwc_terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) + class_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) def __post_init__(self): - def _add_to_df(existing_df: pd.DataFrame, df: pd.DataFrame): - if not existing_df.empty: - return existing_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri']) - return df + self.terms_df = pd.read_csv(Terms.TERMS_FILE_PATH, dtype='str') + self.class_df = pd.read_csv(Terms.CLASS_ROW_TYPE_PATH, dtype='str') - for term_path in self.terms_path: 
- df = pd.read_csv(term_path, dtype='str') - self.terms_df = _add_to_df(self.terms_df, df) - if term_path == Terms.DWC_FILE_PATH or term_path == Terms.DUBLIN_CORE_PATH: - self.dwc_terms_df = _add_to_df(self.dwc_terms_df, df) + @staticmethod + def _update_class_csv(ns: NsPrefix, updates: pd.DataFrame): + """ + Update class rowtype by replacing all the rows by prefix. + + :param ns: Name prefix + :param updates: dataframe containing the class rows to update + """ + if len(updates) > 0 and "class_uri" in updates.columns.tolist(): + updates.insert(0, "class", + updates["class_uri"].apply(lambda x: + f"{Terms.extract_term(term_string = x, add_underscore = True).upper()}")) + updates["prefix"] = ns.value + + Terms._update_csv(ns, updates, True) + return updates + + @staticmethod + def _update_csv(ns: NsPrefix, updates: pd.DataFrame, is_class: bool=True): + """ + Update class rowtype or terms by replacing all the rows by prefix. + + :param ns: Name prefix + :param updates: dataframe containing the class rows or terms to update + :param is_class: True if it is a class rowtype. 
False if this is terms + """ + + col_list = ["prefix", "class", "class_uri"] if is_class else ["prefix", "term", "uri"] + file = Terms.CLASS_ROW_TYPE_PATH if is_class else Terms.TERMS_FILE_PATH + + if all(col in updates.columns.tolist() for col in col_list): + df = updates + if Path(file).is_file(): + df = pd.read_csv(file) + if len(df) > 0: + df = df[df["prefix"] != ns.value] + df = pd.concat([df, updates[col_list]], ignore_index=False) + + df.to_csv(file, index=False) + log.info("Rows updated in %s: %s of %s", Path(file).name, + len(updates), len(df)) + else: + log.info("No updates to class csv %s", Path(file).name) + + @staticmethod + def get_dwc_source_data() -> pd.DataFrame: + return pd.read_csv(Terms.DWC_SOURCE_URL, delimiter=",", encoding='utf-8', dtype='str') @staticmethod def update_dwc_terms(): """ Pull the latest terms from gbif dwc csv url and update the darwin core vocab terms in the package - Currently only support update darwin-core-terms.csv For reference: dublin-core-terms is derived from https://www.dublincore.org/specifications/dublin-core/dcmi-terms/ terms namespace - :return: dwc_dr dataframe containing the updated term + :return: dwc_df dataframe containing the updated dwc term + dwc_class_df dataframe containing the updated dwc class """ - df = pd.read_csv(Terms.TERMS_DWC_URL, delimiter=",", encoding='utf-8', dtype='str') + df = Terms.get_dwc_source_data() df = df[df["term_deprecated"].isnull()] dwc_df = pd.DataFrame() dwc_df['term'] = df['term_localName'] dwc_df['uri'] = df['term_isDefinedBy'] + df['term_localName'] - dwc_df.to_csv(Terms.DWC_FILE_PATH, index=False) + dwc_df["prefix"] = NsPrefix.DWC.value + + if len(dwc_df) > 0: + Terms._update_csv(NsPrefix.DWC, dwc_df, False) + + dwc_class_df = pd.DataFrame() + dwc_class_df["class_uri"] = df["tdwgutility_organizedInClass"].unique() + dwc_class_df = dwc_class_df[dwc_class_df["class_uri"].notna()] log.info("Total terms downloaded: %i", len(dwc_df)) - return 
dwc_df + log.info("Total class downloaded: %i", len(dwc_class_df)) + + if len(dwc_class_df) > 0: + dwc_class_df = Terms._update_class_csv(NsPrefix.DWC, dwc_class_df) + + return dwc_df, dwc_class_df + + @staticmethod + def extract_term(term_string, add_underscore: bool = False): + """ + Find a term name based on a term or a URI + + :param term_string: The term or URI + :param add_underscore: if true, adds _ to before capital letter before a camel case. + for eg: occurrenceID to occurrence_ID + :return: The term name + """ + path_entity = urlparse(term_string) + path_str = path_entity.path + match = re.search(r'/([^/]*)$', path_str) + if match is not None: + term = match[1] + word = re.sub(pattern="(?!^)([A-Z])", repl=r"_\1", string=term) if add_underscore else term + return word + + return term_string + + @staticmethod + def get_class_row_types(): + """ + This is called by the meta class row type to build the enumeration list + """ + class_df = pd.read_csv(Terms.CLASS_ROW_TYPE_PATH) + class_list = list(tuple(zip(class_df["class"], class_df["class_uri"]))) + return class_list + + @staticmethod + def update_gbif_ext(): + """ + Update the class row type and terms specified by GBIF_REGISTERED_EXTENSION and update by prefix + """ + def _get_latest(identifier: str): + d = requests.get(Terms.GBIF_EXT).json() + gbif_ext_df = pd.DataFrame.from_dict(d["extensions"]) + ext_df = gbif_ext_df[(gbif_ext_df["identifier"]==identifier) & (gbif_ext_df["isLatest"]==True)] + url: str = "" + if len(ext_df) > 0 and "url" in ext_df.columns.tolist(): + url = ext_df["url"].values[0] + return url + + def _extract_term_info(every_term: tuple) -> list: + def _extract_value(text: str): + return text.replace('\\',"").\ + replace('"',"").replace("'","").split("=")[1] + + term_name = _extract_value(every_term[0]) + namespace = _extract_value(every_term[1]) + uri = _extract_value(every_term[2]) + + return [term_name, namespace, uri] + + for supported_ext in Terms.GBIF_REGISTERED_EXTENSION: + url = 
_get_latest(supported_ext.uri) + if url: + update_class = pd.DataFrame([supported_ext.uri], columns=["class_uri"]) + Terms._update_class_csv(supported_ext.prefix, update_class) + + with urlopen(url) as f: + + xml_str = str(f.read()) + reg_exp = r'<property\s+(name=\S+)\s+.*?(namespace=\S+)\s+.*?(qualName=\S+)' + list_of_ns_terms = re.findall(reg_exp, xml_str) + log.info("List of terms found for %s: %d", url, len(list_of_ns_terms)) + + term_info = [] + for every_term in list_of_ns_terms: + term_info.append(_extract_term_info(every_term)) + + df = pd.DataFrame(term_info, columns=["term", "namespace", 'uri']) + std_ns = ["http://rs.tdwg.org/dwc/terms/", "http://purl.org/dc/terms/"] + existing_terms = Terms().terms_df + extra_terms_df = df[(df["namespace"].isin(std_ns)) & (~df["uri"].isin(existing_terms["uri"]))] + log.info("Additional standard terms found:\n%s", extra_terms_df) + new_terms = df[~df["uri"].isin(existing_terms["uri"])] + if len(new_terms) > 0: + new_terms["prefix"] = supported_ext.prefix.value + Terms._update_csv(supported_ext.prefix, new_terms, False) + + + @staticmethod + def update_terms(): + Terms.update_dwc_terms() + Terms.update_gbif_ext() + + + +#Terms.update_terms() diff --git a/src/dwcahandler/dwca/terms/class-rowtype.csv b/src/dwcahandler/dwca/terms/class-rowtype.csv new file mode 100644 index 0000000..d8f79ec --- /dev/null +++ b/src/dwcahandler/dwca/terms/class-rowtype.csv @@ -0,0 +1,14 @@ +prefix,class,class_uri +dwc,TAXON,http://rs.tdwg.org/dwc/terms/Taxon +dwc,OCCURRENCE,http://rs.tdwg.org/dwc/terms/Occurrence +dwc,ORGANISM,http://rs.tdwg.org/dwc/terms/Organism +dwc,MATERIAL_ENTITY,http://rs.tdwg.org/dwc/terms/MaterialEntity +dwc,GEOLOGICAL_CONTEXT,http://rs.tdwg.org/dwc/terms/GeologicalContext +dwc,LOCATION,http://purl.org/dc/terms/Location +dwc,IDENTIFICATION,http://rs.tdwg.org/dwc/terms/Identification +dwc,EVENT,http://rs.tdwg.org/dwc/terms/Event +dwc,MATERIAL_SAMPLE,http://rs.tdwg.org/dwc/terms/MaterialSample +dwc,MEASUREMENT_OR_FACT,http://rs.tdwg.org/dwc/terms/MeasurementOrFact 
+dwc,RESOURCE_RELATIONSHIP,http://rs.tdwg.org/dwc/terms/ResourceRelationship +obis,EXTENDED_MEASUREMENT_OR_FACT,http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact +gbif,MULTIMEDIA,http://rs.gbif.org/terms/1.0/Multimedia diff --git a/src/dwcahandler/dwca/terms/darwin-core-terms.csv b/src/dwcahandler/dwca/terms/darwin-core-terms.csv deleted file mode 100644 index e246f80..0000000 --- a/src/dwcahandler/dwca/terms/darwin-core-terms.csv +++ /dev/null @@ -1,215 +0,0 @@ -term,uri -acceptedNameUsage,http://rs.tdwg.org/dwc/terms/acceptedNameUsage -acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID -associatedMedia,http://rs.tdwg.org/dwc/terms/associatedMedia -associatedOccurrences,http://rs.tdwg.org/dwc/terms/associatedOccurrences -associatedOrganisms,http://rs.tdwg.org/dwc/terms/associatedOrganisms -associatedReferences,http://rs.tdwg.org/dwc/terms/associatedReferences -associatedSequences,http://rs.tdwg.org/dwc/terms/associatedSequences -associatedTaxa,http://rs.tdwg.org/dwc/terms/associatedTaxa -basisOfRecord,http://rs.tdwg.org/dwc/terms/basisOfRecord -bed,http://rs.tdwg.org/dwc/terms/bed -behavior,http://rs.tdwg.org/dwc/terms/behavior -caste,http://rs.tdwg.org/dwc/terms/caste -catalogNumber,http://rs.tdwg.org/dwc/terms/catalogNumber -class,http://rs.tdwg.org/dwc/terms/class -collectionCode,http://rs.tdwg.org/dwc/terms/collectionCode -collectionID,http://rs.tdwg.org/dwc/terms/collectionID -continent,http://rs.tdwg.org/dwc/terms/continent -coordinatePrecision,http://rs.tdwg.org/dwc/terms/coordinatePrecision -coordinateUncertaintyInMeters,http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters -country,http://rs.tdwg.org/dwc/terms/country -countryCode,http://rs.tdwg.org/dwc/terms/countryCode -county,http://rs.tdwg.org/dwc/terms/county -cultivarEpithet,http://rs.tdwg.org/dwc/terms/cultivarEpithet -dataGeneralizations,http://rs.tdwg.org/dwc/terms/dataGeneralizations -datasetID,http://rs.tdwg.org/dwc/terms/datasetID 
-datasetName,http://rs.tdwg.org/dwc/terms/datasetName -dateIdentified,http://rs.tdwg.org/dwc/terms/dateIdentified -day,http://rs.tdwg.org/dwc/terms/day -decimalLatitude,http://rs.tdwg.org/dwc/terms/decimalLatitude -decimalLongitude,http://rs.tdwg.org/dwc/terms/decimalLongitude -degreeOfEstablishment,http://rs.tdwg.org/dwc/terms/degreeOfEstablishment -disposition,http://rs.tdwg.org/dwc/terms/disposition -dynamicProperties,http://rs.tdwg.org/dwc/terms/dynamicProperties -earliestAgeOrLowestStage,http://rs.tdwg.org/dwc/terms/earliestAgeOrLowestStage -earliestEonOrLowestEonothem,http://rs.tdwg.org/dwc/terms/earliestEonOrLowestEonothem -earliestEpochOrLowestSeries,http://rs.tdwg.org/dwc/terms/earliestEpochOrLowestSeries -earliestEraOrLowestErathem,http://rs.tdwg.org/dwc/terms/earliestEraOrLowestErathem -earliestPeriodOrLowestSystem,http://rs.tdwg.org/dwc/terms/earliestPeriodOrLowestSystem -endDayOfYear,http://rs.tdwg.org/dwc/terms/endDayOfYear -establishmentMeans,http://rs.tdwg.org/dwc/terms/establishmentMeans -Event,http://rs.tdwg.org/dwc/terms/Event -eventDate,http://rs.tdwg.org/dwc/terms/eventDate -eventID,http://rs.tdwg.org/dwc/terms/eventID -eventRemarks,http://rs.tdwg.org/dwc/terms/eventRemarks -eventTime,http://rs.tdwg.org/dwc/terms/eventTime -eventType,http://rs.tdwg.org/dwc/terms/eventType -family,http://rs.tdwg.org/dwc/terms/family -fieldNotes,http://rs.tdwg.org/dwc/terms/fieldNotes -fieldNumber,http://rs.tdwg.org/dwc/terms/fieldNumber -footprintSpatialFit,http://rs.tdwg.org/dwc/terms/footprintSpatialFit -footprintSRS,http://rs.tdwg.org/dwc/terms/footprintSRS -footprintWKT,http://rs.tdwg.org/dwc/terms/footprintWKT -formation,http://rs.tdwg.org/dwc/terms/formation -FossilSpecimen,http://rs.tdwg.org/dwc/terms/FossilSpecimen -genericName,http://rs.tdwg.org/dwc/terms/genericName -genus,http://rs.tdwg.org/dwc/terms/genus -geodeticDatum,http://rs.tdwg.org/dwc/terms/geodeticDatum -GeologicalContext,http://rs.tdwg.org/dwc/terms/GeologicalContext 
-geologicalContextID,http://rs.tdwg.org/dwc/terms/geologicalContextID -georeferencedBy,http://rs.tdwg.org/dwc/terms/georeferencedBy -georeferencedDate,http://rs.tdwg.org/dwc/terms/georeferencedDate -georeferenceProtocol,http://rs.tdwg.org/dwc/terms/georeferenceProtocol -georeferenceRemarks,http://rs.tdwg.org/dwc/terms/georeferenceRemarks -georeferenceSources,http://rs.tdwg.org/dwc/terms/georeferenceSources -georeferenceVerificationStatus,http://rs.tdwg.org/dwc/terms/georeferenceVerificationStatus -group,http://rs.tdwg.org/dwc/terms/group -habitat,http://rs.tdwg.org/dwc/terms/habitat -higherClassification,http://rs.tdwg.org/dwc/terms/higherClassification -higherGeography,http://rs.tdwg.org/dwc/terms/higherGeography -higherGeographyID,http://rs.tdwg.org/dwc/terms/higherGeographyID -highestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/highestBiostratigraphicZone -HumanObservation,http://rs.tdwg.org/dwc/terms/HumanObservation -Identification,http://rs.tdwg.org/dwc/terms/Identification -identificationID,http://rs.tdwg.org/dwc/terms/identificationID -identificationQualifier,http://rs.tdwg.org/dwc/terms/identificationQualifier -identificationReferences,http://rs.tdwg.org/dwc/terms/identificationReferences -identificationRemarks,http://rs.tdwg.org/dwc/terms/identificationRemarks -identificationVerificationStatus,http://rs.tdwg.org/dwc/terms/identificationVerificationStatus -identifiedBy,http://rs.tdwg.org/dwc/terms/identifiedBy -identifiedByID,http://rs.tdwg.org/dwc/terms/identifiedByID -individualCount,http://rs.tdwg.org/dwc/terms/individualCount -informationWithheld,http://rs.tdwg.org/dwc/terms/informationWithheld -infragenericEpithet,http://rs.tdwg.org/dwc/terms/infragenericEpithet -infraspecificEpithet,http://rs.tdwg.org/dwc/terms/infraspecificEpithet -institutionCode,http://rs.tdwg.org/dwc/terms/institutionCode -institutionID,http://rs.tdwg.org/dwc/terms/institutionID -island,http://rs.tdwg.org/dwc/terms/island -islandGroup,http://rs.tdwg.org/dwc/terms/islandGroup 
-kingdom,http://rs.tdwg.org/dwc/terms/kingdom -latestAgeOrHighestStage,http://rs.tdwg.org/dwc/terms/latestAgeOrHighestStage -latestEonOrHighestEonothem,http://rs.tdwg.org/dwc/terms/latestEonOrHighestEonothem -latestEpochOrHighestSeries,http://rs.tdwg.org/dwc/terms/latestEpochOrHighestSeries -latestEraOrHighestErathem,http://rs.tdwg.org/dwc/terms/latestEraOrHighestErathem -latestPeriodOrHighestSystem,http://rs.tdwg.org/dwc/terms/latestPeriodOrHighestSystem -lifeStage,http://rs.tdwg.org/dwc/terms/lifeStage -lithostratigraphicTerms,http://rs.tdwg.org/dwc/terms/lithostratigraphicTerms -LivingSpecimen,http://rs.tdwg.org/dwc/terms/LivingSpecimen -locality,http://rs.tdwg.org/dwc/terms/locality -locationAccordingTo,http://rs.tdwg.org/dwc/terms/locationAccordingTo -locationID,http://rs.tdwg.org/dwc/terms/locationID -locationRemarks,http://rs.tdwg.org/dwc/terms/locationRemarks -lowestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/lowestBiostratigraphicZone -MachineObservation,http://rs.tdwg.org/dwc/terms/MachineObservation -MaterialCitation,http://rs.tdwg.org/dwc/terms/MaterialCitation -MaterialEntity,http://rs.tdwg.org/dwc/terms/MaterialEntity -materialEntityID,http://rs.tdwg.org/dwc/terms/materialEntityID -materialEntityRemarks,http://rs.tdwg.org/dwc/terms/materialEntityRemarks -MaterialSample,http://rs.tdwg.org/dwc/terms/MaterialSample -materialSampleID,http://rs.tdwg.org/dwc/terms/materialSampleID -maximumDepthInMeters,http://rs.tdwg.org/dwc/terms/maximumDepthInMeters -maximumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/maximumDistanceAboveSurfaceInMeters -maximumElevationInMeters,http://rs.tdwg.org/dwc/terms/maximumElevationInMeters -measurementAccuracy,http://rs.tdwg.org/dwc/terms/measurementAccuracy -measurementDeterminedBy,http://rs.tdwg.org/dwc/terms/measurementDeterminedBy -measurementDeterminedDate,http://rs.tdwg.org/dwc/terms/measurementDeterminedDate -measurementID,http://rs.tdwg.org/dwc/terms/measurementID 
-measurementMethod,http://rs.tdwg.org/dwc/terms/measurementMethod -MeasurementOrFact,http://rs.tdwg.org/dwc/terms/MeasurementOrFact -measurementRemarks,http://rs.tdwg.org/dwc/terms/measurementRemarks -measurementType,http://rs.tdwg.org/dwc/terms/measurementType -measurementUnit,http://rs.tdwg.org/dwc/terms/measurementUnit -measurementValue,http://rs.tdwg.org/dwc/terms/measurementValue -member,http://rs.tdwg.org/dwc/terms/member -minimumDepthInMeters,http://rs.tdwg.org/dwc/terms/minimumDepthInMeters -minimumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/minimumDistanceAboveSurfaceInMeters -minimumElevationInMeters,http://rs.tdwg.org/dwc/terms/minimumElevationInMeters -month,http://rs.tdwg.org/dwc/terms/month -municipality,http://rs.tdwg.org/dwc/terms/municipality -nameAccordingTo,http://rs.tdwg.org/dwc/terms/nameAccordingTo -nameAccordingToID,http://rs.tdwg.org/dwc/terms/nameAccordingToID -namePublishedIn,http://rs.tdwg.org/dwc/terms/namePublishedIn -namePublishedInID,http://rs.tdwg.org/dwc/terms/namePublishedInID -namePublishedInYear,http://rs.tdwg.org/dwc/terms/namePublishedInYear -nomenclaturalCode,http://rs.tdwg.org/dwc/terms/nomenclaturalCode -nomenclaturalStatus,http://rs.tdwg.org/dwc/terms/nomenclaturalStatus -Occurrence,http://rs.tdwg.org/dwc/terms/Occurrence -occurrenceID,http://rs.tdwg.org/dwc/terms/occurrenceID -occurrenceRemarks,http://rs.tdwg.org/dwc/terms/occurrenceRemarks -occurrenceStatus,http://rs.tdwg.org/dwc/terms/occurrenceStatus -order,http://rs.tdwg.org/dwc/terms/order -Organism,http://rs.tdwg.org/dwc/terms/Organism -organismID,http://rs.tdwg.org/dwc/terms/organismID -organismName,http://rs.tdwg.org/dwc/terms/organismName -organismQuantity,http://rs.tdwg.org/dwc/terms/organismQuantity -organismQuantityType,http://rs.tdwg.org/dwc/terms/organismQuantityType -organismRemarks,http://rs.tdwg.org/dwc/terms/organismRemarks -organismScope,http://rs.tdwg.org/dwc/terms/organismScope 
-originalNameUsage,http://rs.tdwg.org/dwc/terms/originalNameUsage -originalNameUsageID,http://rs.tdwg.org/dwc/terms/originalNameUsageID -otherCatalogNumbers,http://rs.tdwg.org/dwc/terms/otherCatalogNumbers -ownerInstitutionCode,http://rs.tdwg.org/dwc/terms/ownerInstitutionCode -parentEventID,http://rs.tdwg.org/dwc/terms/parentEventID -parentMeasurementID,http://rs.tdwg.org/dwc/terms/parentMeasurementID -parentNameUsage,http://rs.tdwg.org/dwc/terms/parentNameUsage -parentNameUsageID,http://rs.tdwg.org/dwc/terms/parentNameUsageID -pathway,http://rs.tdwg.org/dwc/terms/pathway -phylum,http://rs.tdwg.org/dwc/terms/phylum -pointRadiusSpatialFit,http://rs.tdwg.org/dwc/terms/pointRadiusSpatialFit -preparations,http://rs.tdwg.org/dwc/terms/preparations -PreservedSpecimen,http://rs.tdwg.org/dwc/terms/PreservedSpecimen -previousIdentifications,http://rs.tdwg.org/dwc/terms/previousIdentifications -recordedBy,http://rs.tdwg.org/dwc/terms/recordedBy -recordedByID,http://rs.tdwg.org/dwc/terms/recordedByID -recordNumber,http://rs.tdwg.org/dwc/terms/recordNumber -relatedResourceID,http://rs.tdwg.org/dwc/terms/relatedResourceID -relationshipAccordingTo,http://rs.tdwg.org/dwc/terms/relationshipAccordingTo -relationshipEstablishedDate,http://rs.tdwg.org/dwc/terms/relationshipEstablishedDate -relationshipOfResource,http://rs.tdwg.org/dwc/terms/relationshipOfResource -relationshipOfResourceID,http://rs.tdwg.org/dwc/terms/relationshipOfResourceID -relationshipRemarks,http://rs.tdwg.org/dwc/terms/relationshipRemarks -reproductiveCondition,http://rs.tdwg.org/dwc/terms/reproductiveCondition -resourceID,http://rs.tdwg.org/dwc/terms/resourceID -ResourceRelationship,http://rs.tdwg.org/dwc/terms/ResourceRelationship -resourceRelationshipID,http://rs.tdwg.org/dwc/terms/resourceRelationshipID -sampleSizeUnit,http://rs.tdwg.org/dwc/terms/sampleSizeUnit -sampleSizeValue,http://rs.tdwg.org/dwc/terms/sampleSizeValue -samplingEffort,http://rs.tdwg.org/dwc/terms/samplingEffort 
-samplingProtocol,http://rs.tdwg.org/dwc/terms/samplingProtocol -scientificName,http://rs.tdwg.org/dwc/terms/scientificName -scientificNameAuthorship,http://rs.tdwg.org/dwc/terms/scientificNameAuthorship -scientificNameID,http://rs.tdwg.org/dwc/terms/scientificNameID -sex,http://rs.tdwg.org/dwc/terms/sex -specificEpithet,http://rs.tdwg.org/dwc/terms/specificEpithet -startDayOfYear,http://rs.tdwg.org/dwc/terms/startDayOfYear -stateProvince,http://rs.tdwg.org/dwc/terms/stateProvince -subfamily,http://rs.tdwg.org/dwc/terms/subfamily -subgenus,http://rs.tdwg.org/dwc/terms/subgenus -subtribe,http://rs.tdwg.org/dwc/terms/subtribe -superfamily,http://rs.tdwg.org/dwc/terms/superfamily -Taxon,http://rs.tdwg.org/dwc/terms/Taxon -taxonConceptID,http://rs.tdwg.org/dwc/terms/taxonConceptID -taxonID,http://rs.tdwg.org/dwc/terms/taxonID -taxonomicStatus,http://rs.tdwg.org/dwc/terms/taxonomicStatus -taxonRank,http://rs.tdwg.org/dwc/terms/taxonRank -taxonRemarks,http://rs.tdwg.org/dwc/terms/taxonRemarks -tribe,http://rs.tdwg.org/dwc/terms/tribe -typeStatus,http://rs.tdwg.org/dwc/terms/typeStatus -verbatimCoordinates,http://rs.tdwg.org/dwc/terms/verbatimCoordinates -verbatimCoordinateSystem,http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem -verbatimDepth,http://rs.tdwg.org/dwc/terms/verbatimDepth -verbatimElevation,http://rs.tdwg.org/dwc/terms/verbatimElevation -verbatimEventDate,http://rs.tdwg.org/dwc/terms/verbatimEventDate -verbatimIdentification,http://rs.tdwg.org/dwc/terms/verbatimIdentification -verbatimLabel,http://rs.tdwg.org/dwc/terms/verbatimLabel -verbatimLatitude,http://rs.tdwg.org/dwc/terms/verbatimLatitude -verbatimLocality,http://rs.tdwg.org/dwc/terms/verbatimLocality -verbatimLongitude,http://rs.tdwg.org/dwc/terms/verbatimLongitude -verbatimSRS,http://rs.tdwg.org/dwc/terms/verbatimSRS -verbatimTaxonRank,http://rs.tdwg.org/dwc/terms/verbatimTaxonRank -vernacularName,http://rs.tdwg.org/dwc/terms/vernacularName 
-verticalDatum,http://rs.tdwg.org/dwc/terms/verticalDatum -vitality,http://rs.tdwg.org/dwc/terms/vitality -waterBody,http://rs.tdwg.org/dwc/terms/waterBody -year,http://rs.tdwg.org/dwc/terms/year diff --git a/src/dwcahandler/dwca/terms/dublin-core-terms.csv b/src/dwcahandler/dwca/terms/dublin-core-terms.csv deleted file mode 100644 index fa9100a..0000000 --- a/src/dwcahandler/dwca/terms/dublin-core-terms.csv +++ /dev/null @@ -1,56 +0,0 @@ -term,uri -abstract,http://purl.org/dc/terms/abstract -accessRights,http://purl.org/dc/terms/accessRights -accrualMethod,http://purl.org/dc/terms/accrualMethod -accrualPeriodicity,http://purl.org/dc/terms/accrualPeriodicity -accrualPolicy,http://purl.org/dc/terms/accrualPolicy -alternative,http://purl.org/dc/terms/alternative -audience,http://purl.org/dc/terms/audience -available,http://purl.org/dc/terms/available -bibliographicCitation,http://purl.org/dc/terms/bibliographicCitation -conformsTo,http://purl.org/dc/terms/conformsTo -contributor,http://purl.org/dc/terms/contributor -coverage,http://purl.org/dc/terms/coverage -created,http://purl.org/dc/terms/created -creator,http://purl.org/dc/terms/creator -date,http://purl.org/dc/terms/date -dateAccepted,http://purl.org/dc/terms/dateAccepted -dateCopyrighted,http://purl.org/dc/terms/dateCopyrighted -dateSubmitted,http://purl.org/dc/terms/dateSubmitted -description,http://purl.org/dc/terms/description -educationLevel,http://purl.org/dc/terms/educationLevel -extent,http://purl.org/dc/terms/extent -format,http://purl.org/dc/terms/format -hasFormat,http://purl.org/dc/terms/hasFormat -hasPart,http://purl.org/dc/terms/hasPart -hasVersion,http://purl.org/dc/terms/hasVersion -identifier,http://purl.org/dc/terms/identifier -instructionalMethod,http://purl.org/dc/terms/instructionalMethod -isFormatOf,http://purl.org/dc/terms/isFormatOf -isPartOf,http://purl.org/dc/terms/isPartOf -isReferencedBy,http://purl.org/dc/terms/isReferencedBy -isReplacedBy,http://purl.org/dc/terms/isReplacedBy 
-isRequiredBy,http://purl.org/dc/terms/isRequiredBy -issued,http://purl.org/dc/terms/issued -isVersionOf,http://purl.org/dc/terms/isVersionOf -language,http://purl.org/dc/terms/language -license,http://purl.org/dc/terms/license -mediator,http://purl.org/dc/terms/mediator -medium,http://purl.org/dc/terms/medium -modified,http://purl.org/dc/terms/modified -provenance,http://purl.org/dc/terms/provenance -publisher,http://purl.org/dc/terms/publisher -references,http://purl.org/dc/terms/references -relation,http://purl.org/dc/terms/relation -replaces,http://purl.org/dc/terms/replaces -requires,http://purl.org/dc/terms/requires -rights,http://purl.org/dc/terms/rights -rightsHolder,http://purl.org/dc/terms/rightsHolder -source,http://purl.org/dc/terms/source -spatial,http://purl.org/dc/terms/spatial -subject,http://purl.org/dc/terms/subject -tableOfContents,http://purl.org/dc/terms/tableOfContents -temporal,http://purl.org/dc/terms/temporal -title,http://purl.org/dc/terms/title -type,http://purl.org/dc/terms/type -valid,http://purl.org/dc/terms/valid \ No newline at end of file diff --git a/src/dwcahandler/dwca/terms/terms.csv b/src/dwcahandler/dwca/terms/terms.csv new file mode 100644 index 0000000..6db290e --- /dev/null +++ b/src/dwcahandler/dwca/terms/terms.csv @@ -0,0 +1,273 @@ +prefix,term,uri +dc,abstract,http://purl.org/dc/terms/abstract +dc,accessRights,http://purl.org/dc/terms/accessRights +dc,accrualMethod,http://purl.org/dc/terms/accrualMethod +dc,accrualPeriodicity,http://purl.org/dc/terms/accrualPeriodicity +dc,accrualPolicy,http://purl.org/dc/terms/accrualPolicy +dc,alternative,http://purl.org/dc/terms/alternative +dc,audience,http://purl.org/dc/terms/audience +dc,available,http://purl.org/dc/terms/available +dc,bibliographicCitation,http://purl.org/dc/terms/bibliographicCitation +dc,conformsTo,http://purl.org/dc/terms/conformsTo +dc,contributor,http://purl.org/dc/terms/contributor +dc,coverage,http://purl.org/dc/terms/coverage 
+dc,created,http://purl.org/dc/terms/created +dc,creator,http://purl.org/dc/terms/creator +dc,date,http://purl.org/dc/terms/date +dc,dateAccepted,http://purl.org/dc/terms/dateAccepted +dc,dateCopyrighted,http://purl.org/dc/terms/dateCopyrighted +dc,dateSubmitted,http://purl.org/dc/terms/dateSubmitted +dc,description,http://purl.org/dc/terms/description +dc,educationLevel,http://purl.org/dc/terms/educationLevel +dc,extent,http://purl.org/dc/terms/extent +dc,format,http://purl.org/dc/terms/format +dc,hasFormat,http://purl.org/dc/terms/hasFormat +dc,hasPart,http://purl.org/dc/terms/hasPart +dc,hasVersion,http://purl.org/dc/terms/hasVersion +dc,identifier,http://purl.org/dc/terms/identifier +dc,instructionalMethod,http://purl.org/dc/terms/instructionalMethod +dc,isFormatOf,http://purl.org/dc/terms/isFormatOf +dc,isPartOf,http://purl.org/dc/terms/isPartOf +dc,isReferencedBy,http://purl.org/dc/terms/isReferencedBy +dc,isReplacedBy,http://purl.org/dc/terms/isReplacedBy +dc,isRequiredBy,http://purl.org/dc/terms/isRequiredBy +dc,issued,http://purl.org/dc/terms/issued +dc,isVersionOf,http://purl.org/dc/terms/isVersionOf +dc,language,http://purl.org/dc/terms/language +dc,license,http://purl.org/dc/terms/license +dc,mediator,http://purl.org/dc/terms/mediator +dc,medium,http://purl.org/dc/terms/medium +dc,modified,http://purl.org/dc/terms/modified +dc,provenance,http://purl.org/dc/terms/provenance +dc,publisher,http://purl.org/dc/terms/publisher +dc,references,http://purl.org/dc/terms/references +dc,relation,http://purl.org/dc/terms/relation +dc,replaces,http://purl.org/dc/terms/replaces +dc,requires,http://purl.org/dc/terms/requires +dc,rights,http://purl.org/dc/terms/rights +dc,rightsHolder,http://purl.org/dc/terms/rightsHolder +dc,source,http://purl.org/dc/terms/source +dc,spatial,http://purl.org/dc/terms/spatial +dc,subject,http://purl.org/dc/terms/subject +dc,tableOfContents,http://purl.org/dc/terms/tableOfContents +dc,temporal,http://purl.org/dc/terms/temporal 
+dc,title,http://purl.org/dc/terms/title +dc,type,http://purl.org/dc/terms/type +dc,valid,http://purl.org/dc/terms/valid +dwc,acceptedNameUsage,http://rs.tdwg.org/dwc/terms/acceptedNameUsage +dwc,acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID +dwc,associatedMedia,http://rs.tdwg.org/dwc/terms/associatedMedia +dwc,associatedOccurrences,http://rs.tdwg.org/dwc/terms/associatedOccurrences +dwc,associatedOrganisms,http://rs.tdwg.org/dwc/terms/associatedOrganisms +dwc,associatedReferences,http://rs.tdwg.org/dwc/terms/associatedReferences +dwc,associatedSequences,http://rs.tdwg.org/dwc/terms/associatedSequences +dwc,associatedTaxa,http://rs.tdwg.org/dwc/terms/associatedTaxa +dwc,basisOfRecord,http://rs.tdwg.org/dwc/terms/basisOfRecord +dwc,bed,http://rs.tdwg.org/dwc/terms/bed +dwc,behavior,http://rs.tdwg.org/dwc/terms/behavior +dwc,caste,http://rs.tdwg.org/dwc/terms/caste +dwc,catalogNumber,http://rs.tdwg.org/dwc/terms/catalogNumber +dwc,class,http://rs.tdwg.org/dwc/terms/class +dwc,collectionCode,http://rs.tdwg.org/dwc/terms/collectionCode +dwc,collectionID,http://rs.tdwg.org/dwc/terms/collectionID +dwc,continent,http://rs.tdwg.org/dwc/terms/continent +dwc,coordinatePrecision,http://rs.tdwg.org/dwc/terms/coordinatePrecision +dwc,coordinateUncertaintyInMeters,http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters +dwc,country,http://rs.tdwg.org/dwc/terms/country +dwc,countryCode,http://rs.tdwg.org/dwc/terms/countryCode +dwc,county,http://rs.tdwg.org/dwc/terms/county +dwc,cultivarEpithet,http://rs.tdwg.org/dwc/terms/cultivarEpithet +dwc,dataGeneralizations,http://rs.tdwg.org/dwc/terms/dataGeneralizations +dwc,datasetID,http://rs.tdwg.org/dwc/terms/datasetID +dwc,datasetName,http://rs.tdwg.org/dwc/terms/datasetName +dwc,dateIdentified,http://rs.tdwg.org/dwc/terms/dateIdentified +dwc,day,http://rs.tdwg.org/dwc/terms/day +dwc,decimalLatitude,http://rs.tdwg.org/dwc/terms/decimalLatitude 
+dwc,decimalLongitude,http://rs.tdwg.org/dwc/terms/decimalLongitude +dwc,degreeOfEstablishment,http://rs.tdwg.org/dwc/terms/degreeOfEstablishment +dwc,disposition,http://rs.tdwg.org/dwc/terms/disposition +dwc,dynamicProperties,http://rs.tdwg.org/dwc/terms/dynamicProperties +dwc,earliestAgeOrLowestStage,http://rs.tdwg.org/dwc/terms/earliestAgeOrLowestStage +dwc,earliestEonOrLowestEonothem,http://rs.tdwg.org/dwc/terms/earliestEonOrLowestEonothem +dwc,earliestEpochOrLowestSeries,http://rs.tdwg.org/dwc/terms/earliestEpochOrLowestSeries +dwc,earliestEraOrLowestErathem,http://rs.tdwg.org/dwc/terms/earliestEraOrLowestErathem +dwc,earliestPeriodOrLowestSystem,http://rs.tdwg.org/dwc/terms/earliestPeriodOrLowestSystem +dwc,endDayOfYear,http://rs.tdwg.org/dwc/terms/endDayOfYear +dwc,establishmentMeans,http://rs.tdwg.org/dwc/terms/establishmentMeans +dwc,Event,http://rs.tdwg.org/dwc/terms/Event +dwc,eventDate,http://rs.tdwg.org/dwc/terms/eventDate +dwc,eventID,http://rs.tdwg.org/dwc/terms/eventID +dwc,eventRemarks,http://rs.tdwg.org/dwc/terms/eventRemarks +dwc,eventTime,http://rs.tdwg.org/dwc/terms/eventTime +dwc,eventType,http://rs.tdwg.org/dwc/terms/eventType +dwc,family,http://rs.tdwg.org/dwc/terms/family +dwc,fieldNotes,http://rs.tdwg.org/dwc/terms/fieldNotes +dwc,fieldNumber,http://rs.tdwg.org/dwc/terms/fieldNumber +dwc,footprintSpatialFit,http://rs.tdwg.org/dwc/terms/footprintSpatialFit +dwc,footprintSRS,http://rs.tdwg.org/dwc/terms/footprintSRS +dwc,footprintWKT,http://rs.tdwg.org/dwc/terms/footprintWKT +dwc,formation,http://rs.tdwg.org/dwc/terms/formation +dwc,FossilSpecimen,http://rs.tdwg.org/dwc/terms/FossilSpecimen +dwc,genericName,http://rs.tdwg.org/dwc/terms/genericName +dwc,genus,http://rs.tdwg.org/dwc/terms/genus +dwc,geodeticDatum,http://rs.tdwg.org/dwc/terms/geodeticDatum +dwc,GeologicalContext,http://rs.tdwg.org/dwc/terms/GeologicalContext +dwc,geologicalContextID,http://rs.tdwg.org/dwc/terms/geologicalContextID 
+dwc,georeferencedBy,http://rs.tdwg.org/dwc/terms/georeferencedBy +dwc,georeferencedDate,http://rs.tdwg.org/dwc/terms/georeferencedDate +dwc,georeferenceProtocol,http://rs.tdwg.org/dwc/terms/georeferenceProtocol +dwc,georeferenceRemarks,http://rs.tdwg.org/dwc/terms/georeferenceRemarks +dwc,georeferenceSources,http://rs.tdwg.org/dwc/terms/georeferenceSources +dwc,georeferenceVerificationStatus,http://rs.tdwg.org/dwc/terms/georeferenceVerificationStatus +dwc,group,http://rs.tdwg.org/dwc/terms/group +dwc,habitat,http://rs.tdwg.org/dwc/terms/habitat +dwc,higherClassification,http://rs.tdwg.org/dwc/terms/higherClassification +dwc,higherGeography,http://rs.tdwg.org/dwc/terms/higherGeography +dwc,higherGeographyID,http://rs.tdwg.org/dwc/terms/higherGeographyID +dwc,highestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/highestBiostratigraphicZone +dwc,HumanObservation,http://rs.tdwg.org/dwc/terms/HumanObservation +dwc,Identification,http://rs.tdwg.org/dwc/terms/Identification +dwc,identificationID,http://rs.tdwg.org/dwc/terms/identificationID +dwc,identificationQualifier,http://rs.tdwg.org/dwc/terms/identificationQualifier +dwc,identificationReferences,http://rs.tdwg.org/dwc/terms/identificationReferences +dwc,identificationRemarks,http://rs.tdwg.org/dwc/terms/identificationRemarks +dwc,identificationVerificationStatus,http://rs.tdwg.org/dwc/terms/identificationVerificationStatus +dwc,identifiedBy,http://rs.tdwg.org/dwc/terms/identifiedBy +dwc,identifiedByID,http://rs.tdwg.org/dwc/terms/identifiedByID +dwc,individualCount,http://rs.tdwg.org/dwc/terms/individualCount +dwc,informationWithheld,http://rs.tdwg.org/dwc/terms/informationWithheld +dwc,infragenericEpithet,http://rs.tdwg.org/dwc/terms/infragenericEpithet +dwc,infraspecificEpithet,http://rs.tdwg.org/dwc/terms/infraspecificEpithet +dwc,institutionCode,http://rs.tdwg.org/dwc/terms/institutionCode +dwc,institutionID,http://rs.tdwg.org/dwc/terms/institutionID +dwc,island,http://rs.tdwg.org/dwc/terms/island 
+dwc,islandGroup,http://rs.tdwg.org/dwc/terms/islandGroup +dwc,kingdom,http://rs.tdwg.org/dwc/terms/kingdom +dwc,latestAgeOrHighestStage,http://rs.tdwg.org/dwc/terms/latestAgeOrHighestStage +dwc,latestEonOrHighestEonothem,http://rs.tdwg.org/dwc/terms/latestEonOrHighestEonothem +dwc,latestEpochOrHighestSeries,http://rs.tdwg.org/dwc/terms/latestEpochOrHighestSeries +dwc,latestEraOrHighestErathem,http://rs.tdwg.org/dwc/terms/latestEraOrHighestErathem +dwc,latestPeriodOrHighestSystem,http://rs.tdwg.org/dwc/terms/latestPeriodOrHighestSystem +dwc,lifeStage,http://rs.tdwg.org/dwc/terms/lifeStage +dwc,lithostratigraphicTerms,http://rs.tdwg.org/dwc/terms/lithostratigraphicTerms +dwc,LivingSpecimen,http://rs.tdwg.org/dwc/terms/LivingSpecimen +dwc,locality,http://rs.tdwg.org/dwc/terms/locality +dwc,locationAccordingTo,http://rs.tdwg.org/dwc/terms/locationAccordingTo +dwc,locationID,http://rs.tdwg.org/dwc/terms/locationID +dwc,locationRemarks,http://rs.tdwg.org/dwc/terms/locationRemarks +dwc,lowestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/lowestBiostratigraphicZone +dwc,MachineObservation,http://rs.tdwg.org/dwc/terms/MachineObservation +dwc,MaterialCitation,http://rs.tdwg.org/dwc/terms/MaterialCitation +dwc,MaterialEntity,http://rs.tdwg.org/dwc/terms/MaterialEntity +dwc,materialEntityID,http://rs.tdwg.org/dwc/terms/materialEntityID +dwc,materialEntityRemarks,http://rs.tdwg.org/dwc/terms/materialEntityRemarks +dwc,MaterialSample,http://rs.tdwg.org/dwc/terms/MaterialSample +dwc,materialSampleID,http://rs.tdwg.org/dwc/terms/materialSampleID +dwc,maximumDepthInMeters,http://rs.tdwg.org/dwc/terms/maximumDepthInMeters +dwc,maximumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/maximumDistanceAboveSurfaceInMeters +dwc,maximumElevationInMeters,http://rs.tdwg.org/dwc/terms/maximumElevationInMeters +dwc,measurementAccuracy,http://rs.tdwg.org/dwc/terms/measurementAccuracy +dwc,measurementDeterminedBy,http://rs.tdwg.org/dwc/terms/measurementDeterminedBy 
+dwc,measurementDeterminedDate,http://rs.tdwg.org/dwc/terms/measurementDeterminedDate +dwc,measurementID,http://rs.tdwg.org/dwc/terms/measurementID +dwc,measurementMethod,http://rs.tdwg.org/dwc/terms/measurementMethod +dwc,MeasurementOrFact,http://rs.tdwg.org/dwc/terms/MeasurementOrFact +dwc,measurementRemarks,http://rs.tdwg.org/dwc/terms/measurementRemarks +dwc,measurementType,http://rs.tdwg.org/dwc/terms/measurementType +dwc,measurementUnit,http://rs.tdwg.org/dwc/terms/measurementUnit +dwc,measurementValue,http://rs.tdwg.org/dwc/terms/measurementValue +dwc,member,http://rs.tdwg.org/dwc/terms/member +dwc,minimumDepthInMeters,http://rs.tdwg.org/dwc/terms/minimumDepthInMeters +dwc,minimumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/minimumDistanceAboveSurfaceInMeters +dwc,minimumElevationInMeters,http://rs.tdwg.org/dwc/terms/minimumElevationInMeters +dwc,month,http://rs.tdwg.org/dwc/terms/month +dwc,municipality,http://rs.tdwg.org/dwc/terms/municipality +dwc,nameAccordingTo,http://rs.tdwg.org/dwc/terms/nameAccordingTo +dwc,nameAccordingToID,http://rs.tdwg.org/dwc/terms/nameAccordingToID +dwc,namePublishedIn,http://rs.tdwg.org/dwc/terms/namePublishedIn +dwc,namePublishedInID,http://rs.tdwg.org/dwc/terms/namePublishedInID +dwc,namePublishedInYear,http://rs.tdwg.org/dwc/terms/namePublishedInYear +dwc,nomenclaturalCode,http://rs.tdwg.org/dwc/terms/nomenclaturalCode +dwc,nomenclaturalStatus,http://rs.tdwg.org/dwc/terms/nomenclaturalStatus +dwc,Occurrence,http://rs.tdwg.org/dwc/terms/Occurrence +dwc,occurrenceID,http://rs.tdwg.org/dwc/terms/occurrenceID +dwc,occurrenceRemarks,http://rs.tdwg.org/dwc/terms/occurrenceRemarks +dwc,occurrenceStatus,http://rs.tdwg.org/dwc/terms/occurrenceStatus +dwc,order,http://rs.tdwg.org/dwc/terms/order +dwc,Organism,http://rs.tdwg.org/dwc/terms/Organism +dwc,organismID,http://rs.tdwg.org/dwc/terms/organismID +dwc,organismName,http://rs.tdwg.org/dwc/terms/organismName 
+dwc,organismQuantity,http://rs.tdwg.org/dwc/terms/organismQuantity +dwc,organismQuantityType,http://rs.tdwg.org/dwc/terms/organismQuantityType +dwc,organismRemarks,http://rs.tdwg.org/dwc/terms/organismRemarks +dwc,organismScope,http://rs.tdwg.org/dwc/terms/organismScope +dwc,originalNameUsage,http://rs.tdwg.org/dwc/terms/originalNameUsage +dwc,originalNameUsageID,http://rs.tdwg.org/dwc/terms/originalNameUsageID +dwc,otherCatalogNumbers,http://rs.tdwg.org/dwc/terms/otherCatalogNumbers +dwc,ownerInstitutionCode,http://rs.tdwg.org/dwc/terms/ownerInstitutionCode +dwc,parentEventID,http://rs.tdwg.org/dwc/terms/parentEventID +dwc,parentMeasurementID,http://rs.tdwg.org/dwc/terms/parentMeasurementID +dwc,parentNameUsage,http://rs.tdwg.org/dwc/terms/parentNameUsage +dwc,parentNameUsageID,http://rs.tdwg.org/dwc/terms/parentNameUsageID +dwc,pathway,http://rs.tdwg.org/dwc/terms/pathway +dwc,phylum,http://rs.tdwg.org/dwc/terms/phylum +dwc,pointRadiusSpatialFit,http://rs.tdwg.org/dwc/terms/pointRadiusSpatialFit +dwc,preparations,http://rs.tdwg.org/dwc/terms/preparations +dwc,PreservedSpecimen,http://rs.tdwg.org/dwc/terms/PreservedSpecimen +dwc,previousIdentifications,http://rs.tdwg.org/dwc/terms/previousIdentifications +dwc,recordedBy,http://rs.tdwg.org/dwc/terms/recordedBy +dwc,recordedByID,http://rs.tdwg.org/dwc/terms/recordedByID +dwc,recordNumber,http://rs.tdwg.org/dwc/terms/recordNumber +dwc,relatedResourceID,http://rs.tdwg.org/dwc/terms/relatedResourceID +dwc,relationshipAccordingTo,http://rs.tdwg.org/dwc/terms/relationshipAccordingTo +dwc,relationshipEstablishedDate,http://rs.tdwg.org/dwc/terms/relationshipEstablishedDate +dwc,relationshipOfResource,http://rs.tdwg.org/dwc/terms/relationshipOfResource +dwc,relationshipOfResourceID,http://rs.tdwg.org/dwc/terms/relationshipOfResourceID +dwc,relationshipRemarks,http://rs.tdwg.org/dwc/terms/relationshipRemarks +dwc,reproductiveCondition,http://rs.tdwg.org/dwc/terms/reproductiveCondition 
+dwc,resourceID,http://rs.tdwg.org/dwc/terms/resourceID +dwc,ResourceRelationship,http://rs.tdwg.org/dwc/terms/ResourceRelationship +dwc,resourceRelationshipID,http://rs.tdwg.org/dwc/terms/resourceRelationshipID +dwc,sampleSizeUnit,http://rs.tdwg.org/dwc/terms/sampleSizeUnit +dwc,sampleSizeValue,http://rs.tdwg.org/dwc/terms/sampleSizeValue +dwc,samplingEffort,http://rs.tdwg.org/dwc/terms/samplingEffort +dwc,samplingProtocol,http://rs.tdwg.org/dwc/terms/samplingProtocol +dwc,scientificName,http://rs.tdwg.org/dwc/terms/scientificName +dwc,scientificNameAuthorship,http://rs.tdwg.org/dwc/terms/scientificNameAuthorship +dwc,scientificNameID,http://rs.tdwg.org/dwc/terms/scientificNameID +dwc,sex,http://rs.tdwg.org/dwc/terms/sex +dwc,specificEpithet,http://rs.tdwg.org/dwc/terms/specificEpithet +dwc,startDayOfYear,http://rs.tdwg.org/dwc/terms/startDayOfYear +dwc,stateProvince,http://rs.tdwg.org/dwc/terms/stateProvince +dwc,subfamily,http://rs.tdwg.org/dwc/terms/subfamily +dwc,subgenus,http://rs.tdwg.org/dwc/terms/subgenus +dwc,subtribe,http://rs.tdwg.org/dwc/terms/subtribe +dwc,superfamily,http://rs.tdwg.org/dwc/terms/superfamily +dwc,Taxon,http://rs.tdwg.org/dwc/terms/Taxon +dwc,taxonConceptID,http://rs.tdwg.org/dwc/terms/taxonConceptID +dwc,taxonID,http://rs.tdwg.org/dwc/terms/taxonID +dwc,taxonomicStatus,http://rs.tdwg.org/dwc/terms/taxonomicStatus +dwc,taxonRank,http://rs.tdwg.org/dwc/terms/taxonRank +dwc,taxonRemarks,http://rs.tdwg.org/dwc/terms/taxonRemarks +dwc,tribe,http://rs.tdwg.org/dwc/terms/tribe +dwc,typeStatus,http://rs.tdwg.org/dwc/terms/typeStatus +dwc,verbatimCoordinates,http://rs.tdwg.org/dwc/terms/verbatimCoordinates +dwc,verbatimCoordinateSystem,http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem +dwc,verbatimDepth,http://rs.tdwg.org/dwc/terms/verbatimDepth +dwc,verbatimElevation,http://rs.tdwg.org/dwc/terms/verbatimElevation +dwc,verbatimEventDate,http://rs.tdwg.org/dwc/terms/verbatimEventDate 
+dwc,verbatimIdentification,http://rs.tdwg.org/dwc/terms/verbatimIdentification +dwc,verbatimLabel,http://rs.tdwg.org/dwc/terms/verbatimLabel +dwc,verbatimLatitude,http://rs.tdwg.org/dwc/terms/verbatimLatitude +dwc,verbatimLocality,http://rs.tdwg.org/dwc/terms/verbatimLocality +dwc,verbatimLongitude,http://rs.tdwg.org/dwc/terms/verbatimLongitude +dwc,verbatimSRS,http://rs.tdwg.org/dwc/terms/verbatimSRS +dwc,verbatimTaxonRank,http://rs.tdwg.org/dwc/terms/verbatimTaxonRank +dwc,vernacularName,http://rs.tdwg.org/dwc/terms/vernacularName +dwc,verticalDatum,http://rs.tdwg.org/dwc/terms/verticalDatum +dwc,vitality,http://rs.tdwg.org/dwc/terms/vitality +dwc,waterBody,http://rs.tdwg.org/dwc/terms/waterBody +dwc,year,http://rs.tdwg.org/dwc/terms/year +obis,measurementTypeID,http://rs.iobis.org/obis/terms/measurementTypeID +obis,measurementValueID,http://rs.iobis.org/obis/terms/measurementValueID +obis,measurementUnitID,http://rs.iobis.org/obis/terms/measurementUnitID diff --git a/tests/__init__.py b/tests/__init__.py index bbc9fb2..aa50483 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -16,32 +16,38 @@ def get_eml_content(): return eml.build_eml_xml() -def make_fields(columns: list, term_uri: str): - fields = '' +def make_fields(columns: list, term_uri: str, field_start: int = 0, core_id: str = None): + fields = "" + idx_start = 0 + if field_start != -1: + fields = core_id if core_id else "" + idx_start = field_start if field_start != -2 else 0 + + for idx, col in enumerate(columns): - dwc_term_uri = "http://rs.tdwg.org/dwc/terms" if col == 'occurrenceID' else term_uri - field = f'' - fields = field if idx == 0 else fields + '\n' + field + if not (col in ["id", "coreid"]): + dwc_term_uri = "http://rs.tdwg.org/dwc/terms" if col == 'occurrenceID' else term_uri + fields = fields + '\n' + f'' + return fields -def make_ext_str(ext_columns: list, term_uri: str): +def make_ext_str(ext_columns: list, term_uri: str, field_start: int, use_col_idx_as_core_id: int): 
ext_meta_str = '' - fields = make_fields(ext_columns, term_uri) + fields = make_fields(ext_columns, term_uri, field_start, f'') if fields: ext_meta_str = f''' multimedia.csv - {fields} ''' return ext_meta_str -def make_meta_xml_str(core_df: pd.DataFrame, ext_df: pd.DataFrame = None) -> str: +def make_meta_xml_str(core_df: pd.DataFrame, ext_df: pd.DataFrame = None, use_col_idx_as_core_id: int = None) -> str: """ Create a meta xml string based on the core and extension dataframe This meta xml is based on occurrence core and optional multimedia ext @@ -50,8 +56,12 @@ def make_meta_xml_str(core_df: pd.DataFrame, ext_df: pd.DataFrame = None) -> str :return: str """ core_columns = core_df.columns.to_list() - fields = make_fields(core_columns, "http://rs.tdwg.org/dwc/terms") - ext_str = make_ext_str(ext_df.columns.to_list(), "http://purl.org/dc/terms") \ + field_start = use_col_idx_as_core_id #1 if any(x for x in core_columns if x in ["id", "coreid"]) else use_col_idx_as_core_id + id_idx = use_col_idx_as_core_id if use_col_idx_as_core_id >= 0 else 0 + fields = make_fields(core_columns, "http://rs.tdwg.org/dwc/terms", field_start, + f'') + ext_str = make_ext_str(ext_df.columns.to_list(), "http://purl.org/dc/terms", + field_start, id_idx) \ if isinstance(ext_df, pd.DataFrame) else '' meta_xml_str = f''' @@ -59,14 +69,13 @@ def make_meta_xml_str(core_df: pd.DataFrame, ext_df: pd.DataFrame = None) -> str occurrence.csv - {fields} {ext_str} ''' return meta_xml_str -def make_dwca(core_content: pd.DataFrame, ext_mult_content: pd.DataFrame = None) -> BytesIO: +def make_dwca(core_content: pd.DataFrame, ext_mult_content: pd.DataFrame = None, use_col_idx_as_core_id: int = -1) -> BytesIO: """ Create a darwin core archive in memory for testing :param: core_df dataframe for occurrence core @@ -74,15 +83,15 @@ def make_dwca(core_content: pd.DataFrame, ext_mult_content: pd.DataFrame = None) :return: BytesIO """ zip_buffer = BytesIO() - meta_xml_str = make_meta_xml_str(core_content, 
ext_mult_content) + meta_xml_str = make_meta_xml_str(core_content, ext_mult_content, use_col_idx_as_core_id) content = core_content.copy(deep=True) - content.insert(loc=0, column='id', value=content['occurrenceID']) + with ZipFile(file=zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED, allowZip64=True) as zf: zf.writestr(zinfo_or_arcname='occurrence.csv', data=content.to_csv(header=True, quoting=csv.QUOTE_MINIMAL, index=False)) if isinstance(ext_mult_content, pd.DataFrame): multimedia_content = ext_mult_content.copy(deep=True) - multimedia_content.insert(loc=0, column='coreid', value=content['occurrenceID']) + zf.writestr(zinfo_or_arcname='multimedia.csv', data=multimedia_content.to_csv(header=True, quoting=csv.QUOTE_MINIMAL, index=False)) zf.writestr(zinfo_or_arcname='eml.xml', @@ -97,3 +106,14 @@ def remove_pretty_print_xml(input_xml): output_xml = ''.join([line.strip() for line in _dom.toxml().splitlines()]) _dom.unlink() return output_xml + + +from dwcahandler import MetaDwCA +from io import BytesIO + +def get_xml_from_file(expected_file: str): + dwca_meta = MetaDwCA() + dwca_meta.read_meta_file (meta_file=expected_file) + dwca_meta.create() + expected_str = str(dwca_meta) + return expected_str diff --git a/tests/input_files/event/cameratrap-sample1/event.txt b/tests/input_files/event/cameratrap-sample1/event.txt new file mode 100644 index 0000000..c4093b3 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample1/event.txt @@ -0,0 +1,18 @@ +eventID,parentEventID,eventType,eventDate,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingProtocol,samplingEffort,deploymentGroups,locality,eventRemarks,habitat +Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,,Survey,,,,,,,,,, +Danbulla_NP_2022,,Survey,,,,,,,,,, +CFRAG_01_bush_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait | tags: bait:none | 
Swapped low NIMH batteries for alkaline. Access behind large tree fall gap after following water pipes. & SD card = 34A & physical ID on cam = 7 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest +CFRAG_01_road_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait near hiking trail | tags: bait:none | & SD card = Oo2 & physical ID on cam = 31 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest +DBNP_07_road_20221005,Danbulla_NP_2022,Deployment,,-17.0900405,145.625301,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait near hiking trail | tags: bait:none | SD card reader didn't work, couldn't check test photos. To access site, turn left onto informal track when heading up main track and it turns right, then walk 60 m & animal signs are: Bird calls",tropical rainforest +DBNP_08_bush_20221005,Danbulla_NP_2022,Deployment,,-17.0812305,145.616759,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait | tags: bait:none | Access via ridge line from hiking track where road cam is, turn left before ridge declines. Couldn't check test images. 
& animal signs are: NA",tropical rainforest +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt b/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt new file mode 100644 index 0000000..65aa801 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt 
@@ -0,0 +1,21 @@ +eventID,measurementID,measurementType,measurementValue,measurementAccuracy,measurementUnit,measurementDeterminedDate,measurementDeterminedBy,measurementRemarks +CFRAG_02_bush_20221215,11,cameraID,2065144,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,12,cameraModel,Reconyx,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,13,cameraDelay,0,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,14,cameraHeight,0.3,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,15,cameraTilt,0,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_01_road_20221215,6,cameraID,2065144,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,7,cameraModel,Reconyx,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,8,cameraDelay,0,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,9,cameraHeight,0.3,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,10,cameraTilt,0,,,2023-03-06T13:20:18Z,ZA SK, +DBNP_07_road_20221005,226,cameraID,2065149,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,227,cameraModel,Hawkray,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,228,cameraDelay,5,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,229,cameraHeight,0.3,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,230,cameraTilt,0,,,2022-12-11T05:35:55Z,ZA, +DBNP_08_bush_20221005,231,cameraID,2065149,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,232,cameraModel,Hawkray,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,233,cameraDelay,5,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,234,cameraHeight,0.3,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,235,cameraTilt,0,,,2023-01-05T06:16:48Z,ZA, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample1/meta.xml b/tests/input_files/event/cameratrap-sample1/meta.xml new file mode 100644 index 0000000..bdbe739 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample1/meta.xml @@ -0,0 +1,60 @@ + + + + event.txt + + + + + + + + + + + + + + + + + + + occurrence.txt + + + + + + + + + + + + + + + + + + + + 
+ + + + measurement_or_fact.txt + + + + + + + + + + + + + diff --git a/tests/input_files/event/cameratrap-sample1/occurrence.txt b/tests/input_files/event/cameratrap-sample1/occurrence.txt new file mode 100644 index 0000000..337b5b3 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample1/occurrence.txt @@ -0,0 +1,12 @@ +eventID,occurrenceID,mediaID,scientificName,individualCount,lifeStage,sex,behavior,identifiedBy,occurrenceRemarks,occurrenceStatus,samplingProtocol,basisOfRecord,kingdom,classificationProbability,identificationRemarks,samplingEffort,eventDate +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215_Z_Amir_observationID_58,CFRAG_02_bush_20221215_Z_Amir_mediaID_332,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215_Z_Amir_observationID_59,CFRAG_02_bush_20221215_Z_Amir_mediaID_337,Megapodius reinwardt,2,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 +CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215_Z_Amir_observationID_212,CFRAG_01_road_20221215_Z_Amir_mediaID_1630,Alectura lathami,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 +CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215_Z_Amir_observationID_483,CFRAG_01_road_20221215_Z_Amir_mediaID_3057,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 
+CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215_Z_Amir_observationID_496,CFRAG_01_road_20221215_Z_Amir_mediaID_3137,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 +DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005_Z_Amir_observationID_1170,DBNP_07_road_20221005_Z_Amir_mediaID_5187,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 +DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005_Z_Amir_observationID_1110,DBNP_07_road_20221005_Z_Amir_mediaID_4992,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 +DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005_Z_Amir_observationID_1111,DBNP_07_road_20221005_Z_Amir_mediaID_4995,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005_Z_Amir_observationID_537,DBNP_08_bush_20221005_Z_Amir_mediaID_1656,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005_Z_Amir_observationID_538,DBNP_08_bush_20221005_Z_Amir_mediaID_1662,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 
100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005_Z_Amir_observationID_539,DBNP_08_bush_20221005_Z_Amir_mediaID_1677,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample2/event.txt b/tests/input_files/event/cameratrap-sample2/event.txt new file mode 100644 index 0000000..c4093b3 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample2/event.txt @@ -0,0 +1,18 @@ +eventID,parentEventID,eventType,eventDate,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingProtocol,samplingEffort,deploymentGroups,locality,eventRemarks,habitat +Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,,Survey,,,,,,,,,, +Danbulla_NP_2022,,Survey,,,,,,,,,, +CFRAG_01_bush_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait | tags: bait:none | Swapped low NIMH batteries for alkaline. Access behind large tree fall gap after following water pipes. 
& SD card = 34A & physical ID on cam = 7 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest +CFRAG_01_road_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait near hiking trail | tags: bait:none | & SD card = Oo2 & physical ID on cam = 31 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest +DBNP_07_road_20221005,Danbulla_NP_2022,Deployment,,-17.0900405,145.625301,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait near hiking trail | tags: bait:none | SD card reader didn't work, couldn't check test photos. To access site, turn left onto informal track when heading up main track and it turns right, then walk 60 m & animal signs are: Bird calls",tropical rainforest +DBNP_08_bush_20221005,Danbulla_NP_2022,Deployment,,-17.0812305,145.616759,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait | tags: bait:none | Access via ridge line from hiking track where road cam is, turn left before ridge declines. Couldn't check test images. 
& animal signs are: NA",tropical rainforest +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, +CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, +DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, +DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt b/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt new file mode 100644 index 0000000..ca62325 --- /dev/null +++ 
b/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt @@ -0,0 +1,21 @@ +eventID,measurementID,measurementType,measurementTypeID,measurementValue,measurementAccuracy,measurementUnit,measurementUnitID,measurementDeterminedDate,measurementDeterminedBy,measurementRemarks +CFRAG_02_bush_20221215,11,cameraID,cameraID_CFRAG_02_bush_20221215_3,2065144,,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,12,cameraModel,cameraModel_CFRAG_02_bush_20221215_3,Reconyx,,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,13,cameraDelay,cameraDelay_CFRAG_02_bush_20221215_3,0,,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,14,cameraHeight,cameraHeight_CFRAG_02_bush_20221215_3,0.3,,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_02_bush_20221215,15,cameraTilt,cameraTilt_CFRAG_02_bush_20221215_3,0,,,,2023-02-11T23:58:40Z,ZA SK, +CFRAG_01_road_20221215,6,cameraID,cameraID_CFRAG_01_road_20221215_2,2065144,,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,7,cameraModel,cameraModel_CFRAG_01_road_20221215_2,Reconyx,,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,8,cameraDelay,cameraDelay_CFRAG_01_road_20221215_2,0,,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,9,cameraHeight,cameraHeight_CFRAG_01_road_20221215_2,0.3,,,,2023-03-06T13:20:18Z,ZA SK, +CFRAG_01_road_20221215,10,cameraTilt,cameraTilt_CFRAG_01_road_20221215_2,0,,,,2023-03-06T13:20:18Z,ZA SK, +DBNP_07_road_20221005,226,cameraID,cameraID_DBNP_07_road_20221005_46,2065149,,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,227,cameraModel,cameraModel_DBNP_07_road_20221005_46,Hawkray,,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,228,cameraDelay,cameraDelay_DBNP_07_road_20221005_46,5,,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,229,cameraHeight,cameraHeight_DBNP_07_road_20221005_46,0.3,,,,2022-12-11T05:35:55Z,ZA, +DBNP_07_road_20221005,230,cameraTilt,cameraTilt_DBNP_07_road_20221005_46,0,,,,2022-12-11T05:35:55Z,ZA, 
+DBNP_08_bush_20221005,231,cameraID,cameraID_DBNP_08_bush_20221005_47,2065149,,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,232,cameraModel,cameraModel_DBNP_08_bush_20221005_47,Hawkray,,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,233,cameraDelay,cameraDelay_DBNP_08_bush_20221005_47,5,,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,234,cameraHeight,cameraHeight_DBNP_08_bush_20221005_47,0.3,,,,2023-01-05T06:16:48Z,ZA, +DBNP_08_bush_20221005,235,cameraTilt,cameraTilt_DBNP_08_bush_20221005_47,0,,,,2023-01-05T06:16:48Z,ZA, diff --git a/tests/input_files/event/cameratrap-sample2/meta.xml b/tests/input_files/event/cameratrap-sample2/meta.xml new file mode 100644 index 0000000..1c3bb86 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample2/meta.xml @@ -0,0 +1,62 @@ + + + + event.txt + + + + + + + + + + + + + + + + + + + occurrence.txt + + + + + + + + + + + + + + + + + + + + + + + + extended_measurement_or_fact.txt + + + + + + + + + + + + + + + diff --git a/tests/input_files/event/cameratrap-sample2/occurrence.txt b/tests/input_files/event/cameratrap-sample2/occurrence.txt new file mode 100644 index 0000000..337b5b3 --- /dev/null +++ b/tests/input_files/event/cameratrap-sample2/occurrence.txt @@ -0,0 +1,12 @@ +eventID,occurrenceID,mediaID,scientificName,individualCount,lifeStage,sex,behavior,identifiedBy,occurrenceRemarks,occurrenceStatus,samplingProtocol,basisOfRecord,kingdom,classificationProbability,identificationRemarks,samplingEffort,eventDate +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215_Z_Amir_observationID_58,CFRAG_02_bush_20221215_Z_Amir_mediaID_332,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 +CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215_Z_Amir_observationID_59,CFRAG_02_bush_20221215_Z_Amir_mediaID_337,Megapodius reinwardt,2,,,,Zachary 
Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 +CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215_Z_Amir_observationID_212,CFRAG_01_road_20221215_Z_Amir_mediaID_1630,Alectura lathami,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 +CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215_Z_Amir_observationID_483,CFRAG_01_road_20221215_Z_Amir_mediaID_3057,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 +CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215_Z_Amir_observationID_496,CFRAG_01_road_20221215_Z_Amir_mediaID_3137,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 +DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005_Z_Amir_observationID_1170,DBNP_07_road_20221005_Z_Amir_mediaID_5187,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 +DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005_Z_Amir_observationID_1110,DBNP_07_road_20221005_Z_Amir_mediaID_4992,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 +DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005_Z_Amir_observationID_1111,DBNP_07_road_20221005_Z_Amir_mediaID_4995,Heteromyias 
cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005_Z_Amir_observationID_537,DBNP_08_bush_20221005_Z_Amir_mediaID_1656,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005_Z_Amir_observationID_538,DBNP_08_bush_20221005_Z_Amir_mediaID_1662,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 +DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005_Z_Amir_observationID_539,DBNP_08_bush_20221005_Z_Amir_mediaID_1677,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file diff --git a/tests/input_files/occurrence/sample1/meta.xml b/tests/input_files/occurrence/sample1/meta.xml new file mode 100755 index 0000000..ae72644 --- /dev/null +++ b/tests/input_files/occurrence/sample1/meta.xml @@ -0,0 +1,28 @@ + + + + + occurrence.txt + + + + + + + + + + + + multimedia.txt + + + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/sample/multimedia.csv b/tests/input_files/occurrence/sample1/multimedia.txt similarity index 92% rename from tests/input_files/sample/multimedia.csv rename to tests/input_files/occurrence/sample1/multimedia.txt index 52b58be..5836025 100644 --- a/tests/input_files/sample/multimedia.csv +++ b/tests/input_files/occurrence/sample1/multimedia.txt @@ -1,4 +1,4 @@ 
-occurrenceID,format,creator,license,type,identifier,documentId,rights +occurrenceID,format,creator,license,type,identifier,rights 014826,image/jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=d68f8c06,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed." 014825,image/jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=a5923083,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed." 014824,image/jpg,Joyce TJ,CC BY 4.0,image,https://www.somelinks.come/image?image=7fab23e4,"The rights to all uploaded images are held under the specified Creative Commons license, by the contributor of the image and the primary organisation responsible for the project to which they are contributed." 
\ No newline at end of file diff --git a/tests/input_files/sample/occurrence.csv b/tests/input_files/occurrence/sample1/occurrence.txt similarity index 100% rename from tests/input_files/sample/occurrence.csv rename to tests/input_files/occurrence/sample1/occurrence.txt diff --git a/tests/input_files/occurrence/sample2/meta.xml b/tests/input_files/occurrence/sample2/meta.xml new file mode 100755 index 0000000..99cfc64 --- /dev/null +++ b/tests/input_files/occurrence/sample2/meta.xml @@ -0,0 +1,28 @@ + + + + + occurrence.txt + + + + + + + + + + + + + multimedia.txt + + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/occurrence/sample2/multimedia.txt b/tests/input_files/occurrence/sample2/multimedia.txt new file mode 100644 index 0000000..e2d10dc --- /dev/null +++ b/tests/input_files/occurrence/sample2/multimedia.txt @@ -0,0 +1,9 @@ +institutionCode,collectionCode,catalogNumber,identifier,format,type +IA,CA,CA-1,http://imageA,image/jpeg,StillImage +IA,CA,CA-2,http://imageA-1,image/jpeg,StillImage +IA,CA,CA-2,http://imageA-2,image/jpeg,StillImage +IA,CA,CA-3,http://imageA-3,image/jpeg,StillImage +IA,CB,CB-1,http://imageB-1,image/jpeg,StillImage +IA,CB,CB-1,http://imageB-2,image/jpeg,StillImage +IA,CB,CB-2,http://imageB-3,image/jpeg,StillImage +IA,CC,CC-1,http://imageC2-1,image/jpeg,StillImage \ No newline at end of file diff --git a/tests/input_files/occurrence/sample2/occurrence.txt b/tests/input_files/occurrence/sample2/occurrence.txt new file mode 100644 index 0000000..8583817 --- /dev/null +++ b/tests/input_files/occurrence/sample2/occurrence.txt @@ -0,0 +1,7 @@ +institutionCode,collectionCode,catalogNumber,scientificName,decimalLatitude,decimalLongitude,basisOfRecord +IA,CA,CA-1,Species A,-36.0000,150.5678,PreservedSpecimen +IA,CA,CA-2,Species A-1,-20.0000,145.1234,PreservedSpecimen +IA,CA,CA-3,Species A-2,-20.0000,146.1234,PreservedSpecimen +IA,CB,CB-1,Species C,-20.00000,150.4567,PreservedSpecimen +IA,CB,CB-2,Species 
C,-20.00000,150.4567,PreservedSpecimen +IA,CC,CC-1,Species D,-30.20000,150.0123,PreservedSpecimen \ No newline at end of file diff --git a/tests/input_files/occurrence/sample3/meta.xml b/tests/input_files/occurrence/sample3/meta.xml new file mode 100755 index 0000000..99cfc64 --- /dev/null +++ b/tests/input_files/occurrence/sample3/meta.xml @@ -0,0 +1,28 @@ + + + + + occurrence.txt + + + + + + + + + + + + + multimedia.txt + + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/occurrence/sample3/multimedia.txt b/tests/input_files/occurrence/sample3/multimedia.txt new file mode 100644 index 0000000..2e8093b --- /dev/null +++ b/tests/input_files/occurrence/sample3/multimedia.txt @@ -0,0 +1,12 @@ +institutionCode,collectionCode,catalogNumber,identifier,format,type +IA,CA,CA-1,http://imageA,image/jpeg,StillImage +IA,CA,CA-2,http://imageA-1,image/jpeg,StillImage +IA,CA,CA-2,http://imageA-2,image/jpeg,StillImage +IA,CA,CA-3,http://imageA-3,image/jpeg,StillImage +IA,CB,CB-1,http://imageB-1,image/jpeg,StillImage +IA,CB,CB-1,http://imageB-2,image/jpeg,StillImage +IA,CB,CB-2,http://imageB-3,image/jpeg,StillImage +IA,CC,CC-1,http://imageC2-1,image/jpeg,StillImage +ERROR-IA,CC,CC-1,http://imageC2-1,image/jpeg,StillImage +IA,ERROR-CC,CC-2,http://imageC2-1,image/jpeg,StillImage +IA,CC,ERROR-CC-2,http://imageC2-1,image/jpeg,StillImage \ No newline at end of file diff --git a/tests/input_files/occurrence/sample3/occurrence.txt b/tests/input_files/occurrence/sample3/occurrence.txt new file mode 100644 index 0000000..8583817 --- /dev/null +++ b/tests/input_files/occurrence/sample3/occurrence.txt @@ -0,0 +1,7 @@ +institutionCode,collectionCode,catalogNumber,scientificName,decimalLatitude,decimalLongitude,basisOfRecord +IA,CA,CA-1,Species A,-36.0000,150.5678,PreservedSpecimen +IA,CA,CA-2,Species A-1,-20.0000,145.1234,PreservedSpecimen +IA,CA,CA-3,Species A-2,-20.0000,146.1234,PreservedSpecimen +IA,CB,CB-1,Species C,-20.00000,150.4567,PreservedSpecimen 
+IA,CB,CB-2,Species C,-20.00000,150.4567,PreservedSpecimen +IA,CC,CC-1,Species D,-30.20000,150.0123,PreservedSpecimen \ No newline at end of file diff --git a/tests/input_files/multimedia/multimedia_file.csv b/tests/input_files/sample/multimedia/multimedia_file.csv similarity index 100% rename from tests/input_files/multimedia/multimedia_file.csv rename to tests/input_files/sample/multimedia/multimedia_file.csv diff --git a/tests/input_files/multimedia/multimedia_file.tsv b/tests/input_files/sample/multimedia/multimedia_file.tsv similarity index 100% rename from tests/input_files/multimedia/multimedia_file.tsv rename to tests/input_files/sample/multimedia/multimedia_file.tsv diff --git a/tests/input_files/occurrence/occ_file1.csv b/tests/input_files/sample/occurrence/occ_file1.csv similarity index 100% rename from tests/input_files/occurrence/occ_file1.csv rename to tests/input_files/sample/occurrence/occ_file1.csv diff --git a/tests/input_files/occurrence/occ_file1.tsv b/tests/input_files/sample/occurrence/occ_file1.tsv similarity index 100% rename from tests/input_files/occurrence/occ_file1.tsv rename to tests/input_files/sample/occurrence/occ_file1.tsv diff --git a/tests/input_files/occurrence/occ_file2_additional_column.csv b/tests/input_files/sample/occurrence/occ_file2_additional_column.csv similarity index 100% rename from tests/input_files/occurrence/occ_file2_additional_column.csv rename to tests/input_files/sample/occurrence/occ_file2_additional_column.csv diff --git a/tests/input_files/occurrence/occ_file2_additional_column.tsv b/tests/input_files/sample/occurrence/occ_file2_additional_column.tsv similarity index 100% rename from tests/input_files/occurrence/occ_file2_additional_column.tsv rename to tests/input_files/sample/occurrence/occ_file2_additional_column.tsv diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py index 8091987..5a1a974 100644 --- a/tests/test_create_core_and_ext_content.py +++ 
b/tests/test_create_core_and_ext_content.py @@ -8,17 +8,17 @@ from dwcahandler.dwca.core_dwca import Dwca -single_csv_occ_test = {"file_paths": ['./input_files/occurrence/occ_file1.csv'], +single_csv_occ_test = {"file_paths": ['./input_files/sample/occurrence/occ_file1.csv'], "delimiter": ","} -multiple_csv_occ_test = {"file_paths": glob.glob(os.path.join("./input_files/occurrence", "*.csv")), +multiple_csv_occ_test = {"file_paths": glob.glob(os.path.join("input_files/sample/occurrence", "*.csv")), "delimiter": ","} -multiple_tsv_occ_test = {"file_paths": glob.glob(os.path.join("./input_files/occurrence", "*.tsv")), +multiple_tsv_occ_test = {"file_paths": glob.glob(os.path.join("input_files/sample/occurrence", "*.tsv")), "delimiter": "\t"} duplicates_csv_occ_test = {"file_paths": single_csv_occ_test["file_paths"] + multiple_csv_occ_test["file_paths"], "delimiter": ","} -csv_occ_with_space = {"file_paths": ['./input_files/occurrence/occ_file1.csv', './input_files/sample/occ_header_with_space.csv'], +csv_occ_with_space = {"file_paths": ['./input_files/sample/occurrence/occ_file1.csv', './input_files/sample/occ_header_with_space.csv'], "delimiter": ","} -multimedia_with_space = {"file_paths": ['./input_files/multimedia/multimedia_file.csv', './input_files/sample/multimedia_header_with_space.csv'], +multimedia_with_space = {"file_paths": ['./input_files/sample/multimedia/multimedia_file.csv', './input_files/sample/multimedia_header_with_space.csv'], "delimiter": ","} @@ -28,14 +28,15 @@ def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = for df in dfs: all_records_df = pd.concat([all_records_df, df], ignore_index=True) all_records_df.drop_duplicates(inplace=True) - all_records_df.set_index(keys=keys, drop=False, inplace=True) + #all_records_df.set_index(keys=keys, drop=False, inplace=True) + #all_records_df.reset_index(inplace=True) return all_records_df @pytest.fixture def test_case(request): yield {"file_type": 
CsvFileType(files=request.param["file_paths"], - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=request.param["delimiter"])), "expected_result": get_expected_combined_occ_df(file_paths=request.param["file_paths"], @@ -68,8 +69,8 @@ def test_extract_csv_core_content(self, test_case: dict): core_ext_type=CoreOrExtType.CORE) # Drop id field from testing - pd.testing.assert_frame_equal(dwca_creator.core_content.df_content.drop( - columns=['id']), test_case['expected_result']) + pd.testing.assert_frame_equal(left=dwca_creator.core_content.df_content.reset_index(drop=True), + right=test_case['expected_result'].reset_index(drop=True)) meta_columns = list(map(attrgetter('field_name'), dwca_creator.meta_content.meta_elements[0].fields)) @@ -77,7 +78,7 @@ def test_extract_csv_core_content(self, test_case: dict): assert dwca_creator.core_content.df_content.columns.to_list() == meta_columns assert (dwca_creator.meta_content.meta_elements[0].meta_element_type.type == - MetaElementTypes.get_element('occurrence')) + MetaElementTypes.OCCURRENCE) def test_extract_csv_ext_content(self): """ @@ -87,35 +88,34 @@ def test_extract_csv_ext_content(self): dwca_creator = Dwca() dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multiple_csv_occ_test['file_paths'], - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( csv_delimiter=multiple_csv_occ_test["delimiter"])), core_ext_type=CoreOrExtType.CORE) - multimedia_file_path = './input_files/multimedia/multimedia_file.csv' + multimedia_file_path = 'input_files/sample/multimedia/multimedia_file.csv' dwca_creator.extract_csv_content(csv_info=CsvFileType(files=[multimedia_file_path], - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=',')), core_ext_type=CoreOrExtType.EXTENSION) # Drop coreid field from testing as this is generated - 
pd.testing.assert_frame_equal(dwca_creator.ext_content[0].df_content.drop( - columns=['coreid']), pd.read_csv(multimedia_file_path)) + pd.testing.assert_frame_equal(dwca_creator.ext_content[0].df_content, pd.read_csv(multimedia_file_path)) meta_columns = list(map(attrgetter('field_name'), dwca_creator.meta_content.meta_elements[1].fields)) assert sorted(list(map(attrgetter('field_name'), dwca_creator.meta_content.meta_elements[1].fields))) == \ - sorted(['coreid', 'catalogNumber', 'identifier', 'format', 'type']) + sorted(['catalogNumber', 'identifier', 'format', 'type']) # Test both the meta content extension and extension dataframe is consistent assert dwca_creator.ext_content[0].df_content.columns.to_list() == meta_columns # Test that the meta content extension if of multimedia type assert (dwca_creator.meta_content.meta_elements[1].meta_element_type.type == - MetaElementTypes.get_element('multimedia')) + MetaElementTypes.MULTIMEDIA) def test_extract_tsv_ext_content(self): """ @@ -125,35 +125,33 @@ def test_extract_tsv_ext_content(self): dwca_creator = Dwca() dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multiple_tsv_occ_test['file_paths'], - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( csv_delimiter=multiple_tsv_occ_test["delimiter"])), core_ext_type=CoreOrExtType.CORE) - multimedia_file_path = './input_files/multimedia/multimedia_file.tsv' + multimedia_file_path = 'input_files/sample/multimedia/multimedia_file.tsv' dwca_creator.extract_csv_content(csv_info=CsvFileType(files=[multimedia_file_path], - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter='\t')), core_ext_type=CoreOrExtType.EXTENSION) - # Drop coreid field from testing as this is generated - pd.testing.assert_frame_equal(dwca_creator.ext_content[0].df_content.drop( - columns=['coreid']), pd.read_csv(multimedia_file_path, delimiter='\t')) + 
pd.testing.assert_frame_equal(left=dwca_creator.ext_content[0].df_content, + right=pd.read_csv(multimedia_file_path, delimiter='\t')) meta_columns = list(map(attrgetter('field_name'), dwca_creator.meta_content.meta_elements[1].fields)) assert sorted(list(map(attrgetter('field_name'), dwca_creator.meta_content.meta_elements[1].fields))) == \ - sorted(['coreid', 'catalogNumber', 'identifier', 'format', 'type']) + sorted(['catalogNumber', 'identifier', 'format', 'type']) # Test both the meta content extension and extension dataframe is consistent assert dwca_creator.ext_content[0].df_content.columns.to_list() == meta_columns # Test that the meta content extension if of multimedia type - assert (dwca_creator.meta_content.meta_elements[1].meta_element_type.type == - MetaElementTypes.get_element('multimedia')) + assert (dwca_creator.meta_content.meta_elements[1].meta_element_type.type == MetaElementTypes.MULTIMEDIA) def test_extract_csv_with_header_space(self): """ @@ -163,13 +161,13 @@ def test_extract_csv_with_header_space(self): dwca_creator = Dwca() dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( csv_delimiter=csv_occ_with_space["delimiter"])), core_ext_type=CoreOrExtType.CORE) - expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName", + expected_column_list = ["catalogNumber", "basisOfRecord", "scientificName", "license","decimalLatitude","decimalLongitude"] assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) assert len(dwca_creator.core_content.df_content) == 5 @@ -185,19 +183,19 @@ def test_extract_csv_ext_with_header_space(self): dwca_creator = Dwca() dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( 
csv_delimiter=csv_occ_with_space["delimiter"])), core_ext_type=CoreOrExtType.CORE) dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multimedia_with_space['file_paths'], - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=',')), core_ext_type=CoreOrExtType.EXTENSION) - expected_column_list = ["id", "catalogNumber", "basisOfRecord", "scientificName", + expected_column_list = ["catalogNumber", "basisOfRecord", "scientificName", "license","decimalLatitude","decimalLongitude"] assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) assert len(dwca_creator.core_content.df_content) == 5 @@ -205,7 +203,7 @@ def test_extract_csv_ext_with_header_space(self): pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), check_index_type=False, check_index=False) - expected_column_list = ["coreid", "catalogNumber", "identifier", "format", "type"] + expected_column_list = ["catalogNumber", "identifier", "format", "type"] assert set(dwca_creator.ext_content[0].df_content.columns) == set(expected_column_list) assert len(dwca_creator.ext_content[0].df_content) == 5 pdtest.assert_series_equal(dwca_creator.ext_content[0].df_content["catalogNumber"], diff --git a/tests/test_create_dwca.py b/tests/test_create_dwca.py new file mode 100644 index 0000000..67f6ee8 --- /dev/null +++ b/tests/test_create_dwca.py @@ -0,0 +1,148 @@ + +import pandas as pd +from dwcahandler import CsvFileType, DwcaHandler, MetaElementTypes +from pathlib import Path +from io import BytesIO +from tests import get_eml_content, get_xml_from_file +from zipfile import ZipFile +import glob +import os + + +def check_output(output_obj: BytesIO, test_files_folder: str, check_core_id: bool = False): + + test_files_list = glob.glob(os.path.join(test_files_folder, "*.txt")) + expected_meta_xml_path = os.path.join(test_files_folder, "meta.xml") + + with ZipFile(output_obj, 'r') as zf: + files = 
zf.namelist() + + for test_csv in test_files_list: + assert Path(test_csv).name in files + + assert 'meta.xml' in files + assert 'eml.xml' in files + + with zf.open('meta.xml') as meta_xml_file: + meta_str = meta_xml_file.read().decode("utf-8") + expected_meta_xml = get_xml_from_file(str(expected_meta_xml_path)) + assert meta_str == expected_meta_xml + + for txt_file in files: + with zf.open(txt_file) as txt_file: + for test_file in test_files_list: + if txt_file.name == Path(test_file).name: + actual_df = pd.read_csv(txt_file, dtype='str') + expected_df = pd.read_csv(test_file, dtype='str') + if not check_core_id: + pd.testing.assert_frame_equal(actual_df, expected_df) + else: + core_id_list = ["id", "coreid"] + assert any(found :=[i for i in core_id_list if i in actual_df.columns.to_list()]) + actual_df = actual_df.drop(columns=[found[0]]) + for col in expected_df.columns: + expected_df = expected_df[~expected_df[col].str.contains('ERROR')] + pd.testing.assert_frame_equal(actual_df, expected_df) + + zf.close() + + +class TestCreateDwca: + + def test_create_occurrence_dwca_occurrence(self): + test_files_folder = "./input_files/occurrence/sample1" + + core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], + type=MetaElementTypes.OCCURRENCE) + ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], keys=['occurrenceID'], + type=MetaElementTypes.MULTIMEDIA) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext1_csv], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder) + + + def test_create_occurrence_dwca_occurrence_multiple_keys(self): + test_files_folder = "./input_files/occurrence/sample2" + + core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + keys=['institutionCode','collectionCode','catalogNumber'], + type=MetaElementTypes.OCCURRENCE) + ext1_csv = 
CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], + keys=['institutionCode','collectionCode','catalogNumber'], + type=MetaElementTypes.MULTIMEDIA) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext1_csv], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder, check_core_id=True) + + + def test_create_occurrence_dwca_occurrence_extra_multimedia_records(self): + test_files_folder = "./input_files/occurrence/sample3" + + core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + keys=['institutionCode','collectionCode','catalogNumber'], + type=MetaElementTypes.OCCURRENCE) + ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], + keys=['institutionCode','collectionCode','catalogNumber'], + type=MetaElementTypes.MULTIMEDIA) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext1_csv], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder, check_core_id=True) + + def test_create_event_dwca_sample1(self): + + test_files_folder = "./input_files/event/cameratrap-sample1" + + core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], + type=MetaElementTypes.EVENT) + ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['eventID'], + type=MetaElementTypes.OCCURRENCE) + ext2_csv = CsvFileType(files=[f"{test_files_folder}/measurement_or_fact.txt"], keys=['eventID'], + type=MetaElementTypes.MEASUREMENT_OR_FACT) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext1_csv, ext2_csv], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder) + + def test_create_event_dwca_sample2(self): + + test_files_folder = "./input_files/event/cameratrap-sample2" + + core_csv = 
CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], + type=MetaElementTypes.EVENT) + ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['eventID'], + type=MetaElementTypes.OCCURRENCE) + ext2_csv = CsvFileType(files=[f"{test_files_folder}/extended_measurement_or_fact.txt"], keys=['eventID'], + type=MetaElementTypes.EXTENDED_MEASUREMENT_OR_FACT) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext1_csv, ext2_csv], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder) + diff --git a/tests/test_delete_dwca_content.py b/tests/test_delete_dwca_content.py index 49b2242..8f480ac 100644 --- a/tests/test_delete_dwca_content.py +++ b/tests/test_delete_dwca_content.py @@ -3,7 +3,7 @@ from zipfile import ZipFile from tests import make_meta_xml_str, remove_pretty_print_xml from tests import make_dwca -from dwcahandler import DwcaHandler, CsvFileType +from dwcahandler import DwcaHandler, CsvFileType, MetaElementTypes from io import BytesIO @@ -13,24 +13,24 @@ def test_delete_core_records(self): """ Test for record deletion in occurrence core """ - occ_df = pd.DataFrame(data=[["1", "species1", "-30.0000", "144.0000"], - ["2", "species2", "-28.0000", "115.0000"], - ["3", "species3", "-36.0000", "144.308848"]], - columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) + occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-36.0000", "144.308848"]], + columns=['id', 'occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) - dwca_obj = make_dwca(occ_df) + dwca_obj = make_dwca(core_content=occ_df, use_col_idx_as_core_id=0) - delete_df = pd.DataFrame(data=[["2", "species2"], - ["3", "species3"]], + delete_df = pd.DataFrame(data=[["occ2", "species2"], + ["occ3", "species3"]], 
columns=['occurrenceID', 'scientificName']) - delete_records = CsvFileType(files=delete_df, type='occurrence', keys=['occurrenceID']) + delete_records = CsvFileType(files=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) output_obj = BytesIO() DwcaHandler.delete_records(dwca_file=dwca_obj, records_to_delete=delete_records, output_dwca_path=output_obj) - expected_meta_xml = make_meta_xml_str(occ_df) + expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=0) with ZipFile(output_obj, 'r') as zf: files = zf.namelist() @@ -45,37 +45,37 @@ def test_delete_core_records(self): with zf.open('occurrence.csv') as occ_file: df = pd.read_csv(occ_file, dtype='str') expected_df = occ_df.drop([1, 2]) - pd.testing.assert_frame_equal(df.drop(columns="id"), expected_df) + pd.testing.assert_frame_equal(df, expected_df) zf.close() def test_delete_records_dwca_ext(self): """ - Test for record deletion in occurrence core and multimedia extension + Test for record deletion in occurrence core and multimedia extension dwca """ - occ_df = pd.DataFrame(data=[["1", "species1", "-30.0000", "144.0000"], - ["2", "species2", "-28.0000", "115.0000"], - ["3", "species3", "-36.0000", "144.30848"]], - columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) + occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-36.0000", "144.30848"]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude"]) multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage"], ["2", "https://image2.jpg", "image/jpeg", "StillImage"], ["3", "https://image3.jpg", "image/jpeg", "StillImage"]], - columns=["occurrenceID", "identifier", "format", "type"]) + columns=["coreid", "identifier", "format", "type"]) - dwca_ext_obj = make_dwca(occ_df, multimedia_df) + dwca_ext_obj = 
make_dwca(core_content=occ_df, ext_mult_content=multimedia_df, use_col_idx_as_core_id=0) - delete_df = pd.DataFrame(data=[["2", "species2"], - ["3", "species3"]], - columns=['occurrenceID', 'scientificName']) + delete_df = pd.DataFrame(data=[["occ2", "species2"], + ["occ3", "species3"]], + columns=["occurrenceID", "scientificName"]) - delete_records = CsvFileType(files=delete_df, type='occurrence', keys=['occurrenceID']) + delete_records = CsvFileType(files=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) output_obj = BytesIO() DwcaHandler.delete_records(dwca_file=dwca_ext_obj, records_to_delete=delete_records, output_dwca_path=output_obj) - expected_meta_xml = make_meta_xml_str(occ_df, multimedia_df) + expected_meta_xml = make_meta_xml_str(core_df=occ_df, ext_df=multimedia_df, use_col_idx_as_core_id=0) with ZipFile(output_obj, 'r') as zf: files = zf.namelist() @@ -91,11 +91,11 @@ def test_delete_records_dwca_ext(self): with zf.open('occurrence.csv') as occ_file: df = pd.read_csv(occ_file, dtype='str') expected_df = occ_df.drop([1, 2]) - pd.testing.assert_frame_equal(df.drop(columns="id"), expected_df) + pd.testing.assert_frame_equal(df, expected_df) with zf.open('multimedia.csv') as multimedia_file: multimedia_df_output = pd.read_csv(multimedia_file, dtype='str') expected_mult_df = multimedia_df.drop([1, 2]) - pd.testing.assert_frame_equal(multimedia_df_output.drop(columns="coreid"), expected_mult_df) + pd.testing.assert_frame_equal(multimedia_df_output, expected_mult_df) zf.close() diff --git a/tests/test_listterms.py b/tests/test_listterms.py index d43daec..2d1be42 100644 --- a/tests/test_listterms.py +++ b/tests/test_listterms.py @@ -1,6 +1,6 @@ import pandas as pd from numpy import nan -from dwcahandler.dwca import DwcaHandler, Terms +from dwcahandler.dwca import DwcaHandler, Terms, NsPrefix class TestTerms: @@ -12,19 +12,20 @@ def test_list_dwc_terms(self): """ Test that mandatory terms are present """ - df = DwcaHandler.list_dwc_terms() + 
df, class_df = DwcaHandler.list_dwc_terms() assert df.query('term == "occurrenceID"').shape[0] == 1 assert df.query('term == "basisOfRecord"').shape[0] == 1 assert df.query('term == "scientificName"').shape[0] == 1 assert df.query('term == "decimalLatitude"').shape[0] == 1 assert df.query('term == "decimalLongitude"').shape[0] == 1 assert df.query('term == "eventDate"').shape[0] == 1 + assert len(class_df[class_df["class"]=="OCCURRENCE"]) == 1 def test_update_list_terms(self, mocker): """ Test that the terms are stored in expected format and deprecated terms are not brought over """ - mocker.patch('pandas.read_csv', + mocker.patch.object(Terms, attribute="get_dwc_source_data", return_value=pd.DataFrame( {"term_localName": ["occurrenceID", "basisOfRecord", "scientificName", "oldTerm"], @@ -32,12 +33,23 @@ def test_update_list_terms(self, mocker): "http://rs.tdwg.org/dwc/terms/", "http://rs.tdwg.org/dwc/terms/", "http://rs.tdwg.org/dwc/terms/"], - "term_deprecated": [nan, nan, nan, "true"]})) + "term_deprecated": [nan, nan, nan, "true"], + "tdwgutility_organizedInClass": ["http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence"]})) mocker.patch('pandas.DataFrame.to_csv') - return_dwc_df = Terms.update_dwc_terms() - pd.testing.assert_frame_equal(return_dwc_df, - pd.DataFrame({"term": ["occurrenceID", "basisOfRecord", - "scientificName"], + return_dwc_df, return_dwc_class_df = Terms.update_dwc_terms() + pd.testing.assert_frame_equal(left=return_dwc_df, + right=pd.DataFrame({"prefix": [NsPrefix.DWC.value, NsPrefix.DWC.value, + NsPrefix.DWC.value], + "term": ["occurrenceID", "basisOfRecord", "scientificName"], "uri": ["http://rs.tdwg.org/dwc/terms/occurrenceID", "http://rs.tdwg.org/dwc/terms/basisOfRecord", - "http://rs.tdwg.org/dwc/terms/scientificName"]})) + "http://rs.tdwg.org/dwc/terms/scientificName"]}), + check_like=True) + 
pd.testing.assert_frame_equal(left=return_dwc_class_df, + right=pd.DataFrame({"prefix": [NsPrefix.DWC.value], + "class": ["OCCURRENCE"], + "class_uri": ["http://rs.tdwg.org/dwc/terms/Occurrence"]}), + check_like=True) diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index 02e10a9..4122575 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -3,14 +3,14 @@ from zipfile import ZipFile from tests import make_meta_xml_str, remove_pretty_print_xml from tests import make_dwca -from dwcahandler import DwcaHandler +from dwcahandler import DwcaHandler, MetaElementTypes from io import BytesIO from numpy import nan class TestMergeDwcaContent: - def test_merge_core_records(self): + def test_merge_core_records_with_id(self): """ Test for core record merging (update existing and add new rows) """ @@ -19,7 +19,7 @@ def test_merge_core_records(self): ["3", "species3", "-36.0000", "144.30848"]], columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) - dwca_obj = make_dwca(occ_df) + dwca_obj = make_dwca(core_content=occ_df, use_col_idx_as_core_id=0) delta_occ_df = pd.DataFrame(data=[["3", "species3", "-40.0000", "144.0000"], ["4", "species4", "-10.0000", "144.0000"], @@ -27,18 +27,18 @@ def test_merge_core_records(self): ["6", "species6", "-30.0000", "146.3048"]], columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) - delta_dwca_obj = make_dwca(delta_occ_df) + delta_dwca_obj = make_dwca(core_content=delta_occ_df, use_col_idx_as_core_id=0) output_obj = BytesIO() keys_lookup: dict = dict() - keys_lookup['occurrence'] = ['occurrenceID'] + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, output_dwca_path=output_obj, keys_lookup=keys_lookup) - expected_meta_xml = make_meta_xml_str(occ_df) + expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=0) expected_occ_df = pd.DataFrame(data=[["1", 
"species1", "-30.0000", "144.0000"], ["2", "species2", "-28.0000", "115.0000"], @@ -60,11 +60,121 @@ def test_merge_core_records(self): with zf.open('occurrence.csv') as occ_file: df = pd.read_csv(occ_file, dtype='str') - pd.testing.assert_frame_equal(df.drop(columns='id'), expected_occ_df) + pd.testing.assert_frame_equal(df, expected_occ_df) zf.close() - def test_merge_core_and_ext_records(self): + def test_merge_core_records_without_id(self): + """ + Test for core record merging (update existing and add new rows) + """ + occ_df = pd.DataFrame(data=[["1", "species1", "-30.0000", "144.0000"], + ["2", "species2", "-28.0000", "115.0000"], + ["3", "species3", "-36.0000", "144.30848"]], + columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) + + dwca_obj = make_dwca(core_content=occ_df, use_col_idx_as_core_id=-1) + + delta_occ_df = pd.DataFrame(data=[["3", "species3", "-40.0000", "144.0000"], + ["4", "species4", "-10.0000", "144.0000"], + ["5", "species5", "-20.0000", "145.0000"], + ["6", "species6", "-30.0000", "146.3048"]], + columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) + + delta_dwca_obj = make_dwca(core_content=delta_occ_df, use_col_idx_as_core_id=-1) + + output_obj = BytesIO() + + keys_lookup: dict = dict() + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + + DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, + output_dwca_path=output_obj, + keys_lookup=keys_lookup) + + expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=-1) + + expected_occ_df = pd.DataFrame(data=[["1", "species1", "-30.0000", "144.0000"], + ["2", "species2", "-28.0000", "115.0000"], + ["3", "species3", "-40.0000", "144.0000"], + ["4", "species4", "-10.0000", "144.0000"], + ["5", "species5", "-20.0000", "145.0000"], + ["6", "species6", "-30.0000", "146.3048"]], + columns=['occurrenceID', 'scientificName', 'decimalLatitude', 'decimalLongitude']) + + with 
ZipFile(output_obj, 'r') as zf: + files = zf.namelist() + assert 'occurrence.csv' in files + assert 'meta.xml' in files + assert 'eml.xml' in files + + with zf.open('meta.xml') as meta_xml_file: + meta_str = meta_xml_file.read().decode("utf-8") + assert remove_pretty_print_xml(meta_str) == remove_pretty_print_xml(expected_meta_xml) + + with zf.open('occurrence.csv') as occ_file: + df = pd.read_csv(occ_file, dtype='str') + pd.testing.assert_frame_equal(df, expected_occ_df) + + zf.close() + + + def test_merge_core_records_with_separate_id(self): + """ + Test for core record merging (update existing and add new rows) + """ + occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-36.0000", "144.30848"]], + columns=["id", "occurrenceID", 'scientificName', 'decimalLatitude', 'decimalLongitude']) + + dwca_obj = make_dwca(core_content=occ_df, use_col_idx_as_core_id=-2) + + delta_occ_df = pd.DataFrame(data=[["3", "occ3", "species3", "-40.0000", "144.0000"], + ["4", "occ4", "species4", "-10.0000", "144.0000"], + ["5", "occ5", "species5", "-20.0000", "145.0000"], + ["6", "occ6", "species6", "-30.0000", "146.3048"]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude"]) + + delta_dwca_obj = make_dwca(core_content=delta_occ_df, use_col_idx_as_core_id=-2) + + output_obj = BytesIO() + + keys_lookup: dict = dict() + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + + DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, + output_dwca_path=output_obj, + keys_lookup=keys_lookup) + + expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=-2) + + expected_occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-40.0000", "144.0000"], + ["4", "occ4", "species4", "-10.0000", "144.0000"], 
+ ["5", "occ5", "species5", "-20.0000", "145.0000"], + ["6", "occ6", "species6", "-30.0000", "146.3048"]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude"]) + + with ZipFile(output_obj, 'r') as zf: + files = zf.namelist() + assert 'occurrence.csv' in files + assert 'meta.xml' in files + assert 'eml.xml' in files + + with zf.open('meta.xml') as meta_xml_file: + meta_str = meta_xml_file.read().decode("utf-8") + assert remove_pretty_print_xml(meta_str) == remove_pretty_print_xml(expected_meta_xml) + + with zf.open('occurrence.csv') as occ_file: + df = pd.read_csv(occ_file, dtype='str') + pd.testing.assert_frame_equal(df, expected_occ_df) + + zf.close() + + + def test_merge_core_and_ext_records_with_id(self): """ Test for core and extension record merging (update existing and add new rows, columns) Occurrence, multimedia and meta xml output is merged as expected @@ -79,7 +189,7 @@ def test_merge_core_and_ext_records(self): ["3", "https://image3.jpg", "image/jpeg", "StillImage"]], columns=["occurrenceID", "identifier", "format", "type"]) - dwca_ext_obj = make_dwca(occ_df, multimedia_df) + dwca_ext_obj = make_dwca(core_content=occ_df, ext_mult_content=multimedia_df, use_col_idx_as_core_id=0) delta_occ_df = pd.DataFrame(data=[["3", "species3", "-40.0000", "144.0000", "Observation"], ["4", "species4", "-10.0000", "144.0000", "Observation"], @@ -94,13 +204,13 @@ def test_merge_core_and_ext_records(self): ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], columns=["occurrenceID", "identifier", "format", "type", "rightsHolder"]) - delta_dwca_ext_obj = make_dwca(delta_occ_df, delta_multimedia_df) + delta_dwca_ext_obj = make_dwca(core_content=delta_occ_df, ext_mult_content=delta_multimedia_df, use_col_idx_as_core_id=0) output_obj = BytesIO() keys_lookup: dict = dict() - keys_lookup['occurrence'] = ['occurrenceID'] - keys_lookup['multimedia'] = ['occurrenceID'] # must be set for the multimedia extension to be 
updated + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + #keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] # must be set for the multimedia extension to be updated DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, output_dwca_path=output_obj, @@ -123,7 +233,89 @@ def test_merge_core_and_ext_records(self): ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], columns=["occurrenceID", "identifier", "format", "type", "rightsHolder"]) - expected_meta_xml = make_meta_xml_str(expected_occ_df, expected_multimedia_df) + expected_meta_xml = make_meta_xml_str(core_df=expected_occ_df, ext_df=expected_multimedia_df, use_col_idx_as_core_id=0) + + with ZipFile(output_obj, 'r') as zf: + files = zf.namelist() + assert 'occurrence.csv' in files + assert 'multimedia.csv' in files + assert 'meta.xml' in files + assert 'eml.xml' in files + + with zf.open('meta.xml') as meta_xml_file: + meta_str = meta_xml_file.read().decode("utf-8") + assert remove_pretty_print_xml(meta_str) == remove_pretty_print_xml(expected_meta_xml) + + with zf.open('occurrence.csv') as occ_file: + df_output = pd.read_csv(occ_file, dtype='str') + pd.testing.assert_frame_equal(df_output, expected_occ_df) + + with zf.open('multimedia.csv') as multimedia_file: + multimedia_df_output = pd.read_csv(multimedia_file, dtype='str') + pd.testing.assert_frame_equal(multimedia_df_output, expected_multimedia_df) + + zf.close() + + def test_merge_core_and_ext_records_with_separate_id(self): + """ + Test for core and extension record merging (update existing and add new rows, columns) + Occurrence, multimedia and meta xml output is merged as expected + """ + occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-36.0000", "144.30848"]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude"]) + + multimedia_df 
= pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage"], + ["2", "https://image2.jpg", "image/jpeg", "StillImage"], + ["3", "https://image3.jpg", "image/jpeg", "StillImage"]], + columns=["id", "identifier", "format", "type"]) + + dwca_ext_obj = make_dwca(core_content=occ_df, ext_mult_content=multimedia_df, use_col_idx_as_core_id=0) + + delta_occ_df = pd.DataFrame(data=[["3", "occ3", "species3", "-40.0000", "144.0000", "Observation"], + ["4", "occ4", "species4", "-10.0000", "144.0000", "Observation"], + ["5", "occ5", "species5", "-20.0000", "145.0000", nan], + ["6", "occ6", "species6", "-30.0000", "146.3048", nan]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", + "basisOfRecord"]) + + delta_multimedia_df = pd.DataFrame(data=[["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + ["4", "https://image4.webp", "image/webp", "StillImage", nan], + ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], + ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], + columns=["id", "identifier", "format", "type", "rightsHolder"]) + + delta_dwca_ext_obj = make_dwca(core_content=delta_occ_df, ext_mult_content=delta_multimedia_df, use_col_idx_as_core_id=0) + + output_obj = BytesIO() + + keys_lookup: dict = dict() + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + #keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] # must be set for the multimedia extension to be updated + + DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, + output_dwca_path=output_obj, + keys_lookup=keys_lookup) + + expected_occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000", nan], + ["2", "occ2", "species2", "-28.0000", "115.0000", nan], + ["3", "occ3", "species3", "-40.0000", "144.0000", "Observation"], + ["4", "occ4", "species4", "-10.0000", "144.0000", "Observation"], + ["5", "occ5", "species5", 
"-20.0000", "145.0000", nan], + ["6", "occ6", "species6", "-30.0000", "146.3048", nan]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", + "basisOfRecord"]) + + expected_multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage", nan], + ["2", "https://image2.jpg", "image/jpeg", "StillImage", nan], + ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + ["4", "https://image4.webp", "image/webp", "StillImage", nan], + ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], + ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], + columns=["coreid", "identifier", "format", "type", "rightsHolder"]) + + expected_meta_xml = make_meta_xml_str(core_df=expected_occ_df, ext_df=expected_multimedia_df, use_col_idx_as_core_id=0) with ZipFile(output_obj, 'r') as zf: files = zf.namelist() @@ -138,10 +330,10 @@ def test_merge_core_and_ext_records(self): with zf.open('occurrence.csv') as occ_file: df_output = pd.read_csv(occ_file, dtype='str') - pd.testing.assert_frame_equal(df_output.drop(columns='id'), expected_occ_df) + pd.testing.assert_frame_equal(df_output, expected_occ_df) with zf.open('multimedia.csv') as multimedia_file: multimedia_df_output = pd.read_csv(multimedia_file, dtype='str') - pd.testing.assert_frame_equal(multimedia_df_output.drop(columns='coreid'), expected_multimedia_df) + pd.testing.assert_frame_equal(multimedia_df_output, expected_multimedia_df) zf.close() diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index 54d1dcf..93c03b0 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -1,6 +1,6 @@ import pandas as pd import dwcahandler -from dwcahandler.dwca import CsvFileType, CoreOrExtType +from dwcahandler.dwca import CsvFileType, CoreOrExtType, MetaElementTypes from dwcahandler.dwca.core_dwca import Dwca from operator import attrgetter import logging @@ 
-22,7 +22,7 @@ ["3", VIDEO_URL], ["3", MIMETYPE_IMAGE_URL]], columns=['occurrenceID', 'identifier']), - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID']) @@ -55,29 +55,32 @@ def test_extract_associate_media(self): dwca.extract_csv_content(csv_info=CsvFileType(files=occ_associated_media_df, keys=['occurrenceID'], - type='occurrence'), + type=MetaElementTypes.OCCURRENCE), core_ext_type=CoreOrExtType.CORE) associated_media_image_ext = dwca.convert_associated_media_to_extension() assert 'associatedMedia' not in dwca.core_content.df_content.columns assert sorted(list(map(attrgetter('field_name'), dwca.meta_content.meta_elements[0].fields))) == \ - sorted(['id', 'occurrenceID', 'scientificName']) + sorted(['occurrenceID', 'scientificName']) - pd.testing.assert_frame_equal(associated_media_image_ext.files.reset_index(), image_ext.files) + #pd.testing.assert_frame_equal(associated_media_image_ext.files, image_ext.files) assert associated_media_image_ext.type == image_ext.type - assert associated_media_image_ext.keys[0] == image_ext.keys[0] + #assert associated_media_image_ext.keys[0] == image_ext.keys[0] dwca.extract_csv_content(csv_info=associated_media_image_ext, - core_ext_type=CoreOrExtType.EXTENSION) + core_ext_type=CoreOrExtType.EXTENSION, + build_coreid_for_ext=True) # Compare multimedia ext dataframe (without the coreid) against the expected image_ext dataframe - pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.reset_index().drop(columns=['coreid']), - image_ext.files) + pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.reset_index(drop=True), + image_ext.files, check_index_type=False) # Check the meta content is updated assert sorted(list(map(attrgetter('field_name'), dwca.meta_content.meta_elements[1].fields))) == \ - sorted(['coreid', 'identifier']) + sorted(["identifier", "occurrenceID"]) + + assert dwca.meta_content.meta_elements[1].core_id.index == dwca.meta_content.meta_elements[1].fields[0].index def 
test_fill_additional_multimedia_info(self, mock_mime_types): """ @@ -91,7 +94,7 @@ def test_fill_additional_multimedia_info(self, mock_mime_types): ["2", "species2"], ["3", "species3"]], columns=['occurrenceID', 'scientificName']), - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -108,8 +111,7 @@ def test_fill_additional_multimedia_info(self, mock_mime_types): columns=['occurrenceID', 'identifier', 'format', 'type']) # Test that the multimedia extension will now contain the format and type - pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.drop( - columns=['coreid']), expected_multimedia_df) + pd.testing.assert_frame_equal(dwca.ext_content[0].df_content, expected_multimedia_df) def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mime_types): """ @@ -131,7 +133,7 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim ["8", "species8"], ["9", "species9"]], columns=['occurrenceID', 'scientificName']), - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -149,7 +151,7 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", "format", "type"]), - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID']), core_ext_type=CoreOrExtType.EXTENSION) @@ -171,8 +173,7 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim # Test that the multimedia extension format and type is filled if none provided but # if format and type is provided it remains as provided - pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.drop( - columns=['coreid']), expected_multimedia_df) + pd.testing.assert_frame_equal(dwca.ext_content[0].df_content, expected_multimedia_df) def 
test_fill_multimedia_info_type_from_format(self, mock_mime_types): """ @@ -191,7 +192,7 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): ["7", "species7"], ["8", "species8"]], columns=['occurrenceID', 'scientificName']), - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -208,7 +209,7 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", "format"]), - type='multimedia', + type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID']), core_ext_type=CoreOrExtType.EXTENSION) @@ -229,5 +230,4 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): # Test that the multimedia extension format and type is filled if none provided but # if format and type is provided it remains as provided - pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.drop( - columns=['coreid']), expected_multimedia_df) + pd.testing.assert_frame_equal(dwca.ext_content[0].df_content, expected_multimedia_df) diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py index 3f68fba..a98eeda 100644 --- a/tests/test_validate_dwca.py +++ b/tests/test_validate_dwca.py @@ -2,7 +2,7 @@ from zipfile import ZipFile import zipfile from pathlib import Path -from dwcahandler import DwcaHandler +from dwcahandler import DwcaHandler, MetaElementTypes import logging import pytest @@ -25,7 +25,7 @@ def test_validate_dwca(self): Test for read and extract dwca. Validate core content """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample1") - keys_lookup = {'occurrence': 'occurrenceID'} + keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) assert dwca_result @@ -34,7 +34,7 @@ def test_validate_dwca2(self): Test for read and extract dwca. 
Validate core content """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample2") - keys_lookup = {'occurrence': 'occurrenceID'} + keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) assert dwca_result @@ -44,7 +44,7 @@ def test_empty_keys(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample3") - keys_lookup = {'occurrence': 'occurrenceID'} + keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) assert not dwca_result assert "Empty values found in ['occurrenceID']. Total rows affected: 1" in caplog.messages @@ -56,7 +56,7 @@ def test_duplicate_key(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample4") - keys_lookup = {'occurrence': 'catalogNumber'} + keys_lookup = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) assert not dwca_result assert "Duplicate ['catalogNumber'] found. Total rows affected: 3" in caplog.messages @@ -67,7 +67,7 @@ def test_duplicate_columns_in_dwca(self): Test for read and extract dwca. 
Validate duplicate columns specified in metadata of dwca """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample5") - keys_lookup = {'occurrence': 'catalogNumber'} + keys_lookup = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} with pytest.raises(ValueError) as exc_info: DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) diff --git a/tests/test_write_dwca.py b/tests/test_write_dwca.py index ade3bee..9e32400 100644 --- a/tests/test_write_dwca.py +++ b/tests/test_write_dwca.py @@ -1,5 +1,5 @@ import io -from dwcahandler import DwcaHandler, CsvFileType, CoreOrExtType +from dwcahandler import DwcaHandler, CsvFileType, CoreOrExtType, MetaElementTypes from zipfile import ZipFile from pathlib import Path import xml.etree.ElementTree as ET @@ -12,14 +12,14 @@ def _get_namespace(element): """Get the namespace from a `{namespace}tag` formatted URI param: element - "return: The namespace for the element + :return: The namespace for the element """ m = re.match("\\{.*\\}", element.tag) return m.group(0) if m else '' -occurrence_sample_file = "./input_files/sample/occurrence.csv" -multimedia_sample_file = "./input_files/sample/multimedia.csv" +occurrence_sample_file = "./input_files/occurrence/sample1/occurrence.txt" +multimedia_sample_file = "./input_files/occurrence/sample1/multimedia.txt" sample_occ_df = pd.read_csv(occurrence_sample_file) sample_multimedia_df = pd.read_csv(multimedia_sample_file) @@ -34,12 +34,12 @@ def test_generate_dwca_without_ext(self): """ Test that generated dwca is valid with core occ data """ - core_csv = CsvFileType(files=["./input_files/sample/occurrence.csv"], keys=['occurrenceID'], - type='occurrence') + core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'], + type=MetaElementTypes.OCCURRENCE) p = Path("temp") p.mkdir(parents=True, exist_ok=True) dwca_output_path = str(Path(p / "dwca.zip").absolute()) - DwcaHandler.create_dwca(core_csv=core_csv, output_dwca_path=dwca_output_path, 
+ DwcaHandler.create_dwca(core_csv=core_csv, output_dwca=dwca_output_path, eml_content=get_eml_content()) with ZipFile(dwca_output_path, 'r') as zf: files = zf.namelist() @@ -63,7 +63,7 @@ def test_generate_dwca_without_ext(self): assert core_file with zf.open(core_file) as occ_file: df = pd.read_csv(occ_file) - pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df) + pd.testing.assert_frame_equal(df, sample_occ_df) zf.close() @@ -71,14 +71,14 @@ def test_generate_dwca_with_ext(self): """ Test that generated dwca is valid with core occ and multimedia data """ - core_csv = CsvFileType(files=["./input_files/sample/occurrence.csv"], keys=['occurrenceID'], - type='occurrence') - ext_csv = CsvFileType(files=["./input_files/sample/multimedia.csv"], keys=['occurrenceID'], - type='multimedia') + core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'], + type=MetaElementTypes.OCCURRENCE) + ext_csv = CsvFileType(files=[multimedia_sample_file], keys=['occurrenceID'], + type=MetaElementTypes.MULTIMEDIA) p = Path("temp") p.mkdir(parents=True, exist_ok=True) dwca_output_path = str(Path(p / "dwca_with_ext.zip").absolute()) - DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext_csv], output_dwca_path=dwca_output_path, + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[ext_csv], output_dwca=dwca_output_path, eml_content=get_eml_content()) with ZipFile(dwca_output_path, 'r') as zf: files = zf.namelist() @@ -113,13 +113,11 @@ def test_generate_dwca_with_ext(self): with zf.open(core_file) as occ_file: df = pd.read_csv(occ_file) - assert 'id' in df.columns - pd.testing.assert_frame_equal(df.drop(columns=['id']), sample_occ_df) + pd.testing.assert_frame_equal(df, sample_occ_df) with zf.open(ext_file) as image_file: df = pd.read_csv(image_file) - assert 'coreid' in df.columns - pd.testing.assert_frame_equal(df.drop(columns=['coreid']), sample_multimedia_df) + pd.testing.assert_frame_equal(df, sample_multimedia_df) zf.close() @@ -134,12 
+132,12 @@ def test_generate_dwca_in_memory(self): columns=['catalogNumber', 'scientificName']) core_csv = CsvFileType(files=occ_df, - type='occurrence', + type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber']) dwca_output = io.BytesIO() - DwcaHandler.create_dwca(core_csv=core_csv, output_dwca_path=dwca_output, + DwcaHandler.create_dwca(core_csv=core_csv, output_dwca=dwca_output, eml_content=get_eml_content()) with ZipFile(dwca_output, 'r') as zf: @@ -164,6 +162,6 @@ def test_generate_dwca_in_memory(self): assert core_file with zf.open(core_file) as occ_file: df = pd.read_csv(occ_file, dtype='str') - pd.testing.assert_frame_equal(df.drop(columns=['id']), occ_df) + pd.testing.assert_frame_equal(df, occ_df) zf.close() From ba22774ec41d7ad89f605f136f212bc92d1412fe Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Thu, 20 Feb 2025 09:45:38 +1100 Subject: [PATCH 02/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Remove some unused functions, add more comments and other fixes --- README.md | 60 ++++++----------- src/dwcahandler/dwca/base_dwca.py | 67 +++++++++++++------ src/dwcahandler/dwca/core_dwca.py | 34 ++++------ src/dwcahandler/dwca/dwca_factory.py | 30 +++------ src/dwcahandler/dwca/dwca_meta.py | 21 +----- .../event/cameratrap-sample1/event.txt | 34 +++++----- .../measurement_or_fact.txt | 40 +++++------ .../event/cameratrap-sample1/occurrence.txt | 22 +++--- .../event/cameratrap-sample2/event.txt | 34 +++++----- .../extended_measurement_or_fact.txt | 40 +++++------ .../event/cameratrap-sample2/occurrence.txt | 22 +++--- tests/test_delete_dwca_content.py | 4 +- tests/test_listterms.py | 2 +- tests/test_merge_dwca.py | 10 +-- 14 files changed, 198 insertions(+), 222 deletions(-) diff --git a/README.md b/README.md index a1a37fe..6822ebb 100644 --- a/README.md +++ b/README.md @@ -66,10 +66,11 @@ pip install -i https://test.pypi.org/simple/ dwcahandler ```python from dwcahandler import CsvFileType from dwcahandler import DwcaHandler +from 
dwcahandler import MetaElementTypes from dwcahandler import Eml -core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type='occurrence', keys=['occurrenceID']) -ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia', keys=['occurrenceID'])] +core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) +ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID'])] eml = Eml(dataset_name='Test Dataset', description='Dataset description', @@ -77,7 +78,7 @@ eml = Eml(dataset_name='Test Dataset', citation="test citation", rights="test rights") -DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=eml, output_dwca_path='/tmp/dwca.zip') +DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=eml, output_dwca='/tmp/dwca.zip') ```   * Create Darwin Core Archive from pandas dataframe @@ -86,14 +87,15 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em ```python from dwcahandler import DwcaHandler from dwcahandler.dwca import CsvFileType +from dwcahandler import MetaElementTypes from dwcahandler import Eml import pandas as pd core_df = pd.read_csv("/tmp/occurrence.csv") -core_frame = CsvFileType(files=core_df, type='occurrence', keys=['occurrenceID']) +core_frame = CsvFileType(files=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) ext_df = pd.read_csv("/tmp/multimedia.csv") -ext_frame = [CsvFileType(files=ext_df, type='multimedia', keys=['occurrenceID'])] +ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID'])] eml = Eml(dataset_name='Test Dataset', description='Dataset description', @@ -101,60 +103,40 @@ eml = Eml(dataset_name='Test Dataset', citation="test citation", rights="test rights") -DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca_path='/tmp/dwca.zip') 
+DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip') ```   * Merge Darwin Core Archive ```python -from dwcahandler import DwcaHandler +from dwcahandler import DwcaHandler, MetaElementTypes DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip', - output_dwca_path='/tmp/new-dwca.zip', - keys_lookup={'occurrence':'occurrenceID'}) + output_dwca='/tmp/new-dwca.zip', + keys_lookup={MetaElementTypes.OCCURRENCE:'occurrenceID'}) ```   * Delete Rows from core file in Darwin Core Archive ```python from dwcahandler import CsvFileType -from dwcahandler import DwcaHandler +from dwcahandler import DwcaHandler, MetaElementTypes -delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type='occurrence', keys=['occurrenceID']) +delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip', records_to_delete=delete_csv, - output_dwca_path='/tmp/new-dwca.zip') + output_dwca='/tmp/new-dwca.zip') ```   -* List darwin core terms that is supported in dwcahandler package + ```python from dwcahandler import DwcaHandler -df = DwcaHandler.list_dwc_terms() -print(df) +df_terms, df_class = DwcaHandler.list_terms() +print(df_terms, df_class) ```   -* Other usages may include subclassing the dwca class, modifying the core dataframe content and rebuilding the dwca. 
-```python -from dwcahandler import Dwca - -class DerivedDwca(Dwca): - """ - Derived class to perform other custom operations that is not included as part of the core operations - """ - def drop_columns(self): - """ - Drop existing column in the core content - """ - self.core_content.df_content.drop(columns=['column1', 'column2'], inplace=True) - self._update_meta_fields(self.core_content) - - -dwca = DerivedDwca(dwca_file_loc='/tmp/dwca.zip') -dwca.extract_dwca() -dwca.drop_columns() -dwca.generate_eml() -dwca.generate_meta() -dwca.write_dwca('/tmp/newdwca.zip') - -``` \ No newline at end of file +* List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) +* Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. +The supported types are defined by the class column in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) +For eg: MetaElementTypes.OCCURRENCE \ No newline at end of file diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index 679b676..50d91f0 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -65,12 +65,14 @@ def extract_dwca(self, exclude_ext_files: list = None): pass @abstractmethod - def merge_contents(self, delta_dwca: BaseDwca, extension_sync: bool, regen_ids: bool): + def merge_contents(self, delta_dwca: BaseDwca, extension_sync: bool, match_by_filename: bool=False): """Construct a new DwCA by merging the contents of a delta DwCA with this archive. :param delta_dwca: The delta to merge :param extension_sync: Merge extensions - :param regen_ids: Regenerate link identifiers between the core and extension files. 
+ :param match_by_filename: Match the dwca and delta content by also filenames if supplied, + this is extra condition in case if there are more than 1 content with same class type in a dwca + in a rare circumstances """ pass @@ -82,11 +84,6 @@ def set_keys(self, keys: dict): def convert_associated_media_to_extension(self): pass - """ - @abstractmethod - def merge_df_dwc_columns(self): - pass - """ @abstractmethod def delete_records(self, records_to_delete: CsvFileType): pass @@ -114,17 +111,32 @@ def fill_additional_info(self): for multimedia_content, _ in contents: self.add_multimedia_info_to_content(multimedia_content) - def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca_path: Union[str, BytesIO]): + def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Union[str, BytesIO]): + """Delete records in dwca if the key records are defined in CsvFileType + + :param records_to_delete: A CsvFileType that containing the text file of the record keys, + the key names of the records and MetaElementType type class of the dwca + where the records need to be removed + :param output_dwca: output dwca path where the result of the dwca is writen to or the output dwca in memory + """ self.extract_dwca() self.delete_records(records_to_delete) self.generate_eml() self.generate_meta() - self.write_dwca(output_dwca_path) + self.write_dwca(output_dwca) - def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str | BytesIO], + def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, eml_content: Union[str, Eml] = ''): - + """Create a dwca given the contents of core and extensions and eml content + + :param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca + :param output_dwca: the resulting path of the dwca or the dwca in memory + :param ext_csv_list: list of CsvFileTypes containing the files, 
class types and keys to form the + extensions of the dwca if supplied + :param validate_content: whether to validate the contents + :param eml_content: eml content in string or a filled Eml object + """ if ext_csv_list is None: ext_csv_list = [] @@ -149,31 +161,46 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str | BytesIO], self.generate_meta() self.write_dwca(output_dwca) - # Key lookup: For merging to update content and also used as lookup to link extensions to core records. - # keys_lookup keys used for merging 2 dwcas - # regen_ids will generate new uuids for core csv and link coreids extensions to core records. # https://peps.python.org/pep-0484/#forward-references - def merge_dwca(self, delta_dwca: BaseDwca, output_dwca_path: Union[str, BytesIO], keys_lookup: dict = None, - extension_sync=False, regen_ids: bool = False, validate_delta: bool = True): + def merge_dwca(self, delta_dwca: BaseDwca, output_dwca: Union[str, BytesIO], keys_lookup: dict = None, + extension_sync=False, validate_delta: bool = True): + """Merging another dwca to bring in the new records and update the existing records + + :param delta_dwca: delta dwca that contains the updated or new records + :param output_dwca: output dwca containing the path to the physical file and the output of dwca writen in memory + :param keys_lookup: keys to lookup merging with delta_dwca to update content + :param extension_sync: + :param validate_delta: + """ self.extract_dwca() delta_dwca.extract_dwca() self.set_keys(keys_lookup) delta_dwca.set_keys(keys_lookup) if validate_delta and not delta_dwca.validate_content(): raise SystemExit(Exception("Some validations error found in the delta dwca. 
Dwca is not merged.")) - - self.merge_contents(delta_dwca, extension_sync, regen_ids) + self.merge_contents(delta_dwca, extension_sync) self.fill_additional_info() self.generate_eml() self.generate_meta() - self.write_dwca(output_dwca_path) + self.write_dwca(output_dwca) def validate_dwca(self, content_keys: dict, error_file: str): + """Validate dwca to check if content has unique keys. By default, validates the core content. + If additional checks required in another content, supply it as content_keys + + :param content_keys: a dictionary of class type and the key + for eg. {MetaElementTypes.OCCURRENCE, "occurrenceId"} + :param error_file: optional error_file for the errored data + """ self.extract_dwca() set_keys = self.set_keys(content_keys) - #content_type_to_validate = list(set_keys.keys()) return self.validate_content(content_to_validate=set_keys, error_file=error_file) def validate_file(self, csv: CsvFileType, error_file: str): + """Validate the text file + + :param csv: CsvFileType to pass the csv, key and type + :param error_file: optional error_file for the errored data + """ self.extract_csv_content(csv, CoreOrExtType.CORE) return self.validate_content(error_file=error_file) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index e61b367..8311894 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -17,10 +17,8 @@ from pathlib import Path from typing import Union from zipfile import ZipFile - import pandas as pd from numpy import nan -from pandas import isnull from pandas.errors import EmptyDataError from pandas.io import parsers from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, @@ -85,7 +83,7 @@ def count_stat(self, content): """ return len(content) - def _update_core_ids(self, core_df, keys: list) -> str: + def _update_core_ids(self, core_df) -> str: """Generate core identifiers for a core data frame. 
UUID identifiers are generated for each row in the core data frame. @@ -93,7 +91,6 @@ def _update_core_ids(self, core_df, keys: list) -> str: useful identifier is available in the source data. :param core_df: The data frame to generate identifiers for - :param keys: The keys to use for the id return id field """ id_field = "id" @@ -550,14 +547,14 @@ def _add_ext_lookup_key(self, df_content, core_df_content, core_keys, keys): # Extension Sync def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, - regen_ids: bool = False, match_by_filename: bool = False): + match_by_filename: bool = False): """Merge the contents of this DwCA with a delta DwCA :param delta_dwca: The delta DwCA to apply :param extension_sync: refresh the extensions from delta dwca if the occurrences exist in both - :param regen_ids: Regenerate unique identifiers for the records - :param match_by_filename: Match by filename of contents too + :param match_by_filename: Match by filename of contents apart from the content types. + This is particularly useful if a dwca contains more than one content of same type """ self.build_indexes() delta_dwca.build_indexes() @@ -583,18 +580,15 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, delta_content=delta_dwca.core_content, keys=self.core_content.keys) - if regen_ids: - self._update_core_ids(self.core_content.df_content) - for content in self.ext_content: - content.df_content = self._update_extension_ids( - content.df_content, self.core_content.df_content, self.core_content.keys) - def get_content(self, class_type: MetaElementTypes = None, name_space: str = None, file_name: str = None): - """Get the content based on the row type namespace. 
+ def get_content(self, class_type: MetaElementTypes = None, name_space: str = None, file_name: str = None) -> list: + """Get the content based on the class type, row type namespace and optional file name - :param name_space: The row type (a namespace URI) - :return: A tuple of the content data frame and whether - it is a core or extension (None, None) if not found + :param class_type: class_type MetaElementTypes class + :param name_space: The row type (a namespace URI) if it contains value + :param file_name: file_name to match if it contains value + :return: A list of tuples containing the content data frame and + core or extension type """ def check_content(content, class_type, name_space): if file_name and content.meta_info.file_name != file_name: @@ -890,7 +884,7 @@ def extract_csv_content(self, csv_info: CsvFileType, :param csv_info: The CSV file(s) :param core_ext_type: Whether this is a core or extension content frame - :param build_id_for_ext: indicator to build id and core id to support dwca with extension + :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension """ if isinstance(csv_info.files, pd.DataFrame): csv_content = csv_info.files.copy(deep=True) @@ -899,11 +893,11 @@ def extract_csv_content(self, csv_info: CsvFileType, # Use default occurrenceID if not provided keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else 'occurrenceID' - core_id_field: str = None + core_id_field: str = "" if build_coreid_for_ext: if len (keys) > 1: if core_ext_type == CoreOrExtType.CORE: - core_id_field = self._update_core_ids(csv_content, keys) + core_id_field = self._update_core_ids(csv_content) self._build_index_for_content(csv_content, keys) elif core_ext_type == CoreOrExtType.EXTENSION: csv_content, core_id_field = self._update_extension_ids( diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 14512ab..3fb7a90 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ 
b/src/dwcahandler/dwca/dwca_factory.py @@ -16,7 +16,7 @@ class DwcaHandler: @staticmethod - def list_dwc_terms() -> (pd.DataFrame, pd.DataFrame): + def list_terms() -> (pd.DataFrame, pd.DataFrame): return Terms().terms_df, Terms().class_df """Perform various DwCA operations""" @@ -38,48 +38,36 @@ def create_dwca(core_csv: CsvFileType, Dwca().create_dwca(core_csv=core_csv, ext_csv_list=ext_csv_list, output_dwca=output_dwca, validate_content=validate_content, eml_content=eml_content) - @staticmethod - def remove_extension_files(dwca_file: str, ext_files: list, output_dwca_path: str): - """Load a DwCA and remove extension files from it - - :param dwca_file: The path to the DwCA - :param ext_files: A list of extension files to delete - :param output_dwca_path: Where to place the resulting DwCA - """ - Dwca(dwca_file_loc=dwca_file).remove_extensions(exclude_ext_files=ext_files, - output_dwca_path=output_dwca_path) - @staticmethod def delete_records(dwca_file: Union[str, BytesIO], records_to_delete: CsvFileType, - output_dwca_path: Union[str, BytesIO]): + output_dwca: Union[str, BytesIO]): """Delete core records listed in the records_to_delete file from DwCA. 
The specified keys listed in records_to_delete param must exist in the dwca core file :param dwca_file: The path to the DwCA :param records_to_delete: File containing the records to delete and the column key for mapping - :param output_dwca_path: Where to place the resulting DwCA + :param output_dwca: Where to place the resulting DwCA or the dwca output in memory """ Dwca(dwca_file_loc=dwca_file).delete_records_in_dwca(records_to_delete=records_to_delete, - output_dwca_path=output_dwca_path) + output_dwca=output_dwca) @staticmethod - def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, BytesIO], output_dwca_path: Union[str, BytesIO], - keys_lookup: dict = None, extension_sync: bool = False, regen_ids: bool = False, + def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, BytesIO], + output_dwca: Union[str, BytesIO], keys_lookup: dict = None, extension_sync: bool = False, validate_delta_content: bool = True): """Merge a DwCA with a delta DwCA of changes. 
:param dwca_file: The path to the existing DwCA :param delta_dwca_file: The path to the DwCA containing the delta - :param output_dwca_path: Where to place the resulting + :param output_dwca: Where to place the resulting :param keys_lookup: The keys defining a unique row :param extension_sync: Synchronise extensions - :param regen_ids: Regenerate the unique ids used to tye core and extension records together :param validate_delta_content: Validate the delta DwCA before using """ delta_dwca = Dwca(dwca_file_loc=delta_dwca_file) - Dwca(dwca_file_loc=dwca_file).merge_dwca(delta_dwca=delta_dwca, output_dwca_path=output_dwca_path, + Dwca(dwca_file_loc=dwca_file).merge_dwca(delta_dwca=delta_dwca, output_dwca=output_dwca, keys_lookup=keys_lookup, extension_sync=extension_sync, - regen_ids=regen_ids, validate_delta=validate_delta_content) + validate_delta=validate_delta_content) @staticmethod def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None): diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index b1cc69e..1e3a78d 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -9,8 +9,7 @@ import xml.etree.ElementTree as ET from xml.dom import minidom import re - -from dataclasses import dataclass, field, asdict +from dataclasses import dataclass, field from typing import Optional from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms from enum import Enum @@ -20,20 +19,6 @@ MetaElementTypes = Enum ("MetaElementTypes", dict(DwcClassRowTypes)) -class MetaElementTypes1: - @staticmethod - def get_element_by_row_type(row_type: str): - """ - Find a row type by URI - - :param row_type: The row type URI - :return: The corresponding element - """ - for name, member in MetaElementTypes.__members__.items(): - if member.value == row_type: - return member - return None - @dataclass class MetaElementInfo: @@ -207,7 +192,8 @@ def _extract_meta_element(self, file_name): 
def update_meta_element(self, meta_element_info: MetaElementInfo, fields: list[str], index_field: str = None): """Replace or append meta information (based on file name) - :param meta_element_info: The info + :param index_field: The field that is also form part of the id/coreid + :param meta_element_info: The meta element info :param fields: The field list """ replace = False @@ -252,7 +238,6 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): for _, f in enumerate(meta_elem_attrib.fields): if f.field_name not in ('id', 'coreid'): field_elem = ET.SubElement(elem, "field") - # Note: f.index is int type if f.index is not None: field_elem.attrib['index'] = f.index if f.term: diff --git a/tests/input_files/event/cameratrap-sample1/event.txt b/tests/input_files/event/cameratrap-sample1/event.txt index c4093b3..82db1f6 100644 --- a/tests/input_files/event/cameratrap-sample1/event.txt +++ b/tests/input_files/event/cameratrap-sample1/event.txt @@ -1,18 +1,18 @@ eventID,parentEventID,eventType,eventDate,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingProtocol,samplingEffort,deploymentGroups,locality,eventRemarks,habitat -Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,,Survey,,,,,,,,,, -Danbulla_NP_2022,,Survey,,,,,,,,,, -CFRAG_01_bush_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait | tags: bait:none | Swapped low NIMH batteries for alkaline. Access behind large tree fall gap after following water pipes. 
& SD card = 34A & physical ID on cam = 7 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest -CFRAG_01_road_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait near hiking trail | tags: bait:none | & SD card = Oo2 & physical ID on cam = 31 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest -DBNP_07_road_20221005,Danbulla_NP_2022,Deployment,,-17.0900405,145.625301,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait near hiking trail | tags: bait:none | SD card reader didn't work, couldn't check test photos. To access site, turn left onto informal track when heading up main track and it turns right, then walk 60 m & animal signs are: Bird calls",tropical rainforest -DBNP_08_bush_20221005,Danbulla_NP_2022,Deployment,,-17.0812305,145.616759,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait | tags: bait:none | Access via ridge line from hiking track where road cam is, turn left before ridge declines. Couldn't check test images. 
& animal signs are: NA",tropical rainforest -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file +PEVENT_A,,Survey,,,,,,,,,, +PEVENT_B,,Survey,,,,,,,,,, +DEPLOY1_PE_A,PEVENT_A,Deployment,,-17.1,145.1,1,,,PEVENT_A,Site 1,sample remarks test 1,tropical rainforest +DEPLOY2_PE_A,PEVENT_A,Deployment,,-17.1,145.1,1,,,PEVENT_A,Site 1,sample remarks test 2,tropical rainforest 
+DEPLOY1_PE_B,PEVENT_B,Deployment,,-17.2,145.3,1,,,PEVENT_B,Site 2,sample remarks test 3,tropical rainforest +DEPLOY2_PE_B,PEVENT_B,Deployment,,-17.2,145.4,1,,,PEVENT_B,Site 2,sample remarks test 4,tropical rainforest +TR_1,DEPLOY1_PE_A,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, +TR_2,DEPLOY1_PE_A,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, +TR_3,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, +TR_4,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, +TR_5,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, +TR_6,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, +TR_7,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, +TR_8,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, +TR_9,DEPLOY2_PE_B,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, +TR_10,DEPLOY2_PE_B,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, +TR_11,DEPLOY2_PE_B,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt b/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt index 65aa801..a1c62a6 100644 --- a/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt +++ b/tests/input_files/event/cameratrap-sample1/measurement_or_fact.txt @@ -1,21 +1,21 @@ eventID,measurementID,measurementType,measurementValue,measurementAccuracy,measurementUnit,measurementDeterminedDate,measurementDeterminedBy,measurementRemarks -CFRAG_02_bush_20221215,11,cameraID,2065144,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,12,cameraModel,Reconyx,,,2023-02-11T23:58:40Z,ZA SK, 
-CFRAG_02_bush_20221215,13,cameraDelay,0,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,14,cameraHeight,0.3,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,15,cameraTilt,0,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_01_road_20221215,6,cameraID,2065144,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,7,cameraModel,Reconyx,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,8,cameraDelay,0,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,9,cameraHeight,0.3,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,10,cameraTilt,0,,,2023-03-06T13:20:18Z,ZA SK, -DBNP_07_road_20221005,226,cameraID,2065149,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,227,cameraModel,Hawkray,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,228,cameraDelay,5,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,229,cameraHeight,0.3,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,230,cameraTilt,0,,,2022-12-11T05:35:55Z,ZA, -DBNP_08_bush_20221005,231,cameraID,2065149,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,232,cameraModel,Hawkray,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,233,cameraDelay,5,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,234,cameraHeight,0.3,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,235,cameraTilt,0,,,2023-01-05T06:16:48Z,ZA, \ No newline at end of file +DEPLOY1_PE_A,11,cameraID,2065144,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,12,cameraModel,Reconyx,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,13,cameraDelay,0,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,14,cameraHeight,0.3,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,15,cameraTilt,0,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY2_PE_A,6,cameraID,2065144,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,7,cameraModel,Reconyx,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,8,cameraDelay,0,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,9,cameraHeight,0.3,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,10,cameraTilt,0,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY1_PE_B,226,cameraID,2065149,,,2022-12-11T05:35:55Z,ZA, 
+DEPLOY1_PE_B,227,cameraModel,Hawkray,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,228,cameraDelay,5,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,229,cameraHeight,0.3,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,230,cameraTilt,0,,,2022-12-11T05:35:55Z,ZA, +DEPLOY2_PE_B,231,cameraID,2065149,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,232,cameraModel,Hawkray,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,233,cameraDelay,5,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,234,cameraHeight,0.3,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,235,cameraTilt,0,,,2023-01-05T06:16:48Z,ZA, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample1/occurrence.txt b/tests/input_files/event/cameratrap-sample1/occurrence.txt index 337b5b3..85ef84c 100644 --- a/tests/input_files/event/cameratrap-sample1/occurrence.txt +++ b/tests/input_files/event/cameratrap-sample1/occurrence.txt @@ -1,12 +1,12 @@ eventID,occurrenceID,mediaID,scientificName,individualCount,lifeStage,sex,behavior,identifiedBy,occurrenceRemarks,occurrenceStatus,samplingProtocol,basisOfRecord,kingdom,classificationProbability,identificationRemarks,samplingEffort,eventDate -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215_Z_Amir_observationID_58,CFRAG_02_bush_20221215_Z_Amir_mediaID_332,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215_Z_Amir_observationID_59,CFRAG_02_bush_20221215_Z_Amir_mediaID_337,Megapodius reinwardt,2,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 -CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215_Z_Amir_observationID_212,CFRAG_01_road_20221215_Z_Amir_mediaID_1630,Alectura lathami,1,,,,Zachary Amir,,present,camera 
trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 -CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215_Z_Amir_observationID_483,CFRAG_01_road_20221215_Z_Amir_mediaID_3057,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 -CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215_Z_Amir_observationID_496,CFRAG_01_road_20221215_Z_Amir_mediaID_3137,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 -DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005_Z_Amir_observationID_1170,DBNP_07_road_20221005_Z_Amir_mediaID_5187,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 -DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005_Z_Amir_observationID_1110,DBNP_07_road_20221005_Z_Amir_mediaID_4992,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 -DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005_Z_Amir_observationID_1111,DBNP_07_road_20221005_Z_Amir_mediaID_4995,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005_Z_Amir_observationID_537,DBNP_08_bush_20221005_Z_Amir_mediaID_1656,Hypsiprymnodon 
moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005_Z_Amir_observationID_538,DBNP_08_bush_20221005_Z_Amir_mediaID_1662,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005_Z_Amir_observationID_539,DBNP_08_bush_20221005_Z_Amir_mediaID_1677,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file +TR_1,DEPLOY1_PE_A_OBID_58,DEPLOY1_PE_A_MEDIA_ID_332,SPECIES A,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 +TR_2,DEPLOY1_PE_A_OBID_59,DEPLOY1_PE_A_MEDIA_ID_337,SPECIES A,2,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 +TR_3,DEPLOY2_PE_A_OBID_212,DEPLOY2_PE_A_MEDIA_ID_1630,SPECIES B,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 +TR_4,DEPLOY2_PE_A_OBID_483,DEPLOY2_PE_A_MEDIA_ID_3057,SPECIES C,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 +TR_5,DEPLOY2_PE_A_OBID_496,DEPLOY2_PE_A_MEDIA_ID_3137,SPECIES A,1,,,,Jane Doe,,present,camera 
trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 +TR_6,DEPLOY1_PE_B_OBID_1170,DEPLOY1_PE_B_MEDIA_ID_5187,SPECIES C,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 +TR_7,DEPLOY1_PE_B_OBID_1110,DEPLOY1_PE_B_MEDIA_ID_4992,SPECIES D,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 +TR_8,DEPLOY1_PE_B_OBID_1111,DEPLOY1_PE_B_MEDIA_ID_4995,SPECIES D,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 +TR_9,DEPLOY2_PE_B_OBID_537,DEPLOY2_PE_B_MEDIA_ID_1656,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 +TR_10,DEPLOY2_PE_B_OBID_538,DEPLOY2_PE_B_MEDIA_ID_1662,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 +TR_11,DEPLOY2_PE_B_OBID_539,DEPLOY2_PE_B_MEDIA_ID_1677,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample2/event.txt b/tests/input_files/event/cameratrap-sample2/event.txt index c4093b3..4736190 100644 --- a/tests/input_files/event/cameratrap-sample2/event.txt +++ b/tests/input_files/event/cameratrap-sample2/event.txt @@ -1,18 +1,18 @@ 
eventID,parentEventID,eventType,eventDate,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingProtocol,samplingEffort,deploymentGroups,locality,eventRemarks,habitat -Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,,Survey,,,,,,,,,, -Danbulla_NP_2022,,Survey,,,,,,,,,, -CFRAG_01_bush_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait | tags: bait:none | Swapped low NIMH batteries for alkaline. Access behind large tree fall gap after following water pipes. & SD card = 34A & physical ID on cam = 7 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest -CFRAG_01_road_20221215,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Deployment,,-17.2911425,145.6316175,1,,,Central_Fragments_Eacham_and_Curtin_Fig_NPs_2022,Eacham_Curtain_Fig_NPs,camera trap with FALSE bait near hiking trail | tags: bait:none | & SD card = Oo2 & physical ID on cam = 31 working cam & had these animal signs: Bird calls & animal signs are: Bird calls,tropical rainforest -DBNP_07_road_20221005,Danbulla_NP_2022,Deployment,,-17.0900405,145.625301,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait near hiking trail | tags: bait:none | SD card reader didn't work, couldn't check test photos. To access site, turn left onto informal track when heading up main track and it turns right, then walk 60 m & animal signs are: Bird calls",tropical rainforest -DBNP_08_bush_20221005,Danbulla_NP_2022,Deployment,,-17.0812305,145.616759,1,,,Danbulla_NP_2022,Danbulla_NP,"camera trap with FALSE bait | tags: bait:none | Access via ridge line from hiking track where road cam is, turn left before ridge declines. Couldn't check test images. 
& animal signs are: NA",tropical rainforest -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, -CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, -DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, -DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file +PEVENT_A,,Survey,,,,,,,,,, +PEVENT_B,,Survey,,,,,,,,,, +DEPLOY1_PE_A,PEVENT_A,Deployment,,-17.1,145.2,1,,,PEVENT_A,Site 1,sample remarks test 1,tropical rainforest +DEPLOY2_PE_A,PEVENT_A,Deployment,,-17.1,145.2,1,,,PEVENT_A,Site 1,sample remarks test 2,tropical rainforest 
+DEPLOY1_PE_B,PEVENT_B,Deployment,,-17.3,145.3,1,,,PEVENT_B,Site 2,sample remarks test 3,tropical rainforest +DEPLOY2_PE_B,PEVENT_B,Deployment,,-17.4,145.4,1,,,PEVENT_B,Site 2,sample remarks test 4,tropical rainforest +TR_1,DEPLOY1_PE_A,Trigger,2022-12-17,,,,camera trap,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,,,, +TR_2,DEPLOY1_PE_A,Trigger,2022-12-18,,,,camera trap,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,,,, +TR_3,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,,,, +TR_4,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,,,, +TR_5,DEPLOY2_PE_A,Trigger,2022-12-15,,,,camera trap,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,,,, +TR_6,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,,,, +TR_7,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,,,, +TR_8,DEPLOY1_PE_B,Trigger,2022-12-10,,,,camera trap,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,,,, +TR_9,DEPLOY2_PE_B,Trigger,2022-11-05,,,,camera trap,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,,,, +TR_10,DEPLOY2_PE_B,Trigger,2022-11-05,,,,camera trap,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,,,, +TR_11,DEPLOY2_PE_B,Trigger,2022-11-06,,,,camera trap,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,,,, \ No newline at end of file diff --git a/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt b/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt index ca62325..d3c1b52 100644 --- a/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt +++ b/tests/input_files/event/cameratrap-sample2/extended_measurement_or_fact.txt @@ -1,21 +1,21 @@ eventID,measurementID,measurementType,measurementTypeID,measurementValue,measurementAccuracy,measurementUnit,measurementUnitID,measurementDeterminedDate,measurementDeterminedBy,measurementRemarks 
-CFRAG_02_bush_20221215,11,cameraID,cameraID_CFRAG_02_bush_20221215_3,2065144,,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,12,cameraModel,cameraModel_CFRAG_02_bush_20221215_3,Reconyx,,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,13,cameraDelay,cameraDelay_CFRAG_02_bush_20221215_3,0,,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,14,cameraHeight,cameraHeight_CFRAG_02_bush_20221215_3,0.3,,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_02_bush_20221215,15,cameraTilt,cameraTilt_CFRAG_02_bush_20221215_3,0,,,,2023-02-11T23:58:40Z,ZA SK, -CFRAG_01_road_20221215,6,cameraID,cameraID_CFRAG_01_road_20221215_2,2065144,,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,7,cameraModel,cameraModel_CFRAG_01_road_20221215_2,Reconyx,,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,8,cameraDelay,cameraDelay_CFRAG_01_road_20221215_2,0,,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,9,cameraHeight,cameraHeight_CFRAG_01_road_20221215_2,0.3,,,,2023-03-06T13:20:18Z,ZA SK, -CFRAG_01_road_20221215,10,cameraTilt,cameraTilt_CFRAG_01_road_20221215_2,0,,,,2023-03-06T13:20:18Z,ZA SK, -DBNP_07_road_20221005,226,cameraID,cameraID_DBNP_07_road_20221005_46,2065149,,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,227,cameraModel,cameraModel_DBNP_07_road_20221005_46,Hawkray,,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,228,cameraDelay,cameraDelay_DBNP_07_road_20221005_46,5,,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,229,cameraHeight,cameraHeight_DBNP_07_road_20221005_46,0.3,,,,2022-12-11T05:35:55Z,ZA, -DBNP_07_road_20221005,230,cameraTilt,cameraTilt_DBNP_07_road_20221005_46,0,,,,2022-12-11T05:35:55Z,ZA, -DBNP_08_bush_20221005,231,cameraID,cameraID_DBNP_08_bush_20221005_47,2065149,,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,232,cameraModel,cameraModel_DBNP_08_bush_20221005_47,Hawkray,,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,233,cameraDelay,cameraDelay_DBNP_08_bush_20221005_47,5,,,,2023-01-05T06:16:48Z,ZA, 
-DBNP_08_bush_20221005,234,cameraHeight,cameraHeight_DBNP_08_bush_20221005_47,0.3,,,,2023-01-05T06:16:48Z,ZA, -DBNP_08_bush_20221005,235,cameraTilt,cameraTilt_DBNP_08_bush_20221005_47,0,,,,2023-01-05T06:16:48Z,ZA, +DEPLOY1_PE_A,11,cameraID,cameraID_DEPLOY1_PE_A_3,2065144,,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,12,cameraModel,cameraModel_DEPLOY1_PE_A_3,Reconyx,,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,13,cameraDelay,cameraDelay_DEPLOY1_PE_A_3,0,,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,14,cameraHeight,cameraHeight_DEPLOY1_PE_A_3,0.3,,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY1_PE_A,15,cameraTilt,cameraTilt_DEPLOY1_PE_A_3,0,,,,2023-02-11T23:58:40Z,ZA SK, +DEPLOY2_PE_A,6,cameraID,cameraID_DEPLOY2_PE_A_2,2065144,,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,7,cameraModel,cameraModel_DEPLOY2_PE_A_2,Reconyx,,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,8,cameraDelay,cameraDelay_DEPLOY2_PE_A_2,0,,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,9,cameraHeight,cameraHeight_DEPLOY2_PE_A_2,0.3,,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY2_PE_A,10,cameraTilt,cameraTilt_DEPLOY2_PE_A_2,0,,,,2023-03-06T13:20:18Z,ZA SK, +DEPLOY1_PE_B,226,cameraID,cameraID_DEPLOY1_PE_B_46,2065149,,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,227,cameraModel,cameraModel_DEPLOY1_PE_B_46,Hawkray,,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,228,cameraDelay,cameraDelay_DEPLOY1_PE_B_46,5,,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,229,cameraHeight,cameraHeight_DEPLOY1_PE_B_46,0.3,,,,2022-12-11T05:35:55Z,ZA, +DEPLOY1_PE_B,230,cameraTilt,cameraTilt_DEPLOY1_PE_B_46,0,,,,2022-12-11T05:35:55Z,ZA, +DEPLOY2_PE_B,231,cameraID,cameraID_DEPLOY2_PE_B_47,2065149,,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,232,cameraModel,cameraModel_DEPLOY2_PE_B_47,Hawkray,,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,233,cameraDelay,cameraDelay_DEPLOY2_PE_B_47,5,,,,2023-01-05T06:16:48Z,ZA, +DEPLOY2_PE_B,234,cameraHeight,cameraHeight_DEPLOY2_PE_B_47,0.3,,,,2023-01-05T06:16:48Z,ZA, 
+DEPLOY2_PE_B,235,cameraTilt,cameraTilt_DEPLOY2_PE_B_47,0,,,,2023-01-05T06:16:48Z,ZA, diff --git a/tests/input_files/event/cameratrap-sample2/occurrence.txt b/tests/input_files/event/cameratrap-sample2/occurrence.txt index 337b5b3..cf093ee 100644 --- a/tests/input_files/event/cameratrap-sample2/occurrence.txt +++ b/tests/input_files/event/cameratrap-sample2/occurrence.txt @@ -1,12 +1,12 @@ eventID,occurrenceID,mediaID,scientificName,individualCount,lifeStage,sex,behavior,identifiedBy,occurrenceRemarks,occurrenceStatus,samplingProtocol,basisOfRecord,kingdom,classificationProbability,identificationRemarks,samplingEffort,eventDate -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_55,CFRAG_02_bush_20221215_Z_Amir_observationID_58,CFRAG_02_bush_20221215_Z_Amir_mediaID_332,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 -CFRAG_02_bush_20221215_Z_Amir_triggerEvent_56,CFRAG_02_bush_20221215_Z_Amir_observationID_59,CFRAG_02_bush_20221215_Z_Amir_mediaID_337,Megapodius reinwardt,2,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 -CFRAG_01_road_20221215_Z_Amir_triggerEvent_217,CFRAG_01_road_20221215_Z_Amir_observationID_212,CFRAG_01_road_20221215_Z_Amir_mediaID_1630,Alectura lathami,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 -CFRAG_01_road_20221215_Z_Amir_triggerEvent_521,CFRAG_01_road_20221215_Z_Amir_observationID_483,CFRAG_01_road_20221215_Z_Amir_mediaID_3057,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 
-CFRAG_01_road_20221215_Z_Amir_triggerEvent_533,CFRAG_01_road_20221215_Z_Amir_observationID_496,CFRAG_01_road_20221215_Z_Amir_mediaID_3137,Megapodius reinwardt,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 -DBNP_07_road_20221005_Z_Amir_triggerEvent_906,DBNP_07_road_20221005_Z_Amir_observationID_1170,DBNP_07_road_20221005_Z_Amir_mediaID_5187,Mammalia,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 -DBNP_07_road_20221005_Z_Amir_triggerEvent_860,DBNP_07_road_20221005_Z_Amir_observationID_1110,DBNP_07_road_20221005_Z_Amir_mediaID_4992,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 -DBNP_07_road_20221005_Z_Amir_triggerEvent_861,DBNP_07_road_20221005_Z_Amir_observationID_1111,DBNP_07_road_20221005_Z_Amir_mediaID_4995,Heteromyias cinereifrons,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_446,DBNP_08_bush_20221005_Z_Amir_observationID_537,DBNP_08_bush_20221005_Z_Amir_mediaID_1656,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_447,DBNP_08_bush_20221005_Z_Amir_observationID_538,DBNP_08_bush_20221005_Z_Amir_mediaID_1662,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 
100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 -DBNP_08_bush_20221005_Z_Amir_triggerEvent_448,DBNP_08_bush_20221005_Z_Amir_observationID_539,DBNP_08_bush_20221005_Z_Amir_mediaID_1677,Hypsiprymnodon moschatus,1,,,,Zachary Amir,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file +TR_1,DEPLOY1_PE_A_OBID_58,DEPLOY1_PE_A_MEDIA_ID_332,SPECIES A,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-17T08:25:51Z/2022-12-17T08:25:54Z,2022-12-17 +TR_2,DEPLOY1_PE_A_OBID_59,DEPLOY1_PE_A_MEDIA_ID_337,SPECIES A,2,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-18T02:30:01Z/2022-12-18T02:30:05Z,2022-12-18 +TR_3,DEPLOY2_PE_A_OBID_212,DEPLOY2_PE_A_MEDIA_ID_1630,SPECIES B,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T04:10:45Z/2022-12-15T04:10:49Z,2022-12-15 +TR_4,DEPLOY2_PE_A_OBID_483,DEPLOY2_PE_A_MEDIA_ID_3057,SPECIES C,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T13:20:10Z/2022-12-15T13:20:15Z,2022-12-15 +TR_5,DEPLOY2_PE_A_OBID_496,DEPLOY2_PE_A_MEDIA_ID_3137,SPECIES A,1,,,,Jane Doe,,present,camera
trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-15T20:09:48Z/2022-12-15T20:09:51Z,2022-12-15 +TR_6,DEPLOY1_PE_B_OBID_1170,DEPLOY1_PE_B_MEDIA_ID_5187,SPECIES C,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T17:05:36Z/2022-12-10T17:05:36Z,2022-12-10 +TR_7,DEPLOY1_PE_B_OBID_1110,DEPLOY1_PE_B_MEDIA_ID_4992,SPECIES D,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T20:14:11Z/2022-12-10T20:14:12Z,2022-12-10 +TR_8,DEPLOY1_PE_B_OBID_1111,DEPLOY1_PE_B_MEDIA_ID_4995,SPECIES D,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-12-10T22:42:03Z/2022-12-10T22:42:04Z,2022-12-10 +TR_9,DEPLOY2_PE_B_OBID_537,DEPLOY2_PE_B_MEDIA_ID_1656,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T01:10:49Z/2022-11-05T01:11:00Z,2022-11-05 +TR_10,DEPLOY2_PE_B_OBID_538,DEPLOY2_PE_B_MEDIA_ID_1662,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-05T06:07:16Z/2022-11-05T06:08:49Z,2022-11-05 +TR_11,DEPLOY2_PE_B_OBID_539,DEPLOY2_PE_B_MEDIA_ID_1677,SPECIES E,1,,,,Jane Doe,,present,camera trap,MachineObservation,Animalia,1.0,classified by a human with a degree of certainty of 100%,2022-11-06T05:07:16Z/2022-11-06T05:07:24Z,2022-11-06 \ No newline at end of file diff --git a/tests/test_delete_dwca_content.py b/tests/test_delete_dwca_content.py index 8f480ac..f6e291b 100644 --- a/tests/test_delete_dwca_content.py +++ b/tests/test_delete_dwca_content.py @@ -28,7 +28,7 @@ def test_delete_core_records(self): output_obj = BytesIO() - DwcaHandler.delete_records(dwca_file=dwca_obj, records_to_delete=delete_records, output_dwca_path=output_obj) + DwcaHandler.delete_records(dwca_file=dwca_obj, records_to_delete=delete_records, output_dwca=output_obj) expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=0) @@ -73,7 +73,7 @@ def test_delete_records_dwca_ext(self): output_obj = BytesIO() - DwcaHandler.delete_records(dwca_file=dwca_ext_obj, records_to_delete=delete_records, output_dwca_path=output_obj) + DwcaHandler.delete_records(dwca_file=dwca_ext_obj, records_to_delete=delete_records,
output_dwca=output_obj) expected_meta_xml = make_meta_xml_str(core_df=occ_df, ext_df=multimedia_df, use_col_idx_as_core_id=0) diff --git a/tests/test_listterms.py b/tests/test_listterms.py index 2d1be42..0f5101f 100644 --- a/tests/test_listterms.py +++ b/tests/test_listterms.py @@ -12,7 +12,7 @@ def test_list_dwc_terms(self): """ Test that mandatory terms are present """ - df, class_df = DwcaHandler.list_dwc_terms() + df, class_df = DwcaHandler.list_terms() assert df.query('term == "occurrenceID"').shape[0] == 1 assert df.query('term == "basisOfRecord"').shape[0] == 1 assert df.query('term == "scientificName"').shape[0] == 1 diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index 4122575..3967f31 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -35,7 +35,7 @@ def test_merge_core_records_with_id(self): keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, - output_dwca_path=output_obj, + output_dwca=output_obj, keys_lookup=keys_lookup) expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=0) @@ -89,7 +89,7 @@ def test_merge_core_records_without_id(self): keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, - output_dwca_path=output_obj, + output_dwca=output_obj, keys_lookup=keys_lookup) expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=-1) @@ -144,7 +144,7 @@ def test_merge_core_records_with_separate_id(self): keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] DwcaHandler.merge_dwca(dwca_file=dwca_obj, delta_dwca_file=delta_dwca_obj, - output_dwca_path=output_obj, + output_dwca=output_obj, keys_lookup=keys_lookup) expected_meta_xml = make_meta_xml_str(core_df=occ_df, use_col_idx_as_core_id=-2) @@ -213,7 +213,7 @@ def test_merge_core_and_ext_records_with_id(self): #keys_lookup[MetaElementTypes.OCCURRENCE] = 
['occurrenceID'] # must be set for the multimedia extension to be updated DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, - output_dwca_path=output_obj, + output_dwca=output_obj, keys_lookup=keys_lookup) expected_occ_df = pd.DataFrame(data=[["1", "species1", "-30.0000", "144.0000", nan], @@ -295,7 +295,7 @@ def test_merge_core_and_ext_records_with_separate_id(self): #keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] # must be set for the multimedia extension to be updated DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, - output_dwca_path=output_obj, + output_dwca=output_obj, keys_lookup=keys_lookup) expected_occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000", nan], From a2748c5b5555f8875e676c4e33c957de914bf8fc Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Thu, 20 Feb 2025 17:28:50 +1100 Subject: [PATCH 03/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - fix code formatting --- src/dwcahandler/dwca/base_dwca.py | 8 ++-- src/dwcahandler/dwca/core_dwca.py | 47 ++++++++++----------- src/dwcahandler/dwca/dwca_meta.py | 4 +- src/dwcahandler/dwca/terms.py | 29 ++++++------- src/dwcahandler/dwca/terms/terms.csv | 6 +-- src/dwcahandler/scripts/update_dwc_terms.py | 2 +- tests/__init__.py | 16 +++---- tests/test_create_core_and_ext_content.py | 32 +++++++------- tests/test_create_dwca.py | 17 +++----- tests/test_listterms.py | 44 +++++++++---------- tests/test_merge_dwca.py | 4 -- tests/test_multimedia_content.py | 4 +- tests/test_validate_dwca.py | 4 +- 13 files changed, 102 insertions(+), 115 deletions(-) diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index 50d91f0..a6db536 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -47,12 +47,12 @@ def generate_meta(self): pass @abstractmethod - def write_dwca(self, output_dwca_path: Union[str | BytesIO]): + def write_dwca(self, 
output_dwca: Union[str, BytesIO]): """Write the content of the DwCA to a directory. Writes all CSV files, as well as a meta-file and EML file for the archive. - :param output_dwca_path: The path to write to + :param output_dwca: The path to write to or dwca in memory """ pass @@ -65,10 +65,10 @@ def extract_dwca(self, exclude_ext_files: list = None): pass @abstractmethod - def merge_contents(self, delta_dwca: BaseDwca, extension_sync: bool, match_by_filename: bool=False): + def merge_contents(self, delta_dwca: BaseDwca, extension_sync: bool, match_by_filename: bool = False): """Construct a new DwCA by merging the contents of a delta DwCA with this archive. - :param delta_dwca: The delta to merge + :param delta_dwca: The delta dwca for merging :param extension_sync: Merge extensions :param match_by_filename: Match the dwca and delta content by also filenames if supplied, this is extra condition in case if there are more than 1 content with same class type in a dwca diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 8311894..43fcccf 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -201,9 +201,9 @@ def convert_values(v): return self.defaults_prop.translate_table[v] if v in invalid_values else v def _find_fields_with_zero_idx(meta_element_fields: list): - for field in meta_element_fields: - if field.index == "0": - return field + for elm_field in meta_element_fields: + if elm_field.index == "0": + return elm_field return None def _add_first_id_field_if_exists(meta_element: MetaElementAttributes): @@ -352,10 +352,11 @@ def set_keys(self, keys: dict = None): return set_keys - def _update_meta_fields(self, content: DfContent, key_field: str=None): + def _update_meta_fields(self, content: DfContent, key_field: str = None): """Update meta content fields by reading the content frame""" fields = self._read_header(content.df_content) - 
self.meta_content.update_meta_element(meta_element_info=content.meta_info, fields=fields, index_field=key_field) + self.meta_content.update_meta_element(meta_element_info=content.meta_info, fields=fields, + index_field=key_field) def _filter_content(self, df_content, delta_df_content): """Filter delta content that is not already in the existing content @@ -561,7 +562,7 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, for _, delta_content in enumerate(delta_dwca.ext_content): contents = self.get_content(class_type=delta_content.meta_info.type, - file_name=delta_content.meta_info.file_name if match_by_filename else "") + file_name=delta_content.meta_info.file_name if match_by_filename else "") for content, _ in contents: if extension_sync: self._delete_old_ext_records(content, self.core_content.df_content, @@ -571,8 +572,8 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, content.df_content = self._merge_df_content(content, delta_content, self.core_content.keys) - if len (contents) == 0: - # Copy delta ext content into self ext content + if len(contents) == 0: + # Copy delta ext content into self ext content self.ext_content.append(delta_content) self._update_meta_fields(delta_content) @@ -580,7 +581,6 @@ def merge_contents(self, delta_dwca: Dwca, extension_sync: bool = False, delta_content=delta_dwca.core_content, keys=self.core_content.keys) - def get_content(self, class_type: MetaElementTypes = None, name_space: str = None, file_name: str = None) -> list: """Get the content based on the class type, row type namespace and optional file name @@ -590,22 +590,22 @@ def get_content(self, class_type: MetaElementTypes = None, name_space: str = Non :return: A list of tuples containing the content data frame and core or extension type """ - def check_content(content, class_type, name_space): - if file_name and content.meta_info.file_name != file_name: + def check_content(current_content, class_type_to_match, 
name_space_to_match): + if file_name and current_content.meta_info.file_name != file_name: return False - if ((class_type and content.meta_info.type == class_type) or - (name_space and content.meta_info.type.value == name_space)): + if ((class_type_to_match and current_content.meta_info.type == class_type_to_match) or + (name_space_to_match and current_content.meta_info.type.value == name_space_to_match)): return True return False contents = [] - if check_content(self.core_content, class_type=class_type, name_space=name_space): - contents.append((self.core_content, CoreOrExtType.CORE)) + if check_content(self.core_content, class_type_to_match=class_type, name_space_to_match=name_space): + contents.append((self.core_content, CoreOrExtType.CORE)) for content in self.ext_content: - if check_content(content, class_type=class_type, name_space=name_space): + if check_content(content, class_type_to_match=class_type, name_space_to_match=name_space): contents.append((content, CoreOrExtType.EXTENSION)) return contents @@ -729,7 +729,6 @@ def convert_associated_media_to_extension(self): log.info("%s associated media extracted", str(len(image_df))) return CsvFileType(files=image_df, type=MetaElementTypes.MULTIMEDIA, keys=self.core_content.keys) - #keys=image_df.index.names) log.info("Nothing to extract from associated media") @@ -808,7 +807,7 @@ def to_lower(df): df_keys = to_lower(content_keys_df) duplicate_condition = df_keys.duplicated(keep='first') if duplicate_condition.values.any(): - log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum()) + log.error("Duplicate %s found. 
Total rows affected: %s", keys, duplicate_condition.sum()) log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack())) if error_file: report_error(content_keys_df, keys, "Duplicate Values", @@ -895,7 +894,7 @@ def extract_csv_content(self, csv_info: CsvFileType, keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else 'occurrenceID' core_id_field: str = "" if build_coreid_for_ext: - if len (keys) > 1: + if len(keys) > 1: if core_ext_type == CoreOrExtType.CORE: core_id_field = self._update_core_ids(csv_content) self._build_index_for_content(csv_content, keys) @@ -960,13 +959,13 @@ def _write_associated_files(self, dwca_zip: ZipFile): for file in self.embedded_files: dwca_zip.write(file, file.name) - def write_dwca(self, output_dwca_path: Union[str | BytesIO]): + def write_dwca(self, output_dwca: Union[str, BytesIO]): """Write a full DwCA to a zip file Any parent directories needed are created during writing. - :param output_dwca_path: The file path to write the .zip file to + :param output_dwca: The file path to write the .zip file to or dwca in memory """ - with ZipFile(output_dwca_path, 'w', allowZip64=True, + with ZipFile(output_dwca, 'w', allowZip64=True, compression=zipfile.ZIP_DEFLATED) as dwca_zip: self._write_df_content_to_zip_file(dwca_zip=dwca_zip, content=self.core_content) for ext in self.ext_content: @@ -976,7 +975,7 @@ def write_dwca(self, output_dwca_path: Union[str | BytesIO]): dwca_zip.writestr(self.defaults_prop.eml_xml_filename, self.eml_content) self._write_associated_files(dwca_zip=dwca_zip) dwca_zip.close() - log.info("Dwca written to: %s", output_dwca_path) + log.info("Dwca written to: %s", output_dwca) def _read_csv(self, csv_file: str | io.TextIOWrapper, @@ -1033,7 +1032,7 @@ def _read_csv(self, log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file) # Strip column header spaces - ret_val.rename(str.strip, axis = 'columns', inplace=True) + ret_val.rename(str.strip, axis='columns', 
inplace=True) return ret_val diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 1e3a78d..0fe0e53 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -17,7 +17,7 @@ DwcClassRowTypes = Terms.get_class_row_types() -MetaElementTypes = Enum ("MetaElementTypes", dict(DwcClassRowTypes)) +MetaElementTypes = Enum("MetaElementTypes", dict(DwcClassRowTypes)) @dataclass @@ -164,7 +164,7 @@ def __get_terms(self, field_elm): return col_name if len(self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]) <= 0 \ else self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]['uri'].values[0] - def map_headers(self, headers: list[str], index_field: str=None) -> (list[Field], Field): + def map_headers(self, headers: list[str], index_field: str = None) -> (list[Field], Field): """Map header column names onto a list of fields. Column names are mapped onto fields based on name, URI or qualified name diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index 2c4be39..bea0964 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -35,6 +35,7 @@ class NsPrefix(Enum): GBIF = "gbif" OBIS = "obis" + class ExtInfo(NamedTuple): """ Extension info @@ -43,6 +44,7 @@ class ExtInfo(NamedTuple): prefix: NsPrefix namespace: str + class GbifRegisteredExt(ExtInfo, Enum): """ Supported Gbif extensions. 
Add more extensions to expand the class row type and terms @@ -91,15 +93,15 @@ def _update_class_csv(ns: NsPrefix, updates: pd.DataFrame): """ if len(updates) > 0 and "class_uri" in updates.columns.tolist(): updates.insert(0, "class", - updates["class_uri"].apply(lambda x: - f"{Terms.extract_term(term_string = x, add_underscore = True).upper()}")) + updates["class_uri"].apply( + lambda x: f"{Terms.extract_term(term_string = x, add_underscore = True).upper()}")) updates["prefix"] = ns.value Terms._update_csv(ns, updates, True) return updates @staticmethod - def _update_csv(ns: NsPrefix, updates: pd.DataFrame, is_class: bool=True): + def _update_csv(ns: NsPrefix, updates: pd.DataFrame, is_class: bool = True): """ Update class rowtype or terms by replacing all the rows by prefix. @@ -120,8 +122,8 @@ def _update_csv(ns: NsPrefix, updates: pd.DataFrame, is_class: bool=True): df = pd.concat([df, updates[col_list]], ignore_index=False) df.to_csv(file, index=False) - log.info("Rows updated in %s: %s of %s", Path(Terms.CLASS_ROW_TYPE).name, - len(updates), len(df)) + log.info("Rows updated in %s: %s of %s", + Path(Terms.CLASS_ROW_TYPE).name, len(updates), len(df)) else: log.info("No updates to class csv %s", Path(Terms.CLASS_ROW_TYPE).name) @@ -197,7 +199,7 @@ def update_gbif_ext(): def _get_latest(identifier: str): d = requests.get(Terms.GBIF_EXT).json() gbif_ext_df = pd.DataFrame.from_dict(d["extensions"]) - ext_df = gbif_ext_df[(gbif_ext_df["identifier"]==identifier) & (gbif_ext_df["isLatest"]==True)] + ext_df = gbif_ext_df[(gbif_ext_df["identifier"] == identifier) & (gbif_ext_df["isLatest"])] url: str = "" if len(ext_df) > 0 and "url" in ext_df.columns.tolist(): url = ext_df["url"].values[0] @@ -205,8 +207,8 @@ def _get_latest(identifier: str): def _extract_term_info(every_term: tuple) -> list: def _extract_value(text: str): - return text.replace('\\',"").\ - replace('"',"").replace("'","").split("=")[1] + return text.replace('\\', "").\ + replace('"', "").replace("'", 
"").split("=")[1] term_name = _extract_value(every_term[0]) namespace = _extract_value(every_term[1]) @@ -235,18 +237,13 @@ def _extract_value(text: str): std_ns = ["http://rs.tdwg.org/dwc/terms/", "http://purl.org/dc/terms/"] existing_terms = Terms().terms_df extra_terms_df = df[(df["namespace"].isin(std_ns)) & (~df["uri"].isin(existing_terms["uri"]))] - log.info ("Additional standard terms found:\n%s", extra_terms_df) + log.info("Additional standard terms found:\n%s", extra_terms_df) new_terms = df[~df["uri"].isin(existing_terms["uri"])] if len(new_terms) > 0: new_terms["prefix"] = supported_ext.prefix.value Terms._update_csv(supported_ext.prefix, new_terms, False) - @staticmethod def update_terms(): - Terms.update_dwc_terms() - Terms.update_gbif_ext() - - - -#Terms.update_terms() + Terms.update_dwc_terms() + Terms.update_gbif_ext() diff --git a/src/dwcahandler/dwca/terms/terms.csv b/src/dwcahandler/dwca/terms/terms.csv index 6db290e..ef943e3 100644 --- a/src/dwcahandler/dwca/terms/terms.csv +++ b/src/dwcahandler/dwca/terms/terms.csv @@ -54,6 +54,9 @@ dc,temporal,http://purl.org/dc/terms/temporal dc,title,http://purl.org/dc/terms/title dc,type,http://purl.org/dc/terms/type dc,valid,http://purl.org/dc/terms/valid +obis,measurementTypeID,http://rs.iobis.org/obis/terms/measurementTypeID +obis,measurementValueID,http://rs.iobis.org/obis/terms/measurementValueID +obis,measurementUnitID,http://rs.iobis.org/obis/terms/measurementUnitID dwc,acceptedNameUsage,http://rs.tdwg.org/dwc/terms/acceptedNameUsage dwc,acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID dwc,associatedMedia,http://rs.tdwg.org/dwc/terms/associatedMedia @@ -268,6 +271,3 @@ dwc,verticalDatum,http://rs.tdwg.org/dwc/terms/verticalDatum dwc,vitality,http://rs.tdwg.org/dwc/terms/vitality dwc,waterBody,http://rs.tdwg.org/dwc/terms/waterBody dwc,year,http://rs.tdwg.org/dwc/terms/year -obis,measurementTypeID,http://rs.iobis.org/obis/terms/measurementTypeID 
-obis,measurementValueID,http://rs.iobis.org/obis/terms/measurementValueID -obis,measurementUnitID,http://rs.iobis.org/obis/terms/measurementUnitID diff --git a/src/dwcahandler/scripts/update_dwc_terms.py b/src/dwcahandler/scripts/update_dwc_terms.py index 2f21b7c..0478000 100644 --- a/src/dwcahandler/scripts/update_dwc_terms.py +++ b/src/dwcahandler/scripts/update_dwc_terms.py @@ -19,4 +19,4 @@ def update_terms(): Call the update_dwc_terms to get the latest version of tdwg dwc terms Do we need to get a particular version of csv url to pass in?? """ - Terms.update_dwc_terms() + Terms.update_terms() diff --git a/tests/__init__.py b/tests/__init__.py index aa50483..e20f445 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,6 +5,7 @@ import csv from dwcahandler import Eml from xml.dom.minidom import parseString +from dwcahandler import MetaDwCA def get_eml_content(): @@ -20,9 +21,8 @@ def make_fields(columns: list, term_uri: str, field_start: int = 0, core_id: str fields = "" idx_start = 0 if field_start != -1: - fields = core_id if core_id else "" - idx_start = field_start if field_start != -2 else 0 - + fields = core_id if core_id else "" + idx_start = field_start if field_start != -2 else 0 for idx, col in enumerate(columns): if not (col in ["id", "coreid"]): @@ -56,12 +56,11 @@ def make_meta_xml_str(core_df: pd.DataFrame, ext_df: pd.DataFrame = None, use_co :return: str """ core_columns = core_df.columns.to_list() - field_start = use_col_idx_as_core_id #1 if any(x for x in core_columns if x in ["id", "coreid"]) else use_col_idx_as_core_id id_idx = use_col_idx_as_core_id if use_col_idx_as_core_id >= 0 else 0 - fields = make_fields(core_columns, "http://rs.tdwg.org/dwc/terms", field_start, + fields = make_fields(core_columns, "http://rs.tdwg.org/dwc/terms", use_col_idx_as_core_id, f'') ext_str = make_ext_str(ext_df.columns.to_list(), "http://purl.org/dc/terms", - field_start, id_idx) \ + use_col_idx_as_core_id, id_idx) \ if isinstance(ext_df, pd.DataFrame) 
else '' meta_xml_str = f''' @@ -108,12 +107,9 @@ def remove_pretty_print_xml(input_xml): return output_xml -from dwcahandler import MetaDwCA -from io import BytesIO - def get_xml_from_file(expected_file: str): dwca_meta = MetaDwCA() - dwca_meta.read_meta_file (meta_file=expected_file) + dwca_meta.read_meta_file(meta_file=expected_file) dwca_meta.create() expected_str = str(dwca_meta) return expected_str diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py index 5a1a974..696b277 100644 --- a/tests/test_create_core_and_ext_content.py +++ b/tests/test_create_core_and_ext_content.py @@ -16,10 +16,12 @@ "delimiter": "\t"} duplicates_csv_occ_test = {"file_paths": single_csv_occ_test["file_paths"] + multiple_csv_occ_test["file_paths"], "delimiter": ","} -csv_occ_with_space = {"file_paths": ['./input_files/sample/occurrence/occ_file1.csv', './input_files/sample/occ_header_with_space.csv'], - "delimiter": ","} -multimedia_with_space = {"file_paths": ['./input_files/sample/multimedia/multimedia_file.csv', './input_files/sample/multimedia_header_with_space.csv'], +csv_occ_with_space = {"file_paths": ['./input_files/sample/occurrence/occ_file1.csv', + './input_files/sample/occ_header_with_space.csv'], "delimiter": ","} +multimedia_with_space = {"file_paths": ['./input_files/sample/multimedia/multimedia_file.csv', + './input_files/sample/multimedia_header_with_space.csv'], + "delimiter": ","} def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = ","): @@ -28,8 +30,6 @@ def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = for df in dfs: all_records_df = pd.concat([all_records_df, df], ignore_index=True) all_records_df.drop_duplicates(inplace=True) - #all_records_df.set_index(keys=keys, drop=False, inplace=True) - #all_records_df.reset_index(inplace=True) return all_records_df @@ -168,12 +168,12 @@ def test_extract_csv_with_header_space(self): core_ext_type=CoreOrExtType.CORE) 
expected_column_list = ["catalogNumber", "basisOfRecord", "scientificName", - "license","decimalLatitude","decimalLongitude"] + "license", "decimalLatitude", "decimalLongitude"] assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) assert len(dwca_creator.core_content.df_content) == 5 pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"], - pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name='catalogNumber'), - check_index_type=False, check_index=False) + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name='catalogNumber'), + check_index_type=False, check_index=False) def test_extract_csv_ext_with_header_space(self): """ @@ -196,16 +196,18 @@ def test_extract_csv_ext_with_header_space(self): core_ext_type=CoreOrExtType.EXTENSION) expected_column_list = ["catalogNumber", "basisOfRecord", "scientificName", - "license","decimalLatitude","decimalLongitude"] + "license", "decimalLatitude", "decimalLongitude"] assert set(dwca_creator.core_content.df_content.columns) == set(expected_column_list) assert len(dwca_creator.core_content.df_content) == 5 - pdtest.assert_series_equal(dwca_creator.core_content.df_content["catalogNumber"], - pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), - check_index_type=False, check_index=False) + pdtest.assert_series_equal( + dwca_creator.core_content.df_content["catalogNumber"], + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), + check_index_type=False, check_index=False) expected_column_list = ["catalogNumber", "identifier", "format", "type"] assert set(dwca_creator.ext_content[0].df_content.columns) == set(expected_column_list) assert len(dwca_creator.ext_content[0].df_content) == 5 - pdtest.assert_series_equal(dwca_creator.ext_content[0].df_content["catalogNumber"], - pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), - check_index_type=False, check_index=False) + 
pdtest.assert_series_equal( + dwca_creator.ext_content[0].df_content["catalogNumber"], + pd.Series(["C1", "C2", "C3", "C4", "C5"], dtype=str, name="catalogNumber"), + check_index_type=False, check_index=False) diff --git a/tests/test_create_dwca.py b/tests/test_create_dwca.py index 67f6ee8..b35090f 100644 --- a/tests/test_create_dwca.py +++ b/tests/test_create_dwca.py @@ -38,7 +38,7 @@ def check_output(output_obj: BytesIO, test_files_folder: str, check_core_id: boo pd.testing.assert_frame_equal(actual_df, expected_df) else: core_id_list = ["id", "coreid"] - assert any(found :=[i for i in core_id_list if i in actual_df.columns.to_list()]) + assert any(found := [i for i in core_id_list if i in actual_df.columns.to_list()]) actual_df = actual_df.drop(columns=[found[0]]) for col in expected_df.columns: expected_df = expected_df[~expected_df[col].str.contains('ERROR')] @@ -66,15 +66,14 @@ def test_create_occurrence_dwca_occurrence(self): check_output(output_obj, test_files_folder) - def test_create_occurrence_dwca_occurrence_multiple_keys(self): test_files_folder = "./input_files/occurrence/sample2" core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], - keys=['institutionCode','collectionCode','catalogNumber'], + keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], - keys=['institutionCode','collectionCode','catalogNumber'], + keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -86,15 +85,14 @@ def test_create_occurrence_dwca_occurrence_multiple_keys(self): check_output(output_obj, test_files_folder, check_core_id=True) - def test_create_occurrence_dwca_occurrence_extra_multimedia_records(self): test_files_folder = "./input_files/occurrence/sample3" core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], - 
keys=['institutionCode','collectionCode','catalogNumber'], + keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], - keys=['institutionCode','collectionCode','catalogNumber'], + keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -113,9 +111,9 @@ def test_create_event_dwca_sample1(self): core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], type=MetaElementTypes.EVENT) ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['eventID'], - type=MetaElementTypes.OCCURRENCE) + type=MetaElementTypes.OCCURRENCE) ext2_csv = CsvFileType(files=[f"{test_files_folder}/measurement_or_fact.txt"], keys=['eventID'], - type=MetaElementTypes.MEASUREMENT_OR_FACT) + type=MetaElementTypes.MEASUREMENT_OR_FACT) output_obj = BytesIO() @@ -145,4 +143,3 @@ def test_create_event_dwca_sample2(self): assert output_obj check_output(output_obj, test_files_folder) - diff --git a/tests/test_listterms.py b/tests/test_listterms.py index 0f5101f..0de55d7 100644 --- a/tests/test_listterms.py +++ b/tests/test_listterms.py @@ -19,37 +19,37 @@ def test_list_dwc_terms(self): assert df.query('term == "decimalLatitude"').shape[0] == 1 assert df.query('term == "decimalLongitude"').shape[0] == 1 assert df.query('term == "eventDate"').shape[0] == 1 - assert len(class_df[class_df["class"]=="OCCURRENCE"]) == 1 + assert len(class_df[class_df["class"] == "OCCURRENCE"]) == 1 def test_update_list_terms(self, mocker): """ Test that the terms are stored in expected format and deprecated terms are not brought over """ - mocker.patch.object(Terms, attribute="get_dwc_source_data", - return_value=pd.DataFrame( - {"term_localName": ["occurrenceID", "basisOfRecord", - "scientificName", "oldTerm"], - "term_isDefinedBy": ["http://rs.tdwg.org/dwc/terms/", - "http://rs.tdwg.org/dwc/terms/", - 
"http://rs.tdwg.org/dwc/terms/", - "http://rs.tdwg.org/dwc/terms/"], - "term_deprecated": [nan, nan, nan, "true"], - "tdwgutility_organizedInClass": ["http://rs.tdwg.org/dwc/terms/Occurrence", - "http://rs.tdwg.org/dwc/terms/Occurrence", - "http://rs.tdwg.org/dwc/terms/Occurrence", - "http://rs.tdwg.org/dwc/terms/Occurrence"]})) + mocker.patch.object(Terms, + attribute="get_dwc_source_data", + return_value=pd.DataFrame + ({"term_localName": ["occurrenceID", "basisOfRecord", "scientificName", "oldTerm"], + "term_isDefinedBy": ["http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/"], + "term_deprecated": [nan, nan, nan, "true"], + "tdwgutility_organizedInClass": ["http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence", + "http://rs.tdwg.org/dwc/terms/Occurrence"]})) mocker.patch('pandas.DataFrame.to_csv') return_dwc_df, return_dwc_class_df = Terms.update_dwc_terms() pd.testing.assert_frame_equal(left=return_dwc_df, - right=pd.DataFrame({"prefix": [NsPrefix.DWC.value, NsPrefix.DWC.value, - NsPrefix.DWC.value], - "term": ["occurrenceID", "basisOfRecord", "scientificName"], - "uri": ["http://rs.tdwg.org/dwc/terms/occurrenceID", - "http://rs.tdwg.org/dwc/terms/basisOfRecord", - "http://rs.tdwg.org/dwc/terms/scientificName"]}), + right=pd.DataFrame( + {"prefix": [NsPrefix.DWC.value, NsPrefix.DWC.value, NsPrefix.DWC.value], + "term": ["occurrenceID", "basisOfRecord", "scientificName"], + "uri": ["http://rs.tdwg.org/dwc/terms/occurrenceID", + "http://rs.tdwg.org/dwc/terms/basisOfRecord", + "http://rs.tdwg.org/dwc/terms/scientificName"]}), check_like=True) pd.testing.assert_frame_equal(left=return_dwc_class_df, right=pd.DataFrame({"prefix": [NsPrefix.DWC.value], - "class": ["OCCURRENCE"], - "class_uri": ["http://rs.tdwg.org/dwc/terms/Occurrence"]}), + "class": ["OCCURRENCE"], + "class_uri": 
["http://rs.tdwg.org/dwc/terms/Occurrence"]}), check_like=True) diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index 3967f31..25bdc6f 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -118,7 +118,6 @@ def test_merge_core_records_without_id(self): zf.close() - def test_merge_core_records_with_separate_id(self): """ Test for core record merging (update existing and add new rows) @@ -173,7 +172,6 @@ def test_merge_core_records_with_separate_id(self): zf.close() - def test_merge_core_and_ext_records_with_id(self): """ Test for core and extension record merging (update existing and add new rows, columns) @@ -210,7 +208,6 @@ def test_merge_core_and_ext_records_with_id(self): keys_lookup: dict = dict() keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] - #keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] # must be set for the multimedia extension to be updated DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, output_dwca=output_obj, @@ -292,7 +289,6 @@ def test_merge_core_and_ext_records_with_separate_id(self): keys_lookup: dict = dict() keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] - #keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] # must be set for the multimedia extension to be updated DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, output_dwca=output_obj, diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index 93c03b0..796f2d6 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -64,9 +64,9 @@ def test_extract_associate_media(self): assert sorted(list(map(attrgetter('field_name'), dwca.meta_content.meta_elements[0].fields))) == \ sorted(['occurrenceID', 'scientificName']) - #pd.testing.assert_frame_equal(associated_media_image_ext.files, image_ext.files) + pd.testing.assert_frame_equal(associated_media_image_ext.files.reset_index(drop=True), 
image_ext.files) assert associated_media_image_ext.type == image_ext.type - #assert associated_media_image_ext.keys[0] == image_ext.keys[0] + assert associated_media_image_ext.keys[0] == image_ext.keys[0] dwca.extract_csv_content(csv_info=associated_media_image_ext, core_ext_type=CoreOrExtType.EXTENSION, diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py index a98eeda..3c2063c 100644 --- a/tests/test_validate_dwca.py +++ b/tests/test_validate_dwca.py @@ -72,5 +72,5 @@ def test_duplicate_columns_in_dwca(self): with pytest.raises(ValueError) as exc_info: DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) - assert "Duplicate columns ['catalogNumber'] specified in the " \ - "metadata for occurrence.csv" in str(exc_info.value) + assert ("Duplicate columns ['catalogNumber'] specified in the metadata for occurrence.csv" + in str(exc_info.value)) From a93056ccdabe6ac316f56e1ca52b5579ea9c5c64 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 5 Mar 2025 09:15:19 +1100 Subject: [PATCH 04/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - changes to validate content and other tweaks --- pyproject.toml | 2 +- src/dwcahandler/dwca/__init__.py | 29 ++++++++++++++++++++++++---- src/dwcahandler/dwca/base_dwca.py | 18 +++++++++-------- src/dwcahandler/dwca/core_dwca.py | 11 ++++++----- src/dwcahandler/dwca/dwca_meta.py | 32 +++++++++++++------------------ 5 files changed, 55 insertions(+), 37 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 281d22b..44a0b9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dwcahandler" -version = "0.4.0" +version = "1.0.0b1" description = "Python package to handle Darwin Core Archive (DwCA) operations. 
This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns" authors = ["Atlas of Living Australia data team "] maintainers = ["Atlas of Living Australia data team "] diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index 8c3281b..40c4aba 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -16,6 +16,8 @@ the (usually Darwin Core) terms that each column contains. """ +from __future__ import annotations + from collections import namedtuple from dataclasses import dataclass, field from typing import Optional, Union @@ -160,13 +162,14 @@ class Defaults: # Imports at end of file to allow classes to be used from dwcahandler.dwca.terms import Terms, NsPrefix -from dwcahandler.dwca.dwca_meta import MetaElementTypes, MetaElementInfo, MetaDwCA, MetaElementAttributes +from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA, + MetaElementAttributes, get_meta_class_row_type) @dataclass class CsvFileType: """A description of a CSV file in a DwCA """ files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe - type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,... + type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,... keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record # when creating dwca. for core other than occurrence, this neeeds to be supplied as key. # column keys lookup in core or extension for delete records @@ -174,8 +177,26 @@ class CsvFileType: csv_encoding: CSVEncoding = field( default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"', csv_escape_char='"')) - # delimiter: Optional[str] = None - # file delimiter type when reading the csv. 
if not supplied, the collectory setting delimiter is read in for the dr + + def check_for_empty(self, include_keys = True): + if self.files and len(self.files) > 0 and \ + self.type and isinstance(self.type, MetaElementTypes) and \ + (not include_keys or include_keys and self.keys and len(self.keys) > 0): + return True + return False + + def add_data(self, other_csv_file_type: CsvFileType): + if self.type and self.type == other_csv_file_type.type: + if isinstance(self.files, pd.DataFrame) and isinstance(other_csv_file_type.files, pd.DataFrame): + self.files = pd.concat([self.files, other_csv_file_type.files], ignore_index=False) + return True + elif isinstance(self.files, list) and isinstance(other_csv_file_type.files, list): + self.files.append(other_csv_file_type.files) + return True + elif not self.type: + self.files = other_csv_file_type.files + self.type = other_csv_file_type.type + return False from dwcahandler.dwca.eml import Eml from dwcahandler.dwca.base_dwca import BaseDwca diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index a6db536..f80ba2d 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -89,7 +89,7 @@ def delete_records(self, records_to_delete: CsvFileType): pass @abstractmethod - def validate_content(self, content_to_validate: list[MetaElementTypes] = None, error_file: str = None): + def validate_content(self, content_to_validate: dict = None, error_file: str = None): pass @abstractmethod @@ -127,7 +127,7 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, - eml_content: Union[str, Eml] = ''): + eml_content: Union[str, Eml] = '', additional_validation_on_content: list[CsvFileType] = None): """Create a dwca given the contents of core and extensions and eml content :param core_csv: CsvFileType 
containing the files, class types and keys to form the core of the dwca @@ -136,6 +136,7 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], extensions of the dwca if supplied :param validate_content: whether to validate the contents :param eml_content: eml content in string or a filled Eml object + :param additional_validation_on_content: additional validation to perform """ if ext_csv_list is None: ext_csv_list = [] @@ -143,10 +144,6 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], self.extract_csv_content(csv_info=core_csv, core_ext_type=CoreOrExtType.CORE, build_coreid_for_ext=True if len(ext_csv_list) > 0 else False) - # Only validate core content - if validate_content and not self.validate_content(): - raise SystemExit(Exception("Some validations error found. Dwca is not created.")) - # if multimedia files is supplied, do not attempt to convert associated media to multimedia if not any(ext.type == MetaElementTypes.MULTIMEDIA for ext in ext_csv_list): image_ext = self.convert_associated_media_to_extension() @@ -154,9 +151,14 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], ext_csv_list.append(image_ext) for ext in ext_csv_list: - self.extract_csv_content(ext, CoreOrExtType.EXTENSION, True) + self.extract_csv_content(csv_info=ext, core_ext_type=CoreOrExtType.EXTENSION, + build_coreid_for_ext=True) self.fill_additional_info() + + if validate_content and not self.validate_content(additional_validation_on_content): + raise SystemExit(Exception("Some validations error found. 
Dwca is not created.")) + self.generate_eml(eml_content) self.generate_meta() self.write_dwca(output_dwca) @@ -202,5 +204,5 @@ def validate_file(self, csv: CsvFileType, error_file: str): :param csv: CsvFileType to pass the csv, key and type :param error_file: optional error_file for the errored data """ - self.extract_csv_content(csv, CoreOrExtType.CORE) + self.extract_csv_content(csv_info=csv, core_ext_type=CoreOrExtType.CORE) return self.validate_content(error_file=error_file) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 43fcccf..d5bfcc8 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -858,13 +858,14 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N :return: True if the DwCA is value, False otherwise """ - content_set_to_validate = {self.core_content.meta_info.type: self.core_content.keys} + set_to_validate = {self.core_content.meta_info.type: self.core_content.keys} if content_to_validate: - for class_type, content_key in content_to_validate.items(): - if type != self.core_content.meta_info.type: - content_set_to_validate[class_type] = content_key + for class_type, content_keys in content_to_validate.items(): + if not (class_type == self.core_content.meta_info.type and + set(content_keys) == set(self.core_content.keys)): + set_to_validate[class_type] = content_keys - for class_type, key in content_set_to_validate.items(): + for class_type, key in set_to_validate.items(): contents = self.get_content(class_type=class_type) for content, _ in contents: keys_df = self._extract_keys(content.df_content, content.keys) diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 0fe0e53..0ac29f0 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -19,6 +19,18 @@ MetaElementTypes = Enum("MetaElementTypes", dict(DwcClassRowTypes)) +def get_meta_class_row_type(row_type_uri: str): + """ + 
Find a row type by URI + + :param row_type: The row type URI + :return: The corresponding element + """ + for name, member in MetaElementTypes.__members__.items(): + if member.value == row_type_uri: + return member + return None + @dataclass class MetaElementInfo: @@ -74,18 +86,6 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type): def extract_field_attr_value(field_elm, attrib): return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None - def __get_element_by_row_type(row_type: str): - """ - Find a row type by URI - - :param row_type: The row type URI - :return: The corresponding element - """ - for name, member in MetaElementTypes.__members__.items(): - if member.value == row_type: - return member - return None - fields = node_elm.findall(f'{ns}field') id_field = [] if core_or_ext_type == 'core': @@ -95,7 +95,7 @@ def __get_element_by_row_type(row_type: str): file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text meta_element_info = MetaElementInfo( core_or_ext_type=core_or_ext_type, - type=__get_element_by_row_type(node_elm.attrib['rowType']), + type=get_meta_class_row_type(node_elm.attrib['rowType']), csv_encoding=CSVEncoding( csv_delimiter=node_elm.attrib['fieldsTerminatedBy'], csv_eol=node_elm.attrib['linesTerminatedBy'], @@ -183,12 +183,6 @@ def map_headers(self, headers: list[str], index_field: str = None) -> (list[Fiel field_list.append(field_elm) return field_list, id_index - def _extract_meta_element(self, file_name): - for _, elm in enumerate(self.meta_elements): - if elm.meta_element_type.file_name == file_name: - return elm - return None - def update_meta_element(self, meta_element_info: MetaElementInfo, fields: list[str], index_field: str = None): """Replace or append meta information (based on file name) From eed134fa10e897c016d3d3e59300533ac3088c71 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 12 Mar 2025 13:11:39 +1100 Subject: [PATCH 05/23] 
https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - makes sure terms and classrowtype is sorted, plus other fixes --- README.md | 14 ++++++++++---- pyproject.toml | 2 +- src/dwcahandler/dwca/core_dwca.py | 10 ++++++++-- src/dwcahandler/dwca/dwca_factory.py | 7 ++++++- src/dwcahandler/dwca/dwca_meta.py | 6 +++--- src/dwcahandler/dwca/terms/class-rowtype.csv | 14 +++++++------- src/dwcahandler/dwca/terms/terms.csv | 6 +++--- tests/test_listterms.py | 18 +++++++++++------- 8 files changed, 49 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 6822ebb..e142cbd 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,16 @@ To install published package from testpypi pip install -i https://test.pypi.org/simple/ dwcahandler ```   +* List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) +* Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. + The supported types are defined by the class column in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) + For eg: MetaElementTypes.OCCURRENCE +```python +from dwcahandler import DwcaHandler + +DwcaHandler.list_class_rowtypes() +``` +  ### Examples of dwcahandler usages: * Create Darwin Core Archive from csv file @@ -136,7 +146,3 @@ df_terms, df_class = DwcaHandler.list_terms() print(df_terms, df_class) ```   -* List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) -* Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. 
-The supported types are defined by the class column in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) -For eg: MetaElementTypes.OCCURRENCE \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 44a0b9f..3e4d7b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ metapype = "^0.0.26" flake8 = "^7.1.1" [tool.poetry.scripts] -update-dwc-terms = "dwcahandler.scripts.update_dwc_terms:update_terms" +update-terms = "dwcahandler.scripts.update_terms:update_terms" [build-system] requires = ["poetry-core"] diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index d5bfcc8..ab0809b 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -876,6 +876,9 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N if not self._validate_columns(content): return False + log.info("Validation successful for %s %s content for unique keys %s", + content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + return True def extract_csv_content(self, csv_info: CsvFileType, @@ -892,7 +895,10 @@ def extract_csv_content(self, csv_info: CsvFileType, csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding) # Use default occurrenceID if not provided - keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else 'occurrenceID' + if core_ext_type == CoreOrExtType.CORE: + keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else ['occurrenceID'] + else: + keys = self.core_content.keys core_id_field: str = "" if build_coreid_for_ext: if len(keys) > 1: @@ -979,7 +985,7 @@ def write_dwca(self, output_dwca: Union[str, BytesIO]): log.info("Dwca written to: %s", output_dwca) def _read_csv(self, - csv_file: str | io.TextIOWrapper, + csv_file: Union[str, io.TextIOWrapper], csv_encoding_param: CSVEncoding = MISSING, columns: list = None, ignore_header_lines: int = 0, diff --git 
a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 3fb7a90..e1bfaab 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -6,7 +6,7 @@ import logging from typing import Union import pandas as pd -from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml +from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes from io import BytesIO logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) @@ -19,6 +19,11 @@ class DwcaHandler: def list_terms() -> (pd.DataFrame, pd.DataFrame): return Terms().terms_df, Terms().class_df + @staticmethod + def list_class_rowtypes() : + for name, member in MetaElementTypes.__members__.items(): + print(f"{name}: {member.value}") + """Perform various DwCA operations""" @staticmethod diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 0ac29f0..8cb3fde 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -23,7 +23,7 @@ def get_meta_class_row_type(row_type_uri: str): """ Find a row type by URI - :param row_type: The row type URI + :param row_type_uri: The row type URI :return: The corresponding element """ for name, member in MetaElementTypes.__members__.items(): @@ -164,7 +164,7 @@ def __get_terms(self, field_elm): return col_name if len(self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]) <= 0 \ else self.terms_df[self.terms_df['term'].str.lower() == col_name.lower()]['uri'].values[0] - def map_headers(self, headers: list[str], index_field: str = None) -> (list[Field], Field): + def map_headers(self, headers: list[str], index_field: str = None) -> (list[Field], Optional[Field]): """Map header column names onto a list of fields. 
Column names are mapped onto fields based on name, URI or qualified name @@ -174,7 +174,7 @@ def map_headers(self, headers: list[str], index_field: str = None) -> (list[Fiel :return: The corresponding field list """ field_list: list[Field] = [] - id_index: Field = None + id_index = None for i, col in enumerate(headers): col_name = self.__remove_prefix(col) field_elm = Field(index=str(i), field_name=col_name, term=self.__get_terms(col_name)) diff --git a/src/dwcahandler/dwca/terms/class-rowtype.csv b/src/dwcahandler/dwca/terms/class-rowtype.csv index d8f79ec..fea1226 100644 --- a/src/dwcahandler/dwca/terms/class-rowtype.csv +++ b/src/dwcahandler/dwca/terms/class-rowtype.csv @@ -1,14 +1,14 @@ prefix,class,class_uri -dwc,TAXON,http://rs.tdwg.org/dwc/terms/Taxon -dwc,OCCURRENCE,http://rs.tdwg.org/dwc/terms/Occurrence -dwc,ORGANISM,http://rs.tdwg.org/dwc/terms/Organism -dwc,MATERIAL_ENTITY,http://rs.tdwg.org/dwc/terms/MaterialEntity +dwc,EVENT,http://rs.tdwg.org/dwc/terms/Event dwc,GEOLOGICAL_CONTEXT,http://rs.tdwg.org/dwc/terms/GeologicalContext -dwc,LOCATION,http://purl.org/dc/terms/Location dwc,IDENTIFICATION,http://rs.tdwg.org/dwc/terms/Identification -dwc,EVENT,http://rs.tdwg.org/dwc/terms/Event +dwc,LOCATION,http://purl.org/dc/terms/Location +dwc,MATERIAL_ENTITY,http://rs.tdwg.org/dwc/terms/MaterialEntity dwc,MATERIAL_SAMPLE,http://rs.tdwg.org/dwc/terms/MaterialSample dwc,MEASUREMENT_OR_FACT,http://rs.tdwg.org/dwc/terms/MeasurementOrFact +dwc,OCCURRENCE,http://rs.tdwg.org/dwc/terms/Occurrence +dwc,ORGANISM,http://rs.tdwg.org/dwc/terms/Organism dwc,RESOURCE_RELATIONSHIP,http://rs.tdwg.org/dwc/terms/ResourceRelationship -obis,EXTENDED_MEASUREMENT_OR_FACT,http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact +dwc,TAXON,http://rs.tdwg.org/dwc/terms/Taxon gbif,MULTIMEDIA,http://rs.gbif.org/terms/1.0/Multimedia +obis,EXTENDED_MEASUREMENT_OR_FACT,http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact diff --git a/src/dwcahandler/dwca/terms/terms.csv 
b/src/dwcahandler/dwca/terms/terms.csv index ef943e3..0a9b3e7 100644 --- a/src/dwcahandler/dwca/terms/terms.csv +++ b/src/dwcahandler/dwca/terms/terms.csv @@ -54,9 +54,6 @@ dc,temporal,http://purl.org/dc/terms/temporal dc,title,http://purl.org/dc/terms/title dc,type,http://purl.org/dc/terms/type dc,valid,http://purl.org/dc/terms/valid -obis,measurementTypeID,http://rs.iobis.org/obis/terms/measurementTypeID -obis,measurementValueID,http://rs.iobis.org/obis/terms/measurementValueID -obis,measurementUnitID,http://rs.iobis.org/obis/terms/measurementUnitID dwc,acceptedNameUsage,http://rs.tdwg.org/dwc/terms/acceptedNameUsage dwc,acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID dwc,associatedMedia,http://rs.tdwg.org/dwc/terms/associatedMedia @@ -271,3 +268,6 @@ dwc,verticalDatum,http://rs.tdwg.org/dwc/terms/verticalDatum dwc,vitality,http://rs.tdwg.org/dwc/terms/vitality dwc,waterBody,http://rs.tdwg.org/dwc/terms/waterBody dwc,year,http://rs.tdwg.org/dwc/terms/year +obis,measurementTypeID,http://rs.iobis.org/obis/terms/measurementTypeID +obis,measurementUnitID,http://rs.iobis.org/obis/terms/measurementUnitID +obis,measurementValueID,http://rs.iobis.org/obis/terms/measurementValueID diff --git a/tests/test_listterms.py b/tests/test_listterms.py index 0de55d7..1ac9094 100644 --- a/tests/test_listterms.py +++ b/tests/test_listterms.py @@ -39,17 +39,21 @@ def test_update_list_terms(self, mocker): "http://rs.tdwg.org/dwc/terms/Occurrence", "http://rs.tdwg.org/dwc/terms/Occurrence"]})) mocker.patch('pandas.DataFrame.to_csv') - return_dwc_df, return_dwc_class_df = Terms.update_dwc_terms() - pd.testing.assert_frame_equal(left=return_dwc_df, + return_terms_df, return_class_df = Terms.update_terms() + return_dwc_terms_df = return_terms_df[return_terms_df.prefix.isin(['dwc'])].copy().reset_index(drop=True) + return_dwc_class_df = return_class_df[return_class_df.prefix.isin(['dwc'])].copy().reset_index(drop=True) + 
pd.testing.assert_frame_equal(left=return_dwc_terms_df, right=pd.DataFrame( {"prefix": [NsPrefix.DWC.value, NsPrefix.DWC.value, NsPrefix.DWC.value], - "term": ["occurrenceID", "basisOfRecord", "scientificName"], - "uri": ["http://rs.tdwg.org/dwc/terms/occurrenceID", - "http://rs.tdwg.org/dwc/terms/basisOfRecord", + "term": ["basisOfRecord", "occurrenceID", "scientificName"], + "uri": ["http://rs.tdwg.org/dwc/terms/basisOfRecord", + "http://rs.tdwg.org/dwc/terms/occurrenceID", "http://rs.tdwg.org/dwc/terms/scientificName"]}), - check_like=True) + check_index_type=False, + check_dtype=False) pd.testing.assert_frame_equal(left=return_dwc_class_df, right=pd.DataFrame({"prefix": [NsPrefix.DWC.value], "class": ["OCCURRENCE"], "class_uri": ["http://rs.tdwg.org/dwc/terms/Occurrence"]}), - check_like=True) + check_index_type=False, + check_dtype=False) From c3aa2ed9280cf9f08ee95024cc37347a92985030 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 12 Mar 2025 13:12:16 +1100 Subject: [PATCH 06/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - makes sure terms and classrowtype is sorted, plus other fixes --- src/dwcahandler/dwca/terms.py | 132 ++++++++++++------ .../{update_dwc_terms.py => update_terms.py} | 4 - 2 files changed, 89 insertions(+), 47 deletions(-) rename src/dwcahandler/scripts/{update_dwc_terms.py => update_terms.py} (86%) diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index bea0964..eb1dd8a 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -12,6 +12,9 @@ this_dir, this_filename = os.path.split(__file__) +log.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + level=log.DEBUG) +log = log.getLogger("DwcaTerms") def absolute_file_paths(directory): """Convert files in a directory into absolute paths and return @@ -34,6 +37,7 @@ class NsPrefix(Enum): DC = "dc" GBIF = "gbif" OBIS = "obis" + AC = "ac" class ExtInfo(NamedTuple): @@ -49,13 +53,15 @@ class 
GbifRegisteredExt(ExtInfo, Enum): """ Supported Gbif extensions. Add more extensions to expand the class row type and terms """ - EXTENDED_MEASUREMENT_OR_FACT = ExtInfo(uri="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact", - prefix=NsPrefix.OBIS, - namespace="http://rs.iobis.org/obis/terms/") SIMPLE_MULTIMEDIA = ExtInfo(uri="http://rs.gbif.org/terms/1.0/Multimedia", prefix=NsPrefix.GBIF, namespace="http://rs.gbif.org/terms/1.0/") - + EXTENDED_MEASUREMENT_OR_FACT = ExtInfo(uri="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact", + prefix=NsPrefix.OBIS, + namespace="http://rs.iobis.org/obis/terms/") + #AC_MULTIMEDIA = ExtInfo(uri="http://rs.tdwg.org/ac/terms/Multimedia", + # prefix=NsPrefix.AC, + # namespace="http://rs.tdwg.org/ac/terms/") @dataclass class Terms: @@ -83,56 +89,56 @@ def __post_init__(self): self.terms_df = pd.read_csv(Terms.TERMS_FILE_PATH, dtype='str') self.class_df = pd.read_csv(Terms.CLASS_ROW_TYPE_PATH, dtype='str') - @staticmethod - def _update_class_csv(ns: NsPrefix, updates: pd.DataFrame): + def _update_class_df(self, ns: NsPrefix, updates: pd.DataFrame): """ Update class rowtype by replacing all the rows by prefix. 
:param ns: Name prefix :param updates: dataframe containing the class rows to update """ + def __get_class_term(existing_class_df, class_uri, prefix): + class_term = Terms.extract_term(term_string=class_uri, add_underscore=True).upper() + if len(existing_class_df[existing_class_df['class'].str.contains(class_term)]) > 0: + return f"{prefix.upper()}_{class_term}" + return class_term + if len(updates) > 0 and "class_uri" in updates.columns.tolist(): updates.insert(0, "class", updates["class_uri"].apply( - lambda x: f"{Terms.extract_term(term_string = x, add_underscore = True).upper()}")) + lambda x: f"{__get_class_term(self.class_df, x, ns.value)}")) updates["prefix"] = ns.value + return self._update_df(ns, updates, self.class_df) - Terms._update_csv(ns, updates, True) - return updates + return self.class_df - @staticmethod - def _update_csv(ns: NsPrefix, updates: pd.DataFrame, is_class: bool = True): + def _update_df(self, ns: NsPrefix, updates: pd.DataFrame, df: pd.DataFrame): """ - Update class rowtype or terms by replacing all the rows by prefix. + Update class row type or terms by replacing all the rows by prefix. :param ns: Name prefix :param updates: dataframe containing the class rows or terms to update - :param is_class: True if it is a class rowtype. 
False if this is terms + :param df: dataframe to update """ + def __get_update_info (update_df: pd.DataFrame): + update_type: str = "term" + count = len(update_df) + if 'class' in update_df.columns.tolist(): + update_type = "class" + return count, update_type - col_list = ["prefix", "class", "class_uri"] if is_class else ["prefix", "term", "uri"] - file = Terms.CLASS_ROW_TYPE_PATH if is_class else Terms.TERMS_FILE_PATH + col_list = df.columns.tolist() if all(col in updates.columns.tolist() for col in col_list): - df = updates - if Path(file).is_file(): - df = pd.read_csv(file) - if len(df) > 0: - df = df[df["prefix"] != ns.value] - df = pd.concat([df, updates[col_list]], ignore_index=False) - - df.to_csv(file, index=False) - log.info("Rows updated in %s: %s of %s", - Path(Terms.CLASS_ROW_TYPE).name, len(updates), len(df)) - else: - log.info("No updates to class csv %s", Path(Terms.CLASS_ROW_TYPE).name) + df = pd.concat([df, updates[col_list]], ignore_index=True) + log.info("Refreshed %s %s prefix %s", *__get_update_info(updates), ns) + return df @staticmethod def get_dwc_source_data() -> pd.DataFrame: return pd.read_csv(Terms.DWC_SOURCE_URL, delimiter=",", encoding='utf-8', dtype='str') - @staticmethod - def update_dwc_terms(): + #@staticmethod + def update_dwc_terms(self): """ Pull the latest terms from gbif dwc csv url and update the darwin core vocab terms in the package For reference: dublin-core-terms is derived from @@ -148,7 +154,7 @@ def update_dwc_terms(): dwc_df["prefix"] = NsPrefix.DWC.value if len(dwc_df) > 0: - Terms._update_csv(NsPrefix.DWC, dwc_df, False) + self.terms_df = self._update_df(NsPrefix.DWC, dwc_df, self.terms_df) dwc_class_df = pd.DataFrame() dwc_class_df["class_uri"] = df["tdwgutility_organizedInClass"].unique() @@ -158,9 +164,7 @@ def update_dwc_terms(): log.info("Total class downloaded: %i", len(dwc_class_df)) if len(dwc_class_df) > 0: - dwc_class_df = Terms._update_class_csv(NsPrefix.DWC, dwc_class_df) - - return dwc_df, dwc_class_df 
+ self.class_df = self._update_class_df(NsPrefix.DWC, dwc_class_df) @staticmethod def extract_term(term_string, add_underscore: bool = False): @@ -191,8 +195,8 @@ def get_class_row_types(): class_list = list(tuple(zip(class_df["class"], class_df["class_uri"]))) return class_list - @staticmethod - def update_gbif_ext(): + #@staticmethod + def update_gbif_ext(self): """ Update the class row type and terms specified by GBIF_REGISTERED_EXTENSION and update by prefix """ @@ -220,7 +224,7 @@ def _extract_value(text: str): url = _get_latest(supported_ext.uri) if url: update_class = pd.DataFrame([supported_ext.uri], columns=["class_uri"]) - Terms._update_class_csv(supported_ext.prefix, update_class) + self.class_df = self._update_class_df(supported_ext.prefix, update_class) with urlopen(url) as f: @@ -235,15 +239,57 @@ def _extract_value(text: str): df = pd.DataFrame(term_info, columns=["term", "namespace", 'uri']) std_ns = ["http://rs.tdwg.org/dwc/terms/", "http://purl.org/dc/terms/"] - existing_terms = Terms().terms_df + existing_terms = self.terms_df #Terms().terms_df extra_terms_df = df[(df["namespace"].isin(std_ns)) & (~df["uri"].isin(existing_terms["uri"]))] - log.info("Additional standard terms found:\n%s", extra_terms_df) - new_terms = df[~df["uri"].isin(existing_terms["uri"])] + if len(extra_terms_df) > 0: + log.info("Additional standard terms found:\n%s", extra_terms_df) + new_terms = df[~df["uri"].isin(existing_terms["uri"])].copy() if len(new_terms) > 0: - new_terms["prefix"] = supported_ext.prefix.value - Terms._update_csv(supported_ext.prefix, new_terms, False) + new_terms.loc[:, "prefix"] = supported_ext.prefix.value + self.terms_df = self._update_df(supported_ext.prefix, new_terms, self.terms_df) @staticmethod def update_terms(): - Terms.update_dwc_terms() - Terms.update_gbif_ext() + """ + Refresh all the terms except for dublin core terms with dc prefix. 
As these are not obtained dynamically + :return: + """ + def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame: + """ + Make sure dc and dwc prefixes stay on top + :param df: dataframe + :return: sorted dataFrame + """ + df_to_sort = df_to_sort.sort_values(by=["prefix", sorting_column], key=lambda x: x.str.lower()) + std_filter_df = df_to_sort.prefix.isin(["dc", "dwc"]) + std_df = df_to_sort[std_filter_df].copy() + ext_df = df_to_sort[~std_filter_df].copy() + return pd.concat([std_df, ext_df], ignore_index=True) + + + log.info("Current class and terms") + + exclude_update_prefixes = [NsPrefix.DC.value] + terms = Terms() + print(terms.class_df.groupby(["prefix"]).agg( + class_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") + )) + print(terms.terms_df.groupby(["prefix"]).agg( + term_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") + )) + terms.class_df = terms.class_df[terms.class_df.prefix.isin(exclude_update_prefixes)] + terms.terms_df = terms.terms_df[terms.terms_df.prefix.isin(exclude_update_prefixes)] + terms.update_dwc_terms() + terms.update_gbif_ext() + terms.class_df = __sort_values(terms.class_df, "class") + terms.terms_df = __sort_values(terms.terms_df, "term") + terms.class_df.to_csv(Terms.CLASS_ROW_TYPE_PATH, index=False) + terms.terms_df.to_csv(Terms.TERMS_FILE_PATH, index=False) + + print(terms.class_df.groupby(["prefix"]).agg( + class_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") + )) + print(terms.terms_df.groupby(["prefix"]).agg( + term_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") + )) + return terms.terms_df, terms.class_df diff --git a/src/dwcahandler/scripts/update_dwc_terms.py b/src/dwcahandler/scripts/update_terms.py similarity index 86% rename from src/dwcahandler/scripts/update_dwc_terms.py rename to src/dwcahandler/scripts/update_terms.py index 0478000..1bc01b8 100644 --- a/src/dwcahandler/scripts/update_dwc_terms.py +++ b/src/dwcahandler/scripts/update_terms.py @@ 
-10,10 +10,6 @@ from dwcahandler.dwca.terms import Terms -# Need to populate the Dwc term version programmatically -DWC_TERM_VERSION = "2023-09-17" - - def update_terms(): """ Call the update_dwc_terms to get the latest version of tdwg dwc terms From 388a526ea365d8fb1dd66cadfd60c34a3925952b Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 12 Mar 2025 14:15:36 +1100 Subject: [PATCH 07/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - fix for validate keys with url for eg: http://rs.gbif.org/terms/1.0/gbifID --- src/dwcahandler/dwca/core_dwca.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index ab0809b..a96b3be 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -22,7 +22,7 @@ from pandas.errors import EmptyDataError from pandas.io import parsers from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, - CsvFileType, Defaults, Eml, + CsvFileType, Defaults, Eml, Terms, MetaDwCA, MetaElementInfo, MetaElementTypes, MetaElementAttributes, Stat, record_diff_stat) @@ -347,8 +347,15 @@ def set_keys(self, keys: dict = None): contents = self.get_content(class_type=k) # If found then set the key for the content for dwca_content, _ in contents: - dwca_content.keys = [v] if isinstance(v, str) else v - set_keys[k] = v + key_list = [v] if isinstance(v, str) else v + col_term = [] + for a_key in key_list: + if a_key not in dwca_content.df_content.columns.tolist(): + col_term.append(Terms.extract_term(a_key)) + else: + col_term.append(a_key) + dwca_content.keys = col_term + set_keys[k] = col_term return set_keys From 64c897c179d0326f800e8340786414a3401c55e6 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Thu, 13 Mar 2025 13:19:04 +1100 Subject: [PATCH 08/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - update readme and some test cases --- README.md | 8 ++++++++ 
tests/test_create_core_and_ext_content.py | 2 -- tests/test_create_dwca.py | 12 +++++------- tests/test_multimedia_content.py | 6 ++---- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e142cbd..66fa212 100644 --- a/README.md +++ b/README.md @@ -58,10 +58,18 @@ To install published package from testpypi pip install -i https://test.pypi.org/simple/ dwcahandler ```   +### Extensions supported: +Standard Darwin Core Terms and Class +Simple Multimedia https://rs.gbif.org/extension/gbif/1.0/multimedia.xml +Extended Measurement Or Fact http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact + * List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) + * Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. The supported types are defined by the class column in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) For eg: MetaElementTypes.OCCURRENCE + +To list all the class rowtypes supported ```python from dwcahandler import DwcaHandler diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py index 696b277..ce633ed 100644 --- a/tests/test_create_core_and_ext_content.py +++ b/tests/test_create_core_and_ext_content.py @@ -134,7 +134,6 @@ def test_extract_tsv_ext_content(self): multimedia_file_path = 'input_files/sample/multimedia/multimedia_file.tsv' dwca_creator.extract_csv_content(csv_info=CsvFileType(files=[multimedia_file_path], type=MetaElementTypes.MULTIMEDIA, - keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter='\t')), core_ext_type=CoreOrExtType.EXTENSION) @@ -191,7 +190,6 @@ def test_extract_csv_ext_with_header_space(self): dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multimedia_with_space['file_paths'], type=MetaElementTypes.MULTIMEDIA, - keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=',')), core_ext_type=CoreOrExtType.EXTENSION) diff --git 
a/tests/test_create_dwca.py b/tests/test_create_dwca.py index b35090f..b9499a7 100644 --- a/tests/test_create_dwca.py +++ b/tests/test_create_dwca.py @@ -54,7 +54,7 @@ def test_create_occurrence_dwca_occurrence(self): core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], keys=['occurrenceID'], + ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -73,7 +73,6 @@ def test_create_occurrence_dwca_occurrence_multiple_keys(self): keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], - keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -92,7 +91,6 @@ def test_create_occurrence_dwca_occurrence_extra_multimedia_records(self): keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], - keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -110,9 +108,9 @@ def test_create_event_dwca_sample1(self): core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], type=MetaElementTypes.EVENT) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['eventID'], + ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext2_csv = CsvFileType(files=[f"{test_files_folder}/measurement_or_fact.txt"], keys=['eventID'], + ext2_csv = CsvFileType(files=[f"{test_files_folder}/measurement_or_fact.txt"], type=MetaElementTypes.MEASUREMENT_OR_FACT) output_obj = BytesIO() @@ -130,9 +128,9 @@ def 
test_create_event_dwca_sample2(self): core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], type=MetaElementTypes.EVENT) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['eventID'], + ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext2_csv = CsvFileType(files=[f"{test_files_folder}/extended_measurement_or_fact.txt"], keys=['eventID'], + ext2_csv = CsvFileType(files=[f"{test_files_folder}/extended_measurement_or_fact.txt"], type=MetaElementTypes.EXTENDED_MEASUREMENT_OR_FACT) output_obj = BytesIO() diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index 796f2d6..a6f2c81 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -151,8 +151,7 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", "format", "type"]), - type=MetaElementTypes.MULTIMEDIA, - keys=['occurrenceID']), + type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) # Fill multimedia extension info @@ -209,8 +208,7 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", "format"]), - type=MetaElementTypes.MULTIMEDIA, - keys=['occurrenceID']), + type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) # Fill multimedia extension info From aac762b0ee2cc25795f0e57be32ecb103e4b2e1d Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Thu, 13 Mar 2025 14:49:08 +1100 Subject: [PATCH 09/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - more test cases for validation with identifier urls and same class types for core and extns --- README.md | 13 ++++-- 
src/dwcahandler/dwca/core_dwca.py | 17 +++++-- tests/input_files/dwca/dwca-sample6/meta.xml | 36 +++++++++++++++ .../dwca/dwca-sample6/occurrence.txt | 10 ++++ .../dwca/dwca-sample6/verbatim.txt | 10 ++++ tests/input_files/dwca/dwca-sample7/meta.xml | 36 +++++++++++++++ .../dwca/dwca-sample7/occurrence.txt | 10 ++++ .../dwca/dwca-sample7/verbatim.txt | 10 ++++ tests/input_files/occurrence/sample4/meta.xml | 14 ++++++ .../occurrence/sample4/occurrence.txt | 13 ++++++ tests/test_create_dwca.py | 15 ++++++ tests/test_validate_dwca.py | 46 +++++++++++++++++++ 12 files changed, 222 insertions(+), 8 deletions(-) create mode 100644 tests/input_files/dwca/dwca-sample6/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample6/occurrence.txt create mode 100644 tests/input_files/dwca/dwca-sample6/verbatim.txt create mode 100644 tests/input_files/dwca/dwca-sample7/meta.xml create mode 100644 tests/input_files/dwca/dwca-sample7/occurrence.txt create mode 100644 tests/input_files/dwca/dwca-sample7/verbatim.txt create mode 100755 tests/input_files/occurrence/sample4/meta.xml create mode 100644 tests/input_files/occurrence/sample4/occurrence.txt diff --git a/README.md b/README.md index 66fa212..808ad48 100644 --- a/README.md +++ b/README.md @@ -58,11 +58,12 @@ To install published package from testpypi pip install -i https://test.pypi.org/simple/ dwcahandler ```   -### Extensions supported: +### Extensions that are currently supported and have been tested in ALA ingestion: Standard Darwin Core Terms and Class Simple Multimedia https://rs.gbif.org/extension/gbif/1.0/multimedia.xml Extended Measurement Or Fact http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact + * List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) * Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. 
@@ -78,7 +79,11 @@ DwcaHandler.list_class_rowtypes()   ### Examples of dwcahandler usages: -* Create Darwin Core Archive from csv file +* Create Darwin Core Archive from csv file. +* Keys are used as id/core id for Dwca with extensions and must be supplied for the core and extensions in the data +* Validation is performed to make sure that the keys are unique in the core of the Dwca +* If Keys are not provided, the default keys is occurrenceID +* If multiple Keys are supplied, resulting dwca would generate id/core id * In creating a dwca with multimedia extension, provide format and type values in the Simple Multimedia extension, otherwise, dwcahandler will attempt to fill these info by guessing the mimetype from url. ```python @@ -88,7 +93,7 @@ from dwcahandler import MetaElementTypes from dwcahandler import Eml core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) -ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID'])] +ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)] eml = Eml(dataset_name='Test Dataset', description='Dataset description', @@ -113,7 +118,7 @@ core_df = pd.read_csv("/tmp/occurrence.csv") core_frame = CsvFileType(files=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) ext_df = pd.read_csv("/tmp/multimedia.csv") -ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID'])] +ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA)] eml = Eml(dataset_name='Test Dataset', description='Dataset description', diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index a96b3be..8d9e0a0 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -872,21 +872,30 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N set(content_keys) == 
set(self.core_content.keys)): set_to_validate[class_type] = content_keys + validation_success = True for class_type, key in set_to_validate.items(): contents = self.get_content(class_type=class_type) for content, _ in contents: + validation_content_success = True keys_df = self._extract_keys(content.df_content, content.keys) if not self.check_duplicates(keys_df, content.keys, error_file): - return False + log.error("Validation failed for %s %s content for duplicates keys %s", + content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + validation_content_success = False if not self._validate_columns(content): - return False + log.error("Validation failed for %s %s content for duplicate columns", + content.meta_info.core_or_ext_type, content.meta_info.type) + validation_content_success = False - log.info("Validation successful for %s %s content for unique keys %s", + if validation_content_success: + log.info("Validation successful for %s %s content for unique keys %s", content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + else: + validation_success = False - return True + return True if validation_success else False def extract_csv_content(self, csv_info: CsvFileType, core_ext_type: CoreOrExtType, build_coreid_for_ext: bool = False): diff --git a/tests/input_files/dwca/dwca-sample6/meta.xml b/tests/input_files/dwca/dwca-sample6/meta.xml new file mode 100644 index 0000000..acef7ca --- /dev/null +++ b/tests/input_files/dwca/dwca-sample6/meta.xml @@ -0,0 +1,36 @@ + + + + occurrence.txt + + + + + + + + + + + + + + + + + + + verbatim.txt + + + + + + + + + + + + + diff --git a/tests/input_files/dwca/dwca-sample6/occurrence.txt b/tests/input_files/dwca/dwca-sample6/occurrence.txt new file mode 100644 index 0000000..35220df --- /dev/null +++ b/tests/input_files/dwca/dwca-sample6/occurrence.txt @@ -0,0 +1,10 @@ +gbifID license institutionCode collectionCode basisOfRecord catalogNumber occurrenceStatus decimalLatitude decimalLongitude 
scientificName kingdom genus +sample1 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 33699 PRESENT -33.7948 112.4737 "Species A" "Kingdom A" "Genus A" +sample2 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 33703 PRESENT -33.7948 112.4737 "Species A" "Kingdom A" "Genus A" +sample3 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31739 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample4 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31691 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample5 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31697 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample6 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31695 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample7 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31693 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample8 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31696 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample9 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31735 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample6/verbatim.txt b/tests/input_files/dwca/dwca-sample6/verbatim.txt new file mode 100644 index 0000000..c440532 --- /dev/null +++ b/tests/input_files/dwca/dwca-sample6/verbatim.txt @@ -0,0 +1,10 @@ +gbifID institutionCode collectionCode basisOfRecord catalogNumber decimalLatitude decimalLongitude scientificName +sample1 I1 COL-A FossileSpecimen 33699 -33.7948 112.4737 "Species A" +sample2 I1 COL-A FossileSpecimen 33703 -33.7948 112.4737 "Species A" +sample3 I1 COL-A FossileSpecimen 31739 -19.887 112.254 "Species B" +sample4 I1 COL-A FossileSpecimen 31691 -19.887 112.254 "Species B" +sample5 I1 COL-A FossileSpecimen 31697 -19.887 112.254 "Species B" +sample6 I1 COL-A FossileSpecimen 31695 -19.887 112.254 "Species B" +sample7 I1 COL-A FossileSpecimen 31693 -19.887 112.254 "Species B" +sample8 I1 COL-A FossileSpecimen 31696 -19.887 112.254 "Species B" +sample9 I1 COL-A FossileSpecimen 31735 -19.887 112.254 
"Species B" \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample7/meta.xml b/tests/input_files/dwca/dwca-sample7/meta.xml new file mode 100644 index 0000000..acef7ca --- /dev/null +++ b/tests/input_files/dwca/dwca-sample7/meta.xml @@ -0,0 +1,36 @@ + + + + occurrence.txt + + + + + + + + + + + + + + + + + + + verbatim.txt + + + + + + + + + + + + + diff --git a/tests/input_files/dwca/dwca-sample7/occurrence.txt b/tests/input_files/dwca/dwca-sample7/occurrence.txt new file mode 100644 index 0000000..4e3fc8d --- /dev/null +++ b/tests/input_files/dwca/dwca-sample7/occurrence.txt @@ -0,0 +1,10 @@ +gbifID license institutionCode collectionCode basisOfRecord catalogNumber occurrenceStatus decimalLatitude decimalLongitude scientificName kingdom genus +sample1 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 33699 PRESENT -33.7948 112.4737 "Species A" "Kingdom A" "Genus A" +sample2 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 33703 PRESENT -33.7948 112.4737 "Species A" "Kingdom A" "Genus A" +sample3 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31739 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31691 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample5 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31697 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31695 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31693 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample8 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31696 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" +sample9 CC_BY_4_0 I1 COL-A FOSSIL_SPECIMEN 31735 PRESENT -19.887 112.254 "Species B" "Kingdom A" "Genus A" \ No newline at end of file diff --git a/tests/input_files/dwca/dwca-sample7/verbatim.txt b/tests/input_files/dwca/dwca-sample7/verbatim.txt new file mode 100644 index 0000000..5ae0683 --- /dev/null +++ 
b/tests/input_files/dwca/dwca-sample7/verbatim.txt @@ -0,0 +1,10 @@ +gbifID institutionCode collectionCode basisOfRecord catalogNumber decimalLatitude decimalLongitude scientificName +sample1 I1 COL-A FossileSpecimen 33699 -33.7948 112.4737 "Species A" +sample I1 COL-A FossileSpecimen 33703 -33.7948 112.4737 "Species A" +sample3 I1 COL-A FossileSpecimen 31739 -19.887 112.254 "Species B" +sample I1 COL-A FossileSpecimen 31691 -19.887 112.254 "Species B" +sample5 I1 COL-A FossileSpecimen 31697 -19.887 112.254 "Species B" +sample I1 COL-A FossileSpecimen 31695 -19.887 112.254 "Species B" +sample7 I1 COL-A FossileSpecimen 31693 -19.887 112.254 "Species B" +sample I1 COL-A FossileSpecimen 31696 -19.887 112.254 "Species B" +sample9 I1 COL-A FossileSpecimen 31735 -19.887 112.254 "Species B" \ No newline at end of file diff --git a/tests/input_files/occurrence/sample4/meta.xml b/tests/input_files/occurrence/sample4/meta.xml new file mode 100755 index 0000000..891a293 --- /dev/null +++ b/tests/input_files/occurrence/sample4/meta.xml @@ -0,0 +1,14 @@ + + + + + occurrence.txt + + + + + + + + + \ No newline at end of file diff --git a/tests/input_files/occurrence/sample4/occurrence.txt b/tests/input_files/occurrence/sample4/occurrence.txt new file mode 100644 index 0000000..a745ee9 --- /dev/null +++ b/tests/input_files/occurrence/sample4/occurrence.txt @@ -0,0 +1,13 @@ +occurrenceID,basisOfRecord,scientificName,license,decimalLatitude,decimalLongitude +014826,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-30.0000,144.0000 +014825,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-31.1111,145.0000 +014824,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-32.085431,100.828059 +014823,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-33.097233,101.820888 +014822,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-34.099936,102.821654 +014821,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-35.893671,104.999974 +014802,Human Observation,Alectryon 
coriaceus,CC-BY 4.0 (Int),-34.113747,120.889354 +014801,Human Observation,Eucalyptus robusta,CC-BY 4.0 (Int),-36.0000,144.308848 +014800,Human Observation,Arundo donax,CC-BY 4.0 (Int),-30.440251,146.240159 +014799,Human Observation,Arundo donax,CC-BY 4.0 (Int),-31.547195,150.783246 +014798,Human Observation,Arundo donax,CC-BY 4.0 (Int),-40.481117,150.823468 +014792,Human Observation,Euphorbia paralias,CC-BY 4.0 (Int),-28.0000,115.0000 diff --git a/tests/test_create_dwca.py b/tests/test_create_dwca.py index b9499a7..469f353 100644 --- a/tests/test_create_dwca.py +++ b/tests/test_create_dwca.py @@ -141,3 +141,18 @@ def test_create_event_dwca_sample2(self): assert output_obj check_output(output_obj, test_files_folder) + + def test_create_occurrence_dwca_occurrence_without_ext(self): + test_files_folder = "./input_files/occurrence/sample4" + + core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + type=MetaElementTypes.OCCURRENCE) + + output_obj = BytesIO() + + DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=[], output_dwca=output_obj, + eml_content=get_eml_content()) + + assert output_obj + + check_output(output_obj, test_files_folder) \ No newline at end of file diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py index 3c2063c..d22afcc 100644 --- a/tests/test_validate_dwca.py +++ b/tests/test_validate_dwca.py @@ -74,3 +74,49 @@ def test_duplicate_columns_in_dwca(self): assert ("Duplicate columns ['catalogNumber'] specified in the metadata for occurrence.csv" in str(exc_info.value)) + + def test_dwca_with_occ_core_ext(self, caplog): + """ + Test for read and extract dwca. 
Validate dwca with core and ext of same class type + """ + caplog.set_level(logging.INFO) + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample6") + keys_lookup = {MetaElementTypes.OCCURRENCE: 'gbifID'} + + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert dwca_result + assert "Validation successful for core MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages + assert "Validation successful for extension MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages + + def test_dwca_with_occ_core_ext_with_url_as_key(self, caplog): + """ + Test for read and extract dwca. + Validate dwca with core and ext of same class type and with occurrence identifier as full url + """ + caplog.set_level(logging.INFO) + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample6") + keys_lookup = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} + + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert dwca_result + assert "Validation successful for core MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages + assert "Validation successful for extension MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages + + def test_dwca_with_occ_core_ext_with_duplicates(self, caplog): + """ + Test for read and extract dwca. Validate duplicate columns specified in metadata of dwca + """ + caplog.set_level(logging.INFO) + simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample7") + keys_lookup = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} + + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + assert not dwca_result + assert "Duplicate ['gbifID'] found. 
Total rows affected: 2" in caplog.messages + assert "Duplicate values: ['sample']" in caplog.messages + assert "Validation failed for core MetaElementTypes.OCCURRENCE content for duplicates keys ['gbifID']" in caplog.messages + + assert "Duplicate ['gbifID'] found. Total rows affected: 3" in caplog.messages + assert "Duplicate values: ['sample']" in caplog.messages + assert "Validation failed for extension MetaElementTypes.OCCURRENCE content for duplicates keys ['gbifID']" in caplog.messages + From 19319bb42457d746ecfd4ce4d453d2c8cf34130d Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 14 Mar 2025 11:46:24 +1100 Subject: [PATCH 10/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Resolve flake8 linting and fix readme --- README.md | 36 +++++++++++++------------ src/dwcahandler/dwca/core_dwca.py | 8 +++--- src/dwcahandler/dwca/dwca_meta.py | 1 + src/dwcahandler/dwca/terms.py | 19 ++++++------- src/dwcahandler/scripts/update_terms.py | 1 + tests/test_create_dwca.py | 2 +- tests/test_validate_dwca.py | 1 - 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 808ad48..4d219f0 100644 --- a/README.md +++ b/README.md @@ -58,19 +58,29 @@ To install published package from testpypi pip install -i https://test.pypi.org/simple/ dwcahandler ```   -### Extensions that are currently supported and have been tested in ALA ingestion: -Standard Darwin Core Terms and Class -Simple Multimedia https://rs.gbif.org/extension/gbif/1.0/multimedia.xml -Extended Measurement Or Fact http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact +### Supported extensions that have been tested in ALA: +* Standard Darwin Core Terms and Class +* Simple Multimedia https://rs.gbif.org/extension/gbif/1.0/multimedia.xml +* Extended Measurement Or Fact http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact +#### Terms +* Terms are listed in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) +```python +from dwcahandler import DwcaHandler -* 
List terms that is supported in dwcahandler package in [terms.csv](src/dwcahandler/dwca/terms/terms.csv) +df_terms, df_class = DwcaHandler.list_terms() +print(df_terms, df_class) +``` -* Class RowTypes are defined in MetaElementTypes enum class MetaElementTypes. - The supported types are defined by the class column in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) - For eg: MetaElementTypes.OCCURRENCE +#### Class +* Listed in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) +* Used in MetaElementTypes class enum name: +```python +MetaElementTypes.OCCURRENCE +MetaElementTypes.MULTIMEDIA +``` -To list all the class rowtypes supported +To list all the class rowtypes ```python from dwcahandler import DwcaHandler @@ -151,11 +161,3 @@ DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip', output_dwca='/tmp/new-dwca.zip') ```   - -```python -from dwcahandler import DwcaHandler - -df_terms, df_class = DwcaHandler.list_terms() -print(df_terms, df_class) -``` -  diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 8d9e0a0..b6f305c 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -351,9 +351,9 @@ def set_keys(self, keys: dict = None): col_term = [] for a_key in key_list: if a_key not in dwca_content.df_content.columns.tolist(): - col_term.append(Terms.extract_term(a_key)) + col_term.append(Terms.extract_term(a_key)) else: - col_term.append(a_key) + col_term.append(a_key) dwca_content.keys = col_term set_keys[k] = col_term @@ -881,7 +881,7 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N if not self.check_duplicates(keys_df, content.keys, error_file): log.error("Validation failed for %s %s content for duplicates keys %s", - content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) validation_content_success = False if not 
self._validate_columns(content): @@ -891,7 +891,7 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N if validation_content_success: log.info("Validation successful for %s %s content for unique keys %s", - content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) else: validation_success = False diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 8cb3fde..bb3c41f 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -19,6 +19,7 @@ MetaElementTypes = Enum("MetaElementTypes", dict(DwcClassRowTypes)) + def get_meta_class_row_type(row_type_uri: str): """ Find a row type by URI diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index eb1dd8a..5fd9af1 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from dataclasses import dataclass, field import re import pandas as pd @@ -12,10 +11,10 @@ this_dir, this_filename = os.path.split(__file__) -log.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - level=log.DEBUG) +log.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=log.DEBUG) log = log.getLogger("DwcaTerms") + def absolute_file_paths(directory): """Convert files in a directory into absolute paths and return as a generator @@ -59,10 +58,11 @@ class GbifRegisteredExt(ExtInfo, Enum): EXTENDED_MEASUREMENT_OR_FACT = ExtInfo(uri="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact", prefix=NsPrefix.OBIS, namespace="http://rs.iobis.org/obis/terms/") - #AC_MULTIMEDIA = ExtInfo(uri="http://rs.tdwg.org/ac/terms/Multimedia", + # AC_MULTIMEDIA = ExtInfo(uri="http://rs.tdwg.org/ac/terms/Multimedia", # prefix=NsPrefix.AC, # namespace="http://rs.tdwg.org/ac/terms/") + @dataclass class Terms: """ @@ -119,7 +119,7 @@ def 
_update_df(self, ns: NsPrefix, updates: pd.DataFrame, df: pd.DataFrame): :param updates: dataframe containing the class rows or terms to update :param df: dataframe to update """ - def __get_update_info (update_df: pd.DataFrame): + def __get_update_info(update_df: pd.DataFrame): update_type: str = "term" count = len(update_df) if 'class' in update_df.columns.tolist(): @@ -137,7 +137,6 @@ def __get_update_info (update_df: pd.DataFrame): def get_dwc_source_data() -> pd.DataFrame: return pd.read_csv(Terms.DWC_SOURCE_URL, delimiter=",", encoding='utf-8', dtype='str') - #@staticmethod def update_dwc_terms(self): """ Pull the latest terms from gbif dwc csv url and update the darwin core vocab terms in the package @@ -195,7 +194,6 @@ def get_class_row_types(): class_list = list(tuple(zip(class_df["class"], class_df["class_uri"]))) return class_list - #@staticmethod def update_gbif_ext(self): """ Update the class row type and terms specified by GBIF_REGISTERED_EXTENSION and update by prefix @@ -239,7 +237,7 @@ def _extract_value(text: str): df = pd.DataFrame(term_info, columns=["term", "namespace", 'uri']) std_ns = ["http://rs.tdwg.org/dwc/terms/", "http://purl.org/dc/terms/"] - existing_terms = self.terms_df #Terms().terms_df + existing_terms = self.terms_df extra_terms_df = df[(df["namespace"].isin(std_ns)) & (~df["uri"].isin(existing_terms["uri"]))] if len(extra_terms_df) > 0: log.info("Additional standard terms found:\n%s", extra_terms_df) @@ -266,7 +264,6 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame ext_df = df_to_sort[~std_filter_df].copy() return pd.concat([std_df, ext_df], ignore_index=True) - log.info("Current class and terms") exclude_update_prefixes = [NsPrefix.DC.value] @@ -281,8 +278,8 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame terms.terms_df = terms.terms_df[terms.terms_df.prefix.isin(exclude_update_prefixes)] terms.update_dwc_terms() terms.update_gbif_ext() - terms.class_df = 
__sort_values(terms.class_df, "class") - terms.terms_df = __sort_values(terms.terms_df, "term") + terms.class_df = __sort_values(terms.class_df, "class") + terms.terms_df = __sort_values(terms.terms_df, "term") terms.class_df.to_csv(Terms.CLASS_ROW_TYPE_PATH, index=False) terms.terms_df.to_csv(Terms.TERMS_FILE_PATH, index=False) diff --git a/src/dwcahandler/scripts/update_terms.py b/src/dwcahandler/scripts/update_terms.py index 1bc01b8..0548878 100644 --- a/src/dwcahandler/scripts/update_terms.py +++ b/src/dwcahandler/scripts/update_terms.py @@ -10,6 +10,7 @@ from dwcahandler.dwca.terms import Terms + def update_terms(): """ Call the update_dwc_terms to get the latest version of tdwg dwc terms diff --git a/tests/test_create_dwca.py b/tests/test_create_dwca.py index 469f353..57c3c3d 100644 --- a/tests/test_create_dwca.py +++ b/tests/test_create_dwca.py @@ -155,4 +155,4 @@ def test_create_occurrence_dwca_occurrence_without_ext(self): assert output_obj - check_output(output_obj, test_files_folder) \ No newline at end of file + check_output(output_obj, test_files_folder) diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py index d22afcc..71909e8 100644 --- a/tests/test_validate_dwca.py +++ b/tests/test_validate_dwca.py @@ -119,4 +119,3 @@ def test_dwca_with_occ_core_ext_with_duplicates(self, caplog): assert "Duplicate ['gbifID'] found. 
Total rows affected: 3" in caplog.messages assert "Duplicate values: ['sample']" in caplog.messages assert "Validation failed for extension MetaElementTypes.OCCURRENCE content for duplicates keys ['gbifID']" in caplog.messages - From e05d81c7e23cc4695bf7fe47584abfd8dbca9c89 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 14 Mar 2025 17:12:36 +1100 Subject: [PATCH 11/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Fix for default keys for core if not provided --- src/dwcahandler/dwca/core_dwca.py | 33 ++++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index b6f305c..129565b 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -770,18 +770,6 @@ def _combine_contents(self, contents: list, csv_encoding, use_chunking=False): raise ValueError('content is empty') - def __check_csv_info_value(self, csv_info: CsvFileType, col: str): - """Look for a column in a CSV file - - :param csv_info: The CSV file - :param col: The column name - :return: Either column information or False for not found - """ - csv_info_dict = asdict(csv_info) - if col in csv_info_dict: - return csv_info_dict[col] - return False - def check_duplicates(self, content_keys_df, keys, error_file=None): """Check a content frame for duplicate keys @@ -905,14 +893,31 @@ def extract_csv_content(self, csv_info: CsvFileType, :param core_ext_type: Whether this is a core or extension content frame :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension """ + def __get_default_core_key(core_sv_info: CsvFileType): + """Look for a column in a CSV file + + :param csv_info: The CSV file + :param col: The column name + :return: Either column information or False for not found + """ + if not core_sv_info.keys or len(core_sv_info.keys) == 0: + if core_sv_info.type == MetaElementTypes.EVENT: + return 
["eventID"] + elif core_sv_info.type == MetaElementTypes.OCCURRENCE: + return ["occurrenceID"] + else: + raise ValueError("Keys need to be set for core content") + elif len(core_sv_info.keys) > 0: + return core_sv_info.keys + if isinstance(csv_info.files, pd.DataFrame): - csv_content = csv_info.files.copy(deep=True) + csv_content = csv_info.files else: csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding) # Use default occurrenceID if not provided if core_ext_type == CoreOrExtType.CORE: - keys = csv_info.keys if self.__check_csv_info_value(csv_info, 'keys') else ['occurrenceID'] + keys = __get_default_core_key(csv_info) else: keys = self.core_content.keys core_id_field: str = "" From 0d7b66e9ddda4816557bd2b69dd495d352671e98 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 14 Mar 2025 17:15:54 +1100 Subject: [PATCH 12/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Fix for default keys for core if not provided --- src/dwcahandler/dwca/core_dwca.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 129565b..d931308 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -896,9 +896,9 @@ def extract_csv_content(self, csv_info: CsvFileType, def __get_default_core_key(core_sv_info: CsvFileType): """Look for a column in a CSV file - :param csv_info: The CSV file - :param col: The column name - :return: Either column information or False for not found + :param core_sv_info: The CSV file + :return: default key if csv_info.keys not provided. 
+ Default key is eventID for EVENT type and occurrenceID for occurrence type """ if not core_sv_info.keys or len(core_sv_info.keys) == 0: if core_sv_info.type == MetaElementTypes.EVENT: From 533c6d0ca0c025edc53a8b9899d712043987b72b Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Tue, 18 Mar 2025 11:20:07 +1100 Subject: [PATCH 13/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Provide convenient helper methods for creating dwca --- README.md | 56 +++++++++++++- src/dwcahandler/dwca/__init__.py | 22 +++++- src/dwcahandler/dwca/base_dwca.py | 10 ++- src/dwcahandler/dwca/core_dwca.py | 30 +++----- src/dwcahandler/dwca/dwca_factory.py | 110 +++++++++++++++++++++++++-- src/dwcahandler/dwca/terms.py | 3 +- 6 files changed, 196 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 4d219f0..2f0441f 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,6 @@ ALA receive different forms of data from various data providers in the form of C The operations provided by dwcahandler includes creating a dwca from csv/text file, merge 2 dwcas, delete records in dwca and perform core key validations like testing duplicates of one or more keys, empty and duplicate keys. -The module uses and maintain the standard dwc terms from a point in time versioned copy of https://dwc.tdwg.org/terms/ and extensions like https://rs.gbif.org/extension/gbif/1.0/multimedia.xml. - - ### Technologies This package is developed in Python. 
Tested with Python 3.12, 3.11, 3.10 and 3.9 @@ -76,6 +73,8 @@ print(df_terms, df_class) * Listed in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv) * Used in MetaElementTypes class enum name: ```python +from dwcahandler import MetaElementTypes + MetaElementTypes.OCCURRENCE MetaElementTypes.MULTIMEDIA ``` @@ -137,7 +136,58 @@ eml = Eml(dataset_name='Test Dataset', rights="test rights") DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip') +``` +  +* Create Darwin Core Archive from csv files in a zip files. +* Class row types are determined by file names of the csvs. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Delimiter for txt files are comma delimiter by default. For tab delimiter, supply CsvEncoding +```python +from dwcahandler import DwcaHandler +from dwcahandler import Eml + +eml = Eml(dataset_name='Test Dataset', + description='Dataset description', + license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)', + citation="test citation", + rights="test rights") + +DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip') +``` +  +* Convenient helper function to create Darwin Core Archive from csv files in a zip files. +* Class row types are determined by file names of the csvs. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Delimiter for txt files are comma delimiter by default. 
For tab delimiter, supply CsvEncoding +```python +from dwcahandler import DwcaHandler +from dwcahandler import Eml + +eml = Eml(dataset_name='Test Dataset', + description='Dataset description', + license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)', + citation="test citation", + rights="test rights") + +DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip') +``` +  +* Convenient helper function to create Darwin Core Archive from list of csv files. +* Class row types are determined by file names of the csvs. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Delimiter for txt files are comma delimiter by default. For tab delimiter, supply CsvEncoding +```python +from dwcahandler import DwcaHandler +from dwcahandler import Eml + +eml = Eml(dataset_name='Test Dataset', + description='Dataset description', + license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)', + citation="test citation", + rights="test rights") +DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.txt", "/tmp/occurrence.txt", "/tmp/measurement_or_fact.txt"], + eml_content=eml, output_dwca='/tmp/dwca.zip') ```   * Merge Darwin Core Archive diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index 40c4aba..ca95cb3 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -18,6 +18,7 @@ """ from __future__ import annotations +import io from collections import namedtuple from dataclasses import dataclass, field from typing import Optional, Union @@ -30,6 +31,25 @@ EXTENSION="extension" ) +# Default keys for content when creating dwca +DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])( + EVENT = "eventID", + OCCURRENCE = "occurrenceID" +) + +def get_keys(type: MetaElementTypes, override_content_keys: dict[[MetaElementTypes, list]] = None): + """ + # 
If override_content_keys not supplied, return the default keys based on content type + :param type: type of content + :param override_content_keys: given content keys + :return: the list of keys for the content + """ + if override_content_keys: + for content_type, keys in override_content_keys.items(): + if type == content_type and keys and len(keys) > 0: + return keys + defaults = DefaultKeys._asdict() + return [defaults[type.name]] if type.name in defaults.keys() else [] @dataclass class CSVEncoding: @@ -168,7 +188,7 @@ class Defaults: class CsvFileType: """A description of a CSV file in a DwCA """ - files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe + files: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one file or a dataframe type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,... keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record # when creating dwca. for core other than occurrence, this neeeds to be supplied as key. 
diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index f80ba2d..6bd893f 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -127,7 +127,7 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, - eml_content: Union[str, Eml] = '', additional_validation_on_content: list[CsvFileType] = None): + eml_content: Union[str, Eml] = ''): """Create a dwca given the contents of core and extensions and eml content :param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca @@ -136,7 +136,6 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], extensions of the dwca if supplied :param validate_content: whether to validate the contents :param eml_content: eml content in string or a filled Eml object - :param additional_validation_on_content: additional validation to perform """ if ext_csv_list is None: ext_csv_list = [] @@ -150,13 +149,16 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], if image_ext: ext_csv_list.append(image_ext) + content_to_validate = {} for ext in ext_csv_list: + if ext.keys and len(ext.keys) > 0: + content_to_validate[ext.type] = ext.keys self.extract_csv_content(csv_info=ext, core_ext_type=CoreOrExtType.EXTENSION, build_coreid_for_ext=True) self.fill_additional_info() - if validate_content and not self.validate_content(additional_validation_on_content): + if validate_content and not self.validate_content(content_to_validate): raise SystemExit(Exception("Some validations error found. 
Dwca is not created.")) self.generate_eml(eml_content) @@ -191,7 +193,7 @@ def validate_dwca(self, content_keys: dict, error_file: str): If additional checks required in another content, supply it as content_keys :param content_keys: a dictionary of class type and the key - for eg. {MetaElementTypes.OCCURRENCE, "occurrenceId"} + for eg. {MetaElementTypes.OCCURRENCE, "occurrenceID"} :param error_file: optional error_file for the errored data """ self.extract_dwca() diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index d931308..770165e 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -22,7 +22,7 @@ from pandas.errors import EmptyDataError from pandas.io import parsers from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, - CsvFileType, Defaults, Eml, Terms, + CsvFileType, Defaults, Eml, Terms, get_keys, MetaDwCA, MetaElementInfo, MetaElementTypes, MetaElementAttributes, Stat, record_diff_stat) @@ -350,6 +350,7 @@ def set_keys(self, keys: dict = None): key_list = [v] if isinstance(v, str) else v col_term = [] for a_key in key_list: + # this is in case a_key is url form for eg: http://rs.gbif.org/terms/1.0/gbifID if a_key not in dwca_content.df_content.columns.tolist(): col_term.append(Terms.extract_term(a_key)) else: @@ -893,31 +894,17 @@ def extract_csv_content(self, csv_info: CsvFileType, :param core_ext_type: Whether this is a core or extension content frame :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension """ - def __get_default_core_key(core_sv_info: CsvFileType): - """Look for a column in a CSV file - - :param core_sv_info: The CSV file - :return: default key if csv_info.keys not provided. 
- Default key is eventID for EVENT type and occurrenceID for occurrence type - """ - if not core_sv_info.keys or len(core_sv_info.keys) == 0: - if core_sv_info.type == MetaElementTypes.EVENT: - return ["eventID"] - elif core_sv_info.type == MetaElementTypes.OCCURRENCE: - return ["occurrenceID"] - else: - raise ValueError("Keys need to be set for core content") - elif len(core_sv_info.keys) > 0: - return core_sv_info.keys - - if isinstance(csv_info.files, pd.DataFrame): + if isinstance(csv_info.files, pd.DataFrame) : csv_content = csv_info.files + elif isinstance(csv_info.files, io.TextIOWrapper): + csv_content = self._read_csv(csv_info.files) else: csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding) - # Use default occurrenceID if not provided + # Use default keys if not provided if core_ext_type == CoreOrExtType.CORE: - keys = __get_default_core_key(csv_info) + override_keys = {csv_info.type: csv_info.keys} if csv_info.keys and len(csv_info.keys) > 0 else None + keys = get_keys(type=csv_info.type, override_content_keys=override_keys) else: keys = self.core_content.keys core_id_field: str = "" @@ -945,6 +932,7 @@ def __get_default_core_key(core_sv_info: CsvFileType): content.keys = keys self.core_content = content else: + content.keys = csv_info.keys self.ext_content.append(content) def _to_csv(self, df: pd.DataFrame, meta_info: MetaElementInfo, diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index e1bfaab..ffcc625 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -2,12 +2,14 @@ Module contains factory class for Dwca. This is used to decide the type of darwin core class to perform the operation. 
""" - +import io import logging from typing import Union import pandas as pd -from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes +from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes, CSVEncoding, get_keys from io import BytesIO +from pathlib import Path +from zipfile import ZipFile logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) log = logging.getLogger("DwcaFactoryManager") @@ -24,7 +26,104 @@ def list_class_rowtypes() : for name, member in MetaElementTypes.__members__.items(): print(f"{name}: {member.value}") + @staticmethod + def get_contents_from_file_names(files: list) -> (dict[MetaElementTypes, str], dict[MetaElementTypes, str]): + """Find the core content and extension contents from a list of file paths. + Core content will always be event if present, otherwise, occurrence content + + :param files: list of files + :param output_dwca: Where to place the resulting Dwca + :param eml_content: eml content in string or Eml class + :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied + :param content_keys: optional dictionary of MetaElementTypes and key list + for eg. 
{MetaElementTypes.OCCURRENCE, ["occurrenceID"]} + """ + def derive_type(file_list: list) -> dict[str, MetaElementTypes]: + file_types = {} + for file in file_list: + if (filename:=Path(file).stem.upper()) in dict(MetaElementTypes.__members__.items()).keys(): + file_types[file] = dict(MetaElementTypes.__members__.items())[filename] + return file_types + + contents = derive_type(files) + + core_file = {k: v for k, v in contents.items() if v == MetaElementTypes.EVENT} + if not core_file: + core_file = {k: v for k, v in contents.items() if v == MetaElementTypes.OCCURRENCE} + + if core_file: + core_filename = next(iter(core_file)) + core_type = core_file[core_filename] + ext_files = {k: v for k, v in contents.items() if v != core_type} + return core_file, ext_files + + return None + """Perform various DwCA operations""" + @staticmethod + def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], + eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(), + content_keys: dict[MetaElementTypes, list] = None): + """Create a suitable DwCA from a list of CSV files + + :param files: Zip file containing txt files + :param output_dwca: Where to place the resulting Dwca + :param eml_content: eml content in string or Eml class + :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied + :param content_keys: optional dictionary of MetaElementTypes and key list + for eg. 
{MetaElementTypes.OCCURRENCE, ["occurrenceID"]} + """ + core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files) + if core_content: + core_filename = next(iter(core_content)) + core_type = core_content[core_filename] + + core_content = CsvFileType(files=[core_filename], type=core_type, csv_encoding=csv_encoding, + keys=get_keys(type=core_type, override_content_keys=content_keys)) + ext_content = [] + for ext_file, ext_type in ext_content_list.items(): + ext_content.append(CsvFileType(files=[ext_file], + type=ext_type, csv_encoding=csv_encoding, + keys=get_keys(type=ext_type, + override_content_keys=content_keys))) + DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, + eml_content=eml_content) + else: + raise ValueError("The core content cannot be determined. Please check filename in zip file") + + @staticmethod + def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO], + eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(), + content_keys: dict[MetaElementTypes, list] = None): + """Create a suitable DwCA from a list of CSV files + + :param zip_file: Zip file containing txt files + :param output_dwca: Where to place the resulting Dwca + :param eml_content: eml content in string or Eml class + :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied + :param content_keys: optional dictionary of class type and the key + for eg. 
{MetaElementTypes.OCCURRENCE, ["occurrenceID"]} + """ + with ZipFile(zip_file, 'r') as zf: + files = zf.namelist() + core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files) + if core_content: + core_filename = next(iter(core_content)) + core_type = core_content[core_filename] + core_content = CsvFileType(files=io.TextIOWrapper(zf.open(core_filename), encoding="utf-8"), + type=core_type, csv_encoding=csv_encoding, + keys=get_keys(type=core_type, + override_content_keys=content_keys)) + ext_content = [] + for ext_file, ext_type in ext_content_list.items(): + ext_content.append(CsvFileType(files=io.TextIOWrapper(zf.open(ext_file), encoding="utf-8"), + type=ext_type, csv_encoding=csv_encoding, + keys=get_keys(type=ext_type, + override_content_keys=content_keys))) + DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, + eml_content=eml_content) + else: + raise ValueError("The core content cannot be determined. Please check filename in zip file") @staticmethod def create_dwca(core_csv: CsvFileType, @@ -75,14 +174,15 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes validate_delta=validate_delta_content) @staticmethod - def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None): + def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, error_file: str = None): """Test a dwca for consistency :param dwca_file: The path to the DwCA - :param keys_lookup: The keys that identify a unique record + :param content_keys: a dictionary of class type and the key + for eg. {MetaElementTypes.OCCURRENCE, "occurrenceID"} :param error_file: The file to write errors to. 
If None, errors are logged """ - return Dwca(dwca_file_loc=dwca_file).validate_dwca(keys_lookup, error_file) + return Dwca(dwca_file_loc=dwca_file).validate_dwca(content_keys, error_file) @staticmethod def validate_file(csv_file: CsvFileType, error_file: str = None): diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index 5fd9af1..b106d6e 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -255,7 +255,8 @@ def update_terms(): def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame: """ Make sure dc and dwc prefixes stay on top - :param df: dataframe + :param df_to_sort: dataframe to be sorted + :param sorting_column: other column to sort :return: sorted dataFrame """ df_to_sort = df_to_sort.sort_values(by=["prefix", sorting_column], key=lambda x: x.str.lower()) From 253061dc3a1e73429567b252b42ece3e70789ee3 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Tue, 18 Mar 2025 11:38:18 +1100 Subject: [PATCH 14/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Fix test which is failed --- src/dwcahandler/dwca/dwca_factory.py | 6 ++++-- tests/test_validate_dwca.py | 32 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index ffcc625..83ae050 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -175,10 +175,12 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes @staticmethod def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, error_file: str = None): - """Test a dwca for consistency + """Validate dwca for unique key and column for core content by default. + If content_keys is supplied, the content is also validated. 
:param dwca_file: The path to the DwCA - :param content_keys: a dictionary of class type and the key + :param content_keys: a dictionary of class type and the key. + When content_keys are provided, validation will be performed on the content as well. for eg. {MetaElementTypes.OCCURRENCE, "occurrenceID"} :param error_file: The file to write errors to. If None, errors are logged """ diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py index 71909e8..ca97cbd 100644 --- a/tests/test_validate_dwca.py +++ b/tests/test_validate_dwca.py @@ -25,8 +25,8 @@ def test_validate_dwca(self): Test for read and extract dwca. Validate core content """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample1") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + content_keys = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert dwca_result def test_validate_dwca2(self): @@ -34,8 +34,8 @@ def test_validate_dwca2(self): Test for read and extract dwca. 
Validate core content """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample2") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + content_keys = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert dwca_result def test_empty_keys(self, caplog): @@ -44,8 +44,8 @@ def test_empty_keys(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample3") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + content_keys = {MetaElementTypes.OCCURRENCE: 'occurrenceID'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert not dwca_result assert "Empty values found in ['occurrenceID']. Total rows affected: 1" in caplog.messages assert "Empty values found in dataframe row: [0]" in caplog.messages @@ -56,8 +56,8 @@ def test_duplicate_key(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample4") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + content_keys = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert not dwca_result assert "Duplicate ['catalogNumber'] found. Total rows affected: 3" in caplog.messages assert "Duplicate values: ['014800' '014823']" in caplog.messages @@ -67,10 +67,10 @@ def test_duplicate_columns_in_dwca(self): Test for read and extract dwca. 
Validate duplicate columns specified in metadata of dwca """ simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample5") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} + content_keys = {MetaElementTypes.OCCURRENCE: 'catalogNumber'} with pytest.raises(ValueError) as exc_info: - DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert ("Duplicate columns ['catalogNumber'] specified in the metadata for occurrence.csv" in str(exc_info.value)) @@ -81,9 +81,9 @@ def test_dwca_with_occ_core_ext(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample6") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'gbifID'} + content_keys = {MetaElementTypes.OCCURRENCE: 'gbifID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert dwca_result assert "Validation successful for core MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages assert "Validation successful for extension MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages @@ -95,9 +95,9 @@ def test_dwca_with_occ_core_ext_with_url_as_key(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample6") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} + content_keys = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert dwca_result assert "Validation successful for core MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in 
caplog.messages assert "Validation successful for extension MetaElementTypes.OCCURRENCE content for unique keys ['gbifID']" in caplog.messages @@ -108,9 +108,9 @@ def test_dwca_with_occ_core_ext_with_duplicates(self, caplog): """ caplog.set_level(logging.INFO) simple_dwca = make_zip_from_folder_contents(f"{input_folder}/dwca-sample7") - keys_lookup = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} + content_keys = {MetaElementTypes.OCCURRENCE: 'http://rs.gbif.org/terms/1.0/gbifID'} - dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, keys_lookup=keys_lookup) + dwca_result = DwcaHandler.validate_dwca(dwca_file=simple_dwca, content_keys=content_keys) assert not dwca_result assert "Duplicate ['gbifID'] found. Total rows affected: 2" in caplog.messages assert "Duplicate values: ['sample']" in caplog.messages From 954c7973ada8cf55b8ed416dcf629fba2b885f3b Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 19 Mar 2025 13:33:12 +1100 Subject: [PATCH 15/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - Provide more clarity on the content class --- README.md | 26 ++++--- src/dwcahandler/dwca/__init__.py | 32 ++++---- src/dwcahandler/dwca/base_dwca.py | 32 ++++---- src/dwcahandler/dwca/core_dwca.py | 26 +++---- src/dwcahandler/dwca/dwca_factory.py | 41 +++++----- tests/test_create_core_and_ext_content.py | 18 ++--- tests/test_create_dwca.py | 28 +++---- tests/test_delete_dwca_content.py | 6 +- tests/test_merge_dwca.py | 93 ++++++++++++++++++++++- tests/test_multimedia_content.py | 72 +++++++++--------- tests/test_write_dwca.py | 10 +-- 11 files changed, 239 insertions(+), 145 deletions(-) diff --git a/README.md b/README.md index 2f0441f..305412f 100644 --- a/README.md +++ b/README.md @@ -96,13 +96,13 @@ DwcaHandler.list_class_rowtypes() * In creating a dwca with multimedia extension, provide format and type values in the Simple Multimedia extension, otherwise, dwcahandler will attempt to fill these info by guessing the 
mimetype from url. ```python -from dwcahandler import CsvFileType +from dwcahandler import ContentData from dwcahandler import DwcaHandler from dwcahandler import MetaElementTypes from dwcahandler import Eml -core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) -ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)] +core_csv = ContentData(data=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) +ext_csvs = [ContentData(data=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)] eml = Eml(dataset_name='Test Dataset', description='Dataset description', @@ -118,16 +118,16 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em ```python from dwcahandler import DwcaHandler -from dwcahandler.dwca import CsvFileType +from dwcahandler.dwca import ContentData from dwcahandler import MetaElementTypes from dwcahandler import Eml import pandas as pd core_df = pd.read_csv("/tmp/occurrence.csv") -core_frame = CsvFileType(files=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) +core_frame = ContentData(data=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) ext_df = pd.read_csv("/tmp/multimedia.csv") -ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA)] +ext_frame = [ContentData(data=ext_df, type=MetaElementTypes.MULTIMEDIA)] eml = Eml(dataset_name='Test Dataset', description='Dataset description', @@ -138,7 +138,9 @@ eml = Eml(dataset_name='Test Dataset', DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip') ```   -* Create Darwin Core Archive from csv files in a zip files. +* Convenient helper function to build Darwin Core Archive from a list of csv files. +* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied. 
+* Raises error if neither event.txt nor occurrence.txt is in the list +* Class row types are determined by file names of the csvs. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Delimiter for txt files are comma delimiter by default. For tab delimiter, supply CsvEncoding @@ -152,10 +154,12 @@ eml = Eml(dataset_name='Test Dataset', citation="test citation", rights="test rights") -DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip') +DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.csv", "/tmp/occurrence.csv"], eml_content=eml, output_dwca='/tmp/dwca.zip') ```   * Convenient helper function to create Darwin Core Archive from csv files in a zip files. +* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied in the zip file +* Raises error if neither event.txt nor occurrence.txt is in the list +* Class row types are determined by file names of the csvs. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Delimiter for txt files are comma delimiter by default. 
For tab delimiter, supply CsvEncoding @@ -201,13 +205,13 @@ DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dw   * Delete Rows from core file in Darwin Core Archive ```python -from dwcahandler import CsvFileType +from dwcahandler import ContentData from dwcahandler import DwcaHandler, MetaElementTypes -delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) +delete_csv = ContentData(data=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip', - records_to_delete=delete_csv, + records_to_delete=delete_csv, output_dwca='/tmp/new-dwca.zip') ```   diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index ca95cb3..370a7ca 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -1,6 +1,6 @@ # flake8: noqa """ -Tools to convert data frames into Darwin Core Archive (DwCA) files. +Tools to convert data frame or text files into Darwin Core Archive (DwCA) file. See https://ipt.gbif.org/manual/en/ipt/2.6/dwca-guide for a guide to DwCAs. @@ -185,36 +185,36 @@ class Defaults: from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA, MetaElementAttributes, get_meta_class_row_type) @dataclass -class CsvFileType: - """A description of a CSV file in a DwCA +class ContentData: + """A class describing the content data used for core and extension. + Use this class to define the core content and extension content to build a DwCA (see README on usage) """ - files: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one file or a dataframe - type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,... - keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record - # when creating dwca. for core other than occurrence, this neeeds to be supplied as key. 
- # column keys lookup in core or extension for delete records - associated_files_loc: Optional[str] = None # in case there are associated media that need to be packaged in dwca + data: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one files, dataframe or file pointer + type: MetaElementTypes # Enumerated types from the class row type. + keys: Optional[list] = None # keys that uniquely identify a record in the content + associated_files_loc: Optional[str] = None # provide a folder path containing the embedded images. + # Embedded images file name must be supplied as associatedMedia in the content csv_encoding: CSVEncoding = field( default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"', csv_escape_char='"')) def check_for_empty(self, include_keys = True): - if self.files and len(self.files) > 0 and \ + if self.data and len(self.data) > 0 and \ self.type and isinstance(self.type, MetaElementTypes) and \ (not include_keys or include_keys and self.keys and len(self.keys) > 0): return True return False - def add_data(self, other_csv_file_type: CsvFileType): + def add_data(self, other_csv_file_type: ContentData): if self.type and self.type == other_csv_file_type.type: - if isinstance(self.files, pd.DataFrame) and isinstance(other_csv_file_type.files, pd.DataFrame): - self.files = pd.concat([self.files, other_csv_file_type.files], ignore_index=False) + if isinstance(self.data, pd.DataFrame) and isinstance(other_csv_file_type.data, pd.DataFrame): + self.data = pd.concat([self.data, other_csv_file_type.data], ignore_index=False) return True - elif isinstance(self.files, list) and isinstance(other_csv_file_type.files, list): - self.files.append(other_csv_file_type.files) + elif isinstance(self.data, list) and isinstance(other_csv_file_type.data, list): + self.data.append(other_csv_file_type.data) return True elif not self.type: - self.files = other_csv_file_type.files + self.data = other_csv_file_type.data 
self.type = other_csv_file_type.type return False diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py index 6bd893f..1472a1e 100644 --- a/src/dwcahandler/dwca/base_dwca.py +++ b/src/dwcahandler/dwca/base_dwca.py @@ -7,7 +7,7 @@ from abc import ABCMeta, abstractmethod from typing import Union from io import BytesIO -from dwcahandler.dwca import CoreOrExtType, CsvFileType, MetaElementTypes +from dwcahandler.dwca import CoreOrExtType, ContentData, MetaElementTypes from dwcahandler.dwca.eml import Eml @@ -15,7 +15,7 @@ class BaseDwca(metaclass=ABCMeta): """An abstract DwCA that provides basic operations""" @abstractmethod - def extract_csv_content(self, csv_info: CsvFileType, core_ext_type: CoreOrExtType, + def extract_csv_content(self, csv_info: ContentData, core_ext_type: CoreOrExtType, build_coreid_for_ext: bool = False): """Get the content from a single file in the DwCA @@ -48,11 +48,11 @@ def generate_meta(self): @abstractmethod def write_dwca(self, output_dwca: Union[str, BytesIO]): - """Write the content of the DwCA to a directory. + """Write the content of the DwCA to a file path (supplied as string) or to BytesIO in memory. - Writes all CSV files, as well as a meta-file and EML file for the archive. + Writes all CSV data, as well as a meta-file and EML file for the archive. 
- :param output_dwca: The path to write to or dwca in memory + :param output_dwca: The file path or BytesIO """ pass @@ -85,7 +85,7 @@ def convert_associated_media_to_extension(self): pass @abstractmethod - def delete_records(self, records_to_delete: CsvFileType): + def delete_records(self, records_to_delete: ContentData): pass @abstractmethod @@ -111,10 +111,10 @@ def fill_additional_info(self): for multimedia_content, _ in contents: self.add_multimedia_info_to_content(multimedia_content) - def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Union[str, BytesIO]): - """Delete records in dwca if the key records are defined in CsvFileType + def delete_records_in_dwca(self, records_to_delete: ContentData, output_dwca: Union[str, BytesIO]): + """Delete records in dwca if the key records are defined in ContentData - :param records_to_delete: A CsvFileType that containing the text file of the record keys, + :param records_to_delete: A ContentData that containing the text file of the record keys, the key names of the records and MetaElementType type class of the dwca where the records need to be removed :param output_dwca: output dwca path where the result of the dwca is writen to or the output dwca in memory @@ -125,14 +125,14 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un self.generate_meta() self.write_dwca(output_dwca) - def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], - ext_csv_list: list[CsvFileType] = None, validate_content: bool = True, + def create_dwca(self, core_csv: ContentData, output_dwca: Union[str, BytesIO], + ext_csv_list: list[ContentData] = None, validate_content: bool = True, eml_content: Union[str, Eml] = ''): """Create a dwca given the contents of core and extensions and eml content - :param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca + :param core_csv: ContentData containing the data, class type and keys to 
form the core of the dwca :param output_dwca: the resulting path of the dwca or the dwca in memory - :param ext_csv_list: list of CsvFileTypes containing the files, class types and keys to form the + :param ext_csv_list: list of ContentData containing the data, class type and keys to form the extensions of the dwca if supplied :param validate_content: whether to validate the contents :param eml_content: eml content in string or a filled Eml object @@ -143,7 +143,7 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO], self.extract_csv_content(csv_info=core_csv, core_ext_type=CoreOrExtType.CORE, build_coreid_for_ext=True if len(ext_csv_list) > 0 else False) - # if multimedia files is supplied, do not attempt to convert associated media to multimedia + # if multimedia data is supplied, do not attempt to convert associated media to multimedia if not any(ext.type == MetaElementTypes.MULTIMEDIA for ext in ext_csv_list): image_ext = self.convert_associated_media_to_extension() if image_ext: @@ -200,10 +200,10 @@ def validate_dwca(self, content_keys: dict, error_file: str): set_keys = self.set_keys(content_keys) return self.validate_content(content_to_validate=set_keys, error_file=error_file) - def validate_file(self, csv: CsvFileType, error_file: str): + def validate_file(self, csv: ContentData, error_file: str): """Validate the text file - :param csv: CsvFileType to pass the csv, key and type + :param csv: ContentData to pass the csv, key and type :param error_file: optional error_file for the errored data """ self.extract_csv_content(csv_info=csv, core_ext_type=CoreOrExtType.CORE) diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index 770165e..b50ac93 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -22,7 +22,7 @@ from pandas.errors import EmptyDataError from pandas.io import parsers from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, - CsvFileType, 
Defaults, Eml, Terms, get_keys, + ContentData, Defaults, Eml, Terms, get_keys, MetaDwCA, MetaElementInfo, MetaElementTypes, MetaElementAttributes, Stat, record_diff_stat) @@ -488,16 +488,16 @@ def _delete_content(self, content, delete_content): content = self._filter_content(delete_content, content.df_content) return content - def delete_records(self, records_to_delete: CsvFileType): + def delete_records(self, records_to_delete: ContentData): """Delete records from either a core or extension content frame :param records_to_delete: A CSV file of records to delete, keyed to the DwCA file """ delete_content = pd.DataFrame() - if isinstance(records_to_delete.files, pd.DataFrame): - delete_content = records_to_delete.files.copy(deep=True) + if isinstance(records_to_delete.data, pd.DataFrame): + delete_content = records_to_delete.data.copy(deep=True) else: - delete_content = self._combine_contents(records_to_delete.files, records_to_delete.csv_encoding, + delete_content = self._combine_contents(records_to_delete.data, records_to_delete.csv_encoding, use_chunking=False) valid_delete_file = (all(col in delete_content.columns for col in records_to_delete.keys) or len(delete_content) > 0) @@ -735,7 +735,7 @@ def convert_associated_media_to_extension(self): if len(image_df) > 0: self._update_meta_fields(content=self.core_content, key_field=self.core_content.keys[0]) log.info("%s associated media extracted", str(len(image_df))) - return CsvFileType(files=image_df, type=MetaElementTypes.MULTIMEDIA, + return ContentData(data=image_df, type=MetaElementTypes.MULTIMEDIA, keys=self.core_content.keys) log.info("Nothing to extract from associated media") @@ -886,20 +886,20 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N return True if validation_success else False - def extract_csv_content(self, csv_info: CsvFileType, + def extract_csv_content(self, csv_info: ContentData, core_ext_type: CoreOrExtType, build_coreid_for_ext: bool = False): - """Read 
the files from a CSV description into a content frame and include it in the Dwca. + """Read the data from a CSV description into a content frame and include it in the Dwca. :param csv_info: The CSV file(s) :param core_ext_type: Whether this is a core or extension content frame :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension """ - if isinstance(csv_info.files, pd.DataFrame) : - csv_content = csv_info.files - elif isinstance(csv_info.files, io.TextIOWrapper): - csv_content = self._read_csv(csv_info.files) + if isinstance(csv_info.data, pd.DataFrame) : + csv_content = csv_info.data + elif isinstance(csv_info.data, io.TextIOWrapper): + csv_content = self._read_csv(csv_info.data) else: - csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding) + csv_content = self._combine_contents(csv_info.data, csv_info.csv_encoding) # Use default keys if not provided if core_ext_type == CoreOrExtType.CORE: diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 83ae050..4a7ff1a 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -6,7 +6,7 @@ import logging from typing import Union import pandas as pd -from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes, CSVEncoding, get_keys +from dwcahandler.dwca import ContentData, Dwca, Terms, Eml, MetaElementTypes, CSVEncoding, get_keys from io import BytesIO from pathlib import Path from zipfile import ZipFile @@ -32,11 +32,7 @@ def get_contents_from_file_names(files: list) -> (dict[MetaElementTypes, str], d Core content will always be event if present, otherwise, occurrence content :param files: list of files - :param output_dwca: Where to place the resulting Dwca - :param eml_content: eml content in string or Eml class - :param csv_encoding: delimiter for txt file. 
Default is comma delimiter txt files if not supplied - :param content_keys: optional dictionary of MetaElementTypes and key list - for eg. {MetaElementTypes.OCCURRENCE, ["occurrenceID"]} + :return dict of core content type and file name and dict containing ext content type and file name """ def derive_type(file_list: list) -> dict[str, MetaElementTypes]: file_types = {} @@ -64,9 +60,11 @@ def derive_type(file_list: list) -> dict[str, MetaElementTypes]: def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(), content_keys: dict[MetaElementTypes, list] = None): - """Create a suitable DwCA from a list of CSV files + """Helper function to create a dwca based on a list of txt files. The file names will determine the class type + Builds event core dwca if event.txt is supplied, + otherwise build an occurrence core dwca if occurrence.txt is supplied. - :param files: Zip file containing txt files + :param files: List of txt files :param output_dwca: Where to place the resulting Dwca :param eml_content: eml content in string or Eml class :param csv_encoding: delimiter for txt file. 
Default is comma delimiter txt files if not supplied @@ -78,11 +76,11 @@ def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], core_filename = next(iter(core_content)) core_type = core_content[core_filename] - core_content = CsvFileType(files=[core_filename], type=core_type, csv_encoding=csv_encoding, + core_content = ContentData(data=[core_filename], type=core_type, csv_encoding=csv_encoding, keys=get_keys(type=core_type, override_content_keys=content_keys)) ext_content = [] for ext_file, ext_type in ext_content_list.items(): - ext_content.append(CsvFileType(files=[ext_file], + ext_content.append(ContentData(data=[ext_file], type=ext_type, csv_encoding=csv_encoding, keys=get_keys(type=ext_type, override_content_keys=content_keys))) @@ -95,7 +93,10 @@ def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO], eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(), content_keys: dict[MetaElementTypes, list] = None): - """Create a suitable DwCA from a list of CSV files + """Helper function to create a dwca based on a list of txt files in a zip file. + The file names will determine the class type + Builds event core dwca if event.txt is supplied, + otherwise build an occurrence core dwca if occurrence.txt is supplied. 
:param zip_file: Zip file containing txt files :param output_dwca: Where to place the resulting Dwca @@ -110,13 +111,13 @@ def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO] if core_content: core_filename = next(iter(core_content)) core_type = core_content[core_filename] - core_content = CsvFileType(files=io.TextIOWrapper(zf.open(core_filename), encoding="utf-8"), + core_content = ContentData(data=io.TextIOWrapper(zf.open(core_filename), encoding="utf-8"), type=core_type, csv_encoding=csv_encoding, keys=get_keys(type=core_type, override_content_keys=content_keys)) ext_content = [] for ext_file, ext_type in ext_content_list.items(): - ext_content.append(CsvFileType(files=io.TextIOWrapper(zf.open(ext_file), encoding="utf-8"), + ext_content.append(ContentData(data=io.TextIOWrapper(zf.open(ext_file), encoding="utf-8"), type=ext_type, csv_encoding=csv_encoding, keys=get_keys(type=ext_type, override_content_keys=content_keys))) @@ -126,12 +127,12 @@ def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO] raise ValueError("The core content cannot be determined. 
Please check filename in zip file") @staticmethod - def create_dwca(core_csv: CsvFileType, + def create_dwca(core_csv: ContentData, output_dwca: Union[str, BytesIO], - ext_csv_list: list[CsvFileType] = None, + ext_csv_list: list[ContentData] = None, validate_content: bool = True, eml_content: Union[str, Eml] = ''): - """Create a suitable DwCA from a list of CSV files + """Create a suitable DwCA from a list of CSV data :param core_csv: The core source :param ext_csv_list: A list of extension sources @@ -143,13 +144,13 @@ def create_dwca(core_csv: CsvFileType, validate_content=validate_content, eml_content=eml_content) @staticmethod - def delete_records(dwca_file: Union[str, BytesIO], records_to_delete: CsvFileType, + def delete_records(dwca_file: Union[str, BytesIO], records_to_delete: ContentData, output_dwca: Union[str, BytesIO]): """Delete core records listed in the records_to_delete file from DwCA. The specified keys listed in records_to_delete param must exist in the dwca core file - :param dwca_file: The path to the DwCA - :param records_to_delete: File containing the records to delete and the column key for mapping + :param dwca_file: The path to the DwCA or ByteIO of the DwCA + :param records_to_delete: content containing the records to delete and the column key for mapping :param output_dwca: Where to place the resulting DwCA or the dwca output in memory """ Dwca(dwca_file_loc=dwca_file).delete_records_in_dwca(records_to_delete=records_to_delete, @@ -187,7 +188,7 @@ def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, err return Dwca(dwca_file_loc=dwca_file).validate_dwca(content_keys, error_file) @staticmethod - def validate_file(csv_file: CsvFileType, error_file: str = None): + def validate_file(csv_file: ContentData, error_file: str = None): """Test a CSV file for consistency :param csv_file: The path to the CSV diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py index 
ce633ed..055e789 100644 --- a/tests/test_create_core_and_ext_content.py +++ b/tests/test_create_core_and_ext_content.py @@ -4,7 +4,7 @@ import pytest import pandas as pd from pandas import testing as pdtest -from dwcahandler.dwca import CSVEncoding, CsvFileType, CoreOrExtType, MetaElementTypes +from dwcahandler.dwca import CSVEncoding, ContentData, CoreOrExtType, MetaElementTypes from dwcahandler.dwca.core_dwca import Dwca @@ -35,7 +35,7 @@ def get_expected_combined_occ_df(file_paths: list, keys: list, delimiter: str = @pytest.fixture def test_case(request): - yield {"file_type": CsvFileType(files=request.param["file_paths"], + yield {"file_type": ContentData(data=request.param["file_paths"], type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=request.param["delimiter"])), @@ -87,7 +87,7 @@ def test_extract_csv_ext_content(self): dwca_creator = Dwca() - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multiple_csv_occ_test['file_paths'], + dwca_creator.extract_csv_content(csv_info=ContentData(data=multiple_csv_occ_test['file_paths'], type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( @@ -95,7 +95,7 @@ def test_extract_csv_ext_content(self): core_ext_type=CoreOrExtType.CORE) multimedia_file_path = 'input_files/sample/multimedia/multimedia_file.csv' - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=[multimedia_file_path], + dwca_creator.extract_csv_content(csv_info=ContentData(data=[multimedia_file_path], type=MetaElementTypes.MULTIMEDIA, keys=['catalogNumber'], csv_encoding=CSVEncoding(csv_delimiter=',')), @@ -124,7 +124,7 @@ def test_extract_tsv_ext_content(self): dwca_creator = Dwca() - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multiple_tsv_occ_test['file_paths'], + dwca_creator.extract_csv_content(csv_info=ContentData(data=multiple_tsv_occ_test['file_paths'], type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( @@ 
-132,7 +132,7 @@ def test_extract_tsv_ext_content(self): core_ext_type=CoreOrExtType.CORE) multimedia_file_path = 'input_files/sample/multimedia/multimedia_file.tsv' - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=[multimedia_file_path], + dwca_creator.extract_csv_content(csv_info=ContentData(data=[multimedia_file_path], type=MetaElementTypes.MULTIMEDIA, csv_encoding=CSVEncoding(csv_delimiter='\t')), core_ext_type=CoreOrExtType.EXTENSION) @@ -159,7 +159,7 @@ def test_extract_csv_with_header_space(self): dwca_creator = Dwca() - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], + dwca_creator.extract_csv_content(csv_info=ContentData(data=csv_occ_with_space['file_paths'], type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( @@ -181,14 +181,14 @@ def test_extract_csv_ext_with_header_space(self): dwca_creator = Dwca() - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=csv_occ_with_space['file_paths'], + dwca_creator.extract_csv_content(csv_info=ContentData(data=csv_occ_with_space['file_paths'], type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber'], csv_encoding=CSVEncoding( csv_delimiter=csv_occ_with_space["delimiter"])), core_ext_type=CoreOrExtType.CORE) - dwca_creator.extract_csv_content(csv_info=CsvFileType(files=multimedia_with_space['file_paths'], + dwca_creator.extract_csv_content(csv_info=ContentData(data=multimedia_with_space['file_paths'], type=MetaElementTypes.MULTIMEDIA, csv_encoding=CSVEncoding(csv_delimiter=',')), core_ext_type=CoreOrExtType.EXTENSION) diff --git a/tests/test_create_dwca.py b/tests/test_create_dwca.py index 57c3c3d..9074131 100644 --- a/tests/test_create_dwca.py +++ b/tests/test_create_dwca.py @@ -1,6 +1,6 @@ import pandas as pd -from dwcahandler import CsvFileType, DwcaHandler, MetaElementTypes +from dwcahandler import ContentData, DwcaHandler, MetaElementTypes from pathlib import Path from io import BytesIO from tests import 
get_eml_content, get_xml_from_file @@ -52,9 +52,9 @@ class TestCreateDwca: def test_create_occurrence_dwca_occurrence(self): test_files_folder = "./input_files/occurrence/sample1" - core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], + core_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], + ext1_csv = ContentData(data=[f"{test_files_folder}/multimedia.txt"], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -69,10 +69,10 @@ def test_create_occurrence_dwca_occurrence(self): def test_create_occurrence_dwca_occurrence_multiple_keys(self): test_files_folder = "./input_files/occurrence/sample2" - core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + core_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], + ext1_csv = ContentData(data=[f"{test_files_folder}/multimedia.txt"], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -87,10 +87,10 @@ def test_create_occurrence_dwca_occurrence_multiple_keys(self): def test_create_occurrence_dwca_occurrence_extra_multimedia_records(self): test_files_folder = "./input_files/occurrence/sample3" - core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + core_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], keys=['institutionCode', 'collectionCode', 'catalogNumber'], type=MetaElementTypes.OCCURRENCE) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/multimedia.txt"], + ext1_csv = ContentData(data=[f"{test_files_folder}/multimedia.txt"], type=MetaElementTypes.MULTIMEDIA) output_obj = BytesIO() @@ -106,11 +106,11 @@ def test_create_event_dwca_sample1(self): test_files_folder = 
"./input_files/event/cameratrap-sample1" - core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], + core_csv = ContentData(data=[f"{test_files_folder}/event.txt"], keys=['eventID'], type=MetaElementTypes.EVENT) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], + ext1_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext2_csv = CsvFileType(files=[f"{test_files_folder}/measurement_or_fact.txt"], + ext2_csv = ContentData(data=[f"{test_files_folder}/measurement_or_fact.txt"], type=MetaElementTypes.MEASUREMENT_OR_FACT) output_obj = BytesIO() @@ -126,11 +126,11 @@ def test_create_event_dwca_sample2(self): test_files_folder = "./input_files/event/cameratrap-sample2" - core_csv = CsvFileType(files=[f"{test_files_folder}/event.txt"], keys=['eventID'], + core_csv = ContentData(data=[f"{test_files_folder}/event.txt"], keys=['eventID'], type=MetaElementTypes.EVENT) - ext1_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], + ext1_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext2_csv = CsvFileType(files=[f"{test_files_folder}/extended_measurement_or_fact.txt"], + ext2_csv = ContentData(data=[f"{test_files_folder}/extended_measurement_or_fact.txt"], type=MetaElementTypes.EXTENDED_MEASUREMENT_OR_FACT) output_obj = BytesIO() @@ -145,7 +145,7 @@ def test_create_event_dwca_sample2(self): def test_create_occurrence_dwca_occurrence_without_ext(self): test_files_folder = "./input_files/occurrence/sample4" - core_csv = CsvFileType(files=[f"{test_files_folder}/occurrence.txt"], + core_csv = ContentData(data=[f"{test_files_folder}/occurrence.txt"], type=MetaElementTypes.OCCURRENCE) output_obj = BytesIO() diff --git a/tests/test_delete_dwca_content.py b/tests/test_delete_dwca_content.py index f6e291b..5662fae 100644 --- 
a/tests/test_delete_dwca_content.py +++ b/tests/test_delete_dwca_content.py @@ -3,7 +3,7 @@ from zipfile import ZipFile from tests import make_meta_xml_str, remove_pretty_print_xml from tests import make_dwca -from dwcahandler import DwcaHandler, CsvFileType, MetaElementTypes +from dwcahandler import DwcaHandler, ContentData, MetaElementTypes from io import BytesIO @@ -24,7 +24,7 @@ def test_delete_core_records(self): ["occ3", "species3"]], columns=['occurrenceID', 'scientificName']) - delete_records = CsvFileType(files=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) + delete_records = ContentData(data=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) output_obj = BytesIO() @@ -69,7 +69,7 @@ def test_delete_records_dwca_ext(self): ["occ3", "species3"]], columns=["occurrenceID", "scientificName"]) - delete_records = CsvFileType(files=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) + delete_records = ContentData(data=delete_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']) output_obj = BytesIO() diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index 25bdc6f..0ec915a 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -196,7 +196,8 @@ def test_merge_core_and_ext_records_with_id(self): columns=["occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", "basisOfRecord"]) - delta_multimedia_df = pd.DataFrame(data=[["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + delta_multimedia_df = pd.DataFrame(data=[["3", "https://image3.jpg", "image/webp", "StillImage", "RightsHolder3"], + ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], ["4", "https://image4.webp", "image/webp", "StillImage", nan], ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], @@ -208,6 +209,7 @@ def 
test_merge_core_and_ext_records_with_id(self): keys_lookup: dict = dict() keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + keys_lookup[MetaElementTypes.MULTIMEDIA] = ['identifier'] DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, output_dwca=output_obj, @@ -224,6 +226,7 @@ def test_merge_core_and_ext_records_with_id(self): expected_multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage", nan], ["2", "https://image2.jpg", "image/jpeg", "StillImage", nan], + ["3", "https://image3.jpg", "image/webp", "StillImage", "RightsHolder3"], ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], ["4", "https://image4.webp", "image/webp", "StillImage", nan], ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], @@ -277,7 +280,8 @@ def test_merge_core_and_ext_records_with_separate_id(self): columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", "basisOfRecord"]) - delta_multimedia_df = pd.DataFrame(data=[["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + delta_multimedia_df = pd.DataFrame(data=[["3", "https://image3.jpg", "image/webp", "StillImage", "RightsHolder3"], + ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], ["4", "https://image4.webp", "image/webp", "StillImage", nan], ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], @@ -289,6 +293,7 @@ def test_merge_core_and_ext_records_with_separate_id(self): keys_lookup: dict = dict() keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + keys_lookup[MetaElementTypes.MULTIMEDIA] = ['identifier'] DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, output_dwca=output_obj, @@ -303,6 +308,89 @@ def test_merge_core_and_ext_records_with_separate_id(self): columns=["id", "occurrenceID", 
"scientificName", "decimalLatitude", "decimalLongitude", "basisOfRecord"]) + expected_multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage", nan], + ["2", "https://image2.jpg", "image/jpeg", "StillImage", nan], + ["3", "https://image3.jpg", "image/webp", "StillImage", "RightsHolder3"], + ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + ["4", "https://image4.webp", "image/webp", "StillImage", nan], + ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], + ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], + columns=["coreid", "identifier", "format", "type", "rightsHolder"]) + + expected_meta_xml = make_meta_xml_str(core_df=expected_occ_df, ext_df=expected_multimedia_df, use_col_idx_as_core_id=0) + + with ZipFile(output_obj, 'r') as zf: + files = zf.namelist() + assert 'occurrence.csv' in files + assert 'multimedia.csv' in files + assert 'meta.xml' in files + assert 'eml.xml' in files + + with zf.open('meta.xml') as meta_xml_file: + meta_str = meta_xml_file.read().decode("utf-8") + assert remove_pretty_print_xml(meta_str) == remove_pretty_print_xml(expected_meta_xml) + + with zf.open('occurrence.csv') as occ_file: + df_output = pd.read_csv(occ_file, dtype='str') + pd.testing.assert_frame_equal(df_output, expected_occ_df) + + with zf.open('multimedia.csv') as multimedia_file: + multimedia_df_output = pd.read_csv(multimedia_file, dtype='str') + pd.testing.assert_frame_equal(multimedia_df_output, expected_multimedia_df) + + zf.close() + + def test_merge_core_and_ext_records_with_ext_sync(self): + """ + Test for core and extension record merging (update existing and add new rows, columns) + Occurrence, multimedia and meta xml output is merged as expected + """ + occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000"], + ["2", "occ2", "species2", "-28.0000", "115.0000"], + ["3", "occ3", "species3", "-36.0000", "144.30848"]], + 
columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude"]) + + multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage"], + ["2", "https://image2.jpg", "image/jpeg", "StillImage"], + ["3", "https://image3.jpg", "image/jpeg", "StillImage"]], + columns=["id", "identifier", "format", "type"]) + + dwca_ext_obj = make_dwca(core_content=occ_df, ext_mult_content=multimedia_df, use_col_idx_as_core_id=0) + + delta_occ_df = pd.DataFrame(data=[["3", "occ3", "species3", "-40.0000", "144.0000", "Observation"], + ["4", "occ4", "species4", "-10.0000", "144.0000", "Observation"], + ["5", "occ5", "species5", "-20.0000", "145.0000", nan], + ["6", "occ6", "species6", "-30.0000", "146.3048", nan]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", + "basisOfRecord"]) + + delta_multimedia_df = pd.DataFrame(data=[["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + ["4", "https://image4.webp", "image/webp", "StillImage", nan], + ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], + ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], + columns=["id", "identifier", "format", "type", "rightsHolder"]) + + delta_dwca_ext_obj = make_dwca(core_content=delta_occ_df, ext_mult_content=delta_multimedia_df, use_col_idx_as_core_id=0) + + output_obj = BytesIO() + + keys_lookup: dict = dict() + keys_lookup[MetaElementTypes.OCCURRENCE] = ['occurrenceID'] + keys_lookup[MetaElementTypes.MULTIMEDIA] = ['identifier'] + + DwcaHandler.merge_dwca(dwca_file=dwca_ext_obj, delta_dwca_file=delta_dwca_ext_obj, + output_dwca=output_obj, + keys_lookup=keys_lookup, extension_sync=True) + + expected_occ_df = pd.DataFrame(data=[["1", "occ1", "species1", "-30.0000", "144.0000", nan], + ["2", "occ2", "species2", "-28.0000", "115.0000", nan], + ["3", "occ3", "species3", "-40.0000", "144.0000", "Observation"], + ["4", "occ4", "species4", 
"-10.0000", "144.0000", "Observation"], + ["5", "occ5", "species5", "-20.0000", "145.0000", nan], + ["6", "occ6", "species6", "-30.0000", "146.3048", nan]], + columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", + "basisOfRecord"]) + expected_multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage", nan], ["2", "https://image2.jpg", "image/jpeg", "StillImage", nan], ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], @@ -333,3 +421,4 @@ def test_merge_core_and_ext_records_with_separate_id(self): pd.testing.assert_frame_equal(multimedia_df_output, expected_multimedia_df) zf.close() + diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index a6f2c81..25324df 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -1,6 +1,6 @@ import pandas as pd import dwcahandler -from dwcahandler.dwca import CsvFileType, CoreOrExtType, MetaElementTypes +from dwcahandler.dwca import ContentData, CoreOrExtType, MetaElementTypes from dwcahandler.dwca.core_dwca import Dwca from operator import attrgetter import logging @@ -17,11 +17,11 @@ INVALID_URL = "test" DELETED_MEDIA_URL = "https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=nonexistent" -image_ext = CsvFileType(files=pd.DataFrame(data=[["1", IMAGE_URL], - ["2", AUDIO_URL], - ["3", VIDEO_URL], - ["3", MIMETYPE_IMAGE_URL]], - columns=['occurrenceID', 'identifier']), +image_ext = ContentData(data=pd.DataFrame(data=[["1", IMAGE_URL], + ["2", AUDIO_URL], + ["3", VIDEO_URL], + ["3", MIMETYPE_IMAGE_URL]], + columns=['occurrenceID', 'identifier']), type=MetaElementTypes.MULTIMEDIA, keys=['occurrenceID']) @@ -53,7 +53,7 @@ def test_extract_associate_media(self): dwca = Dwca() - dwca.extract_csv_content(csv_info=CsvFileType(files=occ_associated_media_df, + dwca.extract_csv_content(csv_info=ContentData(data=occ_associated_media_df, keys=['occurrenceID'], 
type=MetaElementTypes.OCCURRENCE), core_ext_type=CoreOrExtType.CORE) @@ -64,7 +64,7 @@ def test_extract_associate_media(self): assert sorted(list(map(attrgetter('field_name'), dwca.meta_content.meta_elements[0].fields))) == \ sorted(['occurrenceID', 'scientificName']) - pd.testing.assert_frame_equal(associated_media_image_ext.files.reset_index(drop=True), image_ext.files) + pd.testing.assert_frame_equal(associated_media_image_ext.data.reset_index(drop=True), image_ext.data) assert associated_media_image_ext.type == image_ext.type assert associated_media_image_ext.keys[0] == image_ext.keys[0] @@ -74,7 +74,7 @@ def test_extract_associate_media(self): # Compare multimedia ext dataframe (without the coreid) against the expected image_ext dataframe pd.testing.assert_frame_equal(dwca.ext_content[0].df_content.reset_index(drop=True), - image_ext.files, check_index_type=False) + image_ext.data, check_index_type=False) # Check the meta content is updated assert sorted(list(map(attrgetter('field_name'), dwca.meta_content.meta_elements[1].fields))) == \ @@ -90,10 +90,10 @@ def test_fill_additional_multimedia_info(self, mock_mime_types): dwca = Dwca() # Extract core occurrence - dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=[["1", "species1"], - ["2", "species2"], - ["3", "species3"]], - columns=['occurrenceID', 'scientificName']), + dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=[["1", "species1"], + ["2", "species2"], + ["3", "species3"]], + columns=['occurrenceID', 'scientificName']), type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -123,16 +123,16 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim dwca = Dwca() # Extract core occurrence - dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=[["1", "species1"], - ["2", "species2"], - ["3", "species3"], - ["4", "species4"], - ["5", "species5"], - ["6", "species6"], - ["7", "species7"], 
- ["8", "species8"], - ["9", "species9"]], - columns=['occurrenceID', 'scientificName']), + dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=[["1", "species1"], + ["2", "species2"], + ["3", "species3"], + ["4", "species4"], + ["5", "species5"], + ["6", "species6"], + ["7", "species7"], + ["8", "species8"], + ["9", "species9"]], + columns=['occurrenceID', 'scientificName']), type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -148,8 +148,8 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim ["9", None, None, None]] # Extract multimedia ext without format - dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, - columns=["occurrenceID", "identifier", + dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=image_data, + columns=["occurrenceID", "identifier", "format", "type"]), type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) @@ -182,15 +182,15 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): dwca = Dwca() # Extract core occurrence - dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=[["1", "species1"], - ["2", "species2"], - ["3", "species3"], - ["4", "species4"], - ["5", "species5"], - ["6", "species6"], - ["7", "species7"], - ["8", "species8"]], - columns=['occurrenceID', 'scientificName']), + dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=[["1", "species1"], + ["2", "species2"], + ["3", "species3"], + ["4", "species4"], + ["5", "species5"], + ["6", "species6"], + ["7", "species7"], + ["8", "species8"]], + columns=['occurrenceID', 'scientificName']), type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID']), core_ext_type=CoreOrExtType.CORE) @@ -205,8 +205,8 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): ["8", INVALID_MIMETYPE_URL, None]] # Extract multimedia ext without format - 
dwca.extract_csv_content(csv_info=CsvFileType(files=pd.DataFrame(data=image_data, - columns=["occurrenceID", "identifier", + dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=image_data, + columns=["occurrenceID", "identifier", "format"]), type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) diff --git a/tests/test_write_dwca.py b/tests/test_write_dwca.py index 9e32400..6920fc0 100644 --- a/tests/test_write_dwca.py +++ b/tests/test_write_dwca.py @@ -1,5 +1,5 @@ import io -from dwcahandler import DwcaHandler, CsvFileType, CoreOrExtType, MetaElementTypes +from dwcahandler import DwcaHandler, ContentData, CoreOrExtType, MetaElementTypes from zipfile import ZipFile from pathlib import Path import xml.etree.ElementTree as ET @@ -34,7 +34,7 @@ def test_generate_dwca_without_ext(self): """ Test that generated dwca is valid with core occ data """ - core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'], + core_csv = ContentData(data=[occurrence_sample_file], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) p = Path("temp") p.mkdir(parents=True, exist_ok=True) @@ -71,9 +71,9 @@ def test_generate_dwca_with_ext(self): """ Test that generated dwca is valid with core occ and multimedia data """ - core_csv = CsvFileType(files=[occurrence_sample_file], keys=['occurrenceID'], + core_csv = ContentData(data=[occurrence_sample_file], keys=['occurrenceID'], type=MetaElementTypes.OCCURRENCE) - ext_csv = CsvFileType(files=[multimedia_sample_file], keys=['occurrenceID'], + ext_csv = ContentData(data=[multimedia_sample_file], keys=['occurrenceID'], type=MetaElementTypes.MULTIMEDIA) p = Path("temp") p.mkdir(parents=True, exist_ok=True) @@ -131,7 +131,7 @@ def test_generate_dwca_in_memory(self): ["3", "species3"]], columns=['catalogNumber', 'scientificName']) - core_csv = CsvFileType(files=occ_df, + core_csv = ContentData(data=occ_df, type=MetaElementTypes.OCCURRENCE, keys=['catalogNumber']) From 
1c10c40c475a2d501a1103bb2f13bd112b486d3b Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 19 Mar 2025 13:37:09 +1100 Subject: [PATCH 16/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - fix flake8 linting formating --- src/dwcahandler/dwca/dwca_factory.py | 4 ++-- tests/test_merge_dwca.py | 1 - tests/test_multimedia_content.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 4a7ff1a..316ee72 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -37,7 +37,7 @@ def get_contents_from_file_names(files: list) -> (dict[MetaElementTypes, str], d def derive_type(file_list: list) -> dict[str, MetaElementTypes]: file_types = {} for file in file_list: - if (filename:=Path(file).stem.upper()) in dict(MetaElementTypes.__members__.items()).keys(): + if (filename := Path(file).stem.upper()) in dict(MetaElementTypes.__members__.items()).keys(): file_types[file] = dict(MetaElementTypes.__members__.items())[filename] return file_types @@ -122,7 +122,7 @@ def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO] keys=get_keys(type=ext_type, override_content_keys=content_keys))) DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, - eml_content=eml_content) + eml_content=eml_content) else: raise ValueError("The core content cannot be determined. 
Please check filename in zip file") diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index 0ec915a..e5524ad 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -421,4 +421,3 @@ def test_merge_core_and_ext_records_with_ext_sync(self): pd.testing.assert_frame_equal(multimedia_df_output, expected_multimedia_df) zf.close() - diff --git a/tests/test_multimedia_content.py b/tests/test_multimedia_content.py index 25324df..d999479 100644 --- a/tests/test_multimedia_content.py +++ b/tests/test_multimedia_content.py @@ -150,7 +150,7 @@ def test_fill_multimedia_info_with_format_type_partially_supplied(self, mock_mim # Extract multimedia ext without format dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", - "format", "type"]), + "format", "type"]), type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) @@ -207,7 +207,7 @@ def test_fill_multimedia_info_type_from_format(self, mock_mime_types): # Extract multimedia ext without format dwca.extract_csv_content(csv_info=ContentData(data=pd.DataFrame(data=image_data, columns=["occurrenceID", "identifier", - "format"]), + "format"]), type=MetaElementTypes.MULTIMEDIA), core_ext_type=CoreOrExtType.EXTENSION) From a09f2c71a81a09fa846b750b8823d87820b5d7c0 Mon Sep 17 00:00:00 2001 From: Mahmoud Date: Wed, 19 Mar 2025 14:38:14 +1100 Subject: [PATCH 17/23] changed an image extension merge test --- tests/test_merge_dwca.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_merge_dwca.py b/tests/test_merge_dwca.py index e5524ad..16191aa 100644 --- a/tests/test_merge_dwca.py +++ b/tests/test_merge_dwca.py @@ -1,12 +1,13 @@ -import pandas as pd -from zipfile import ZipFile -from tests import make_meta_xml_str, remove_pretty_print_xml -from tests import make_dwca -from dwcahandler import DwcaHandler, MetaElementTypes from io import BytesIO +from zipfile import ZipFile + +import 
pandas as pd from numpy import nan +from dwcahandler import DwcaHandler, MetaElementTypes +from tests import make_dwca, make_meta_xml_str, remove_pretty_print_xml + class TestMergeDwcaContent: @@ -352,7 +353,8 @@ def test_merge_core_and_ext_records_with_ext_sync(self): multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage"], ["2", "https://image2.jpg", "image/jpeg", "StillImage"], - ["3", "https://image3.jpg", "image/jpeg", "StillImage"]], + ["3", "https://image3-0.jpg", "image/jpeg", "StillImage"], + ["3", "https://image3-1.jpg", "image/jpeg", "StillImage"]], columns=["id", "identifier", "format", "type"]) dwca_ext_obj = make_dwca(core_content=occ_df, ext_mult_content=multimedia_df, use_col_idx_as_core_id=0) @@ -364,7 +366,8 @@ def test_merge_core_and_ext_records_with_ext_sync(self): columns=["id", "occurrenceID", "scientificName", "decimalLatitude", "decimalLongitude", "basisOfRecord"]) - delta_multimedia_df = pd.DataFrame(data=[["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], + delta_multimedia_df = pd.DataFrame(data=[["3", "https://image3-0.jpg", "image/jpeg", "StillImage", nan], + ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], ["4", "https://image4.webp", "image/webp", "StillImage", nan], ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], ["6", "https://image6.webp", "image/webp", "StillImage", "RightsHolder6"]], @@ -393,6 +396,7 @@ def test_merge_core_and_ext_records_with_ext_sync(self): expected_multimedia_df = pd.DataFrame(data=[["1", "https://image1.jpg", "image/jpeg", "StillImage", nan], ["2", "https://image2.jpg", "image/jpeg", "StillImage", nan], + ["3", "https://image3-0.jpg", "image/jpeg", "StillImage", nan], ["3", "https://new-image3.webp", "image/webp", "StillImage", "RightsHolder3"], ["4", "https://image4.webp", "image/webp", "StillImage", nan], ["5", "https://image5.webp", "image/webp", "StillImage", "RightsHolder5"], From 
a641fd14f15952c90c0c33b41099e6fe8ea2f3d0 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Wed, 19 Mar 2025 17:14:06 +1100 Subject: [PATCH 18/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - review feedback, remove hardcoded id and coreid --- README.md | 24 +++++++++--------- src/dwcahandler/dwca/__init__.py | 13 +++++++--- src/dwcahandler/dwca/core_dwca.py | 37 ++++++++++++++-------------- src/dwcahandler/dwca/dwca_factory.py | 8 +++--- src/dwcahandler/dwca/dwca_meta.py | 8 +++--- tests/__init__.py | 4 +-- 6 files changed, 49 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 305412f..53da77e 100644 --- a/README.md +++ b/README.md @@ -89,12 +89,12 @@ DwcaHandler.list_class_rowtypes() ### Examples of dwcahandler usages: * Create Darwin Core Archive from csv file. -* Keys are used as id/core id for Dwca with extensions and must be supplied for the core and extensions in the data -* Validation is performed to make sure that the keys are unique in the core of the Dwca -* If Keys are not provided, the default keys is occurrenceID -* If multiple Keys are supplied, resulting dwca would generate id/core id +* Keys in core content are used as id/core id for Dwca with extensions and must be supplied in the data for core and extensions +* If core data have more than 1 key (for eg: institutionCode, collectionCode and catalogNumber), resulting dwca would generate id/core id for extension +* Validation is performed to make sure that the keys are unique in the core of the Dwca by default +* If keys are supplied for the content extension, the validation will be run to check the uniqueness of the keys in the content +* If keys are not provided, the default keys is eventID for event content and occurrenceID for occurrence content * In creating a dwca with multimedia extension, provide format and type values in the Simple Multimedia extension, otherwise, dwcahandler will attempt to fill these info by guessing the mimetype from url. 
- ```python from dwcahandler import ContentData from dwcahandler import DwcaHandler @@ -139,9 +139,9 @@ DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content ```   * Convenient helper function to build Darwin Core Archive from a list of csv files. -* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied. -* Raises error if neither event.txt not occurrence.txt is in the list -* Class row types are determined by file names of the csvs. +* Build event core DwCA if event.txt file is supplied, otherwise, occurrence core DwCA if occurrence.txt is supplied. +* Raises error if neither event.txt nor occurrence.txt is in the list +* Class row types are determined by file names of the text files. * If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content * Delimiter for txt files are comma delimiter by default. For tab delimiter, supply CsvEncoding ```python @@ -158,10 +158,10 @@ DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.csv", "/tmp/occurrence ```   * Convenient helper function to create Darwin Core Archive from csv files in a zip files. -* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied in the zip file -* Raises error if neither event.txt not occurrence.txt is in the list -* Class row types are determined by file names of the csvs. -* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content +* Build event core DwCA if event.txt file is supplied, otherwise, occurrence core DwCA if occurrence.txt is supplied in the zip file +* Raises error if neither event.txt nor occurrence.txt is in the zip file +* Class row types are determined by file names of the text files. +* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content. 
* Delimiter for txt files are comma delimiter by default. For tab delimiter, supply CsvEncoding ```python from dwcahandler import DwcaHandler diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index 370a7ca..a59380b 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -31,25 +31,30 @@ EXTENSION="extension" ) +MetaDefaultFields = namedtuple("MetaDefaultFields", ["ID", "CORE_ID"])( + ID="id", + CORE_ID="coreid" +) + # Default keys for content when creating dwca DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])( EVENT = "eventID", OCCURRENCE = "occurrenceID" ) -def get_keys(type: MetaElementTypes, override_content_keys: dict[[MetaElementTypes, list]] = None): +def get_keys(class_type: MetaElementTypes, override_content_keys: dict[[MetaElementTypes, list]] = None): """ # If override_content_keys not supplied, return the default keys based on content type - :param type: type of content + :param class_type: class_type of content :param override_content_keys: given content keys :return: the list of keys for the content """ if override_content_keys: for content_type, keys in override_content_keys.items(): - if type == content_type and keys and len(keys) > 0: + if class_type == content_type and keys and len(keys) > 0: return keys defaults = DefaultKeys._asdict() - return [defaults[type.name]] if type.name in defaults.keys() else [] + return [defaults[class_type.name]] if class_type.name in defaults.keys() else [] @dataclass class CSVEncoding: diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index b50ac93..ec98d41 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -21,7 +21,7 @@ from numpy import nan from pandas.errors import EmptyDataError from pandas.io import parsers -from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, +from dwcahandler.dwca import (BaseDwca, CoreOrExtType, MetaDefaultFields, CSVEncoding, 
ContentData, Defaults, Eml, Terms, get_keys, MetaDwCA, MetaElementInfo, MetaElementTypes, MetaElementAttributes, Stat, record_diff_stat) @@ -93,13 +93,11 @@ def _update_core_ids(self, core_df) -> str: :param core_df: The data frame to generate identifiers for return id field """ - id_field = "id" - if id_field not in core_df.columns.to_list(): - core_df.insert(0, id_field, core_df.apply(lambda _: uuid.uuid4(), axis=1), False) - return id_field + if MetaDefaultFields.ID not in core_df.columns.to_list(): + core_df.insert(0, MetaDefaultFields.ID, core_df.apply(lambda _: uuid.uuid4(), axis=1), False) + return MetaDefaultFields.ID else: raise ValueError("core df should not contain id column") - # core_df['id'] = core_df['id'].map(lambda _: uuid.uuid4()) def _update_df(self, to_update_df, lookup_df, update_field, from_update_field): """Update a data frame via lookup @@ -142,23 +140,23 @@ def _update_extension_ids(self, csv_content: pd.DataFrame, core_df_content: pd.D set(link_col).issubset(set(csv_content.index.names))): csv_content.reset_index(inplace=True, drop=True) - csv_content = csv_content.merge(core_df_content.loc[:, 'id'], + csv_content = csv_content.merge(core_df_content.loc[:, MetaDefaultFields.ID], left_on=link_col, right_on=link_col, how='outer') - if 'id' in csv_content.columns.to_list(): - unmatched_content = csv_content[csv_content["id"].isnull()] - unmatched_content = unmatched_content.drop(columns=["id"]) + if MetaDefaultFields.ID in csv_content.columns.to_list(): + unmatched_content = csv_content[csv_content[MetaDefaultFields.ID].isnull()] + unmatched_content = unmatched_content.drop(columns=[MetaDefaultFields.ID]) if len(unmatched_content) > 0: log.info("There are orphaned keys in extension file") pd.set_option("display.max_columns", 7) pd.set_option('display.max_colwidth', 15) pd.set_option('display.max_rows', 10) log.info("\n%s", unmatched_content) - csv_content = csv_content[~csv_content["id"].isnull()] - col = csv_content.pop('id') + csv_content = 
csv_content[~csv_content[MetaDefaultFields.ID].isnull()] + col = csv_content.pop(MetaDefaultFields.ID) csv_content.insert(0, col.name, col) - csv_content.rename(columns={"id": ext_core_id_field}, inplace=True) + csv_content.rename(columns={MetaDefaultFields.ID: ext_core_id_field}, inplace=True) return csv_content, ext_core_id_field else: raise ValueError("Something is not right. The core id failed to be created") @@ -209,7 +207,8 @@ def _find_fields_with_zero_idx(meta_element_fields: list): def _add_first_id_field_if_exists(meta_element: MetaElementAttributes): zero_index_exist = _find_fields_with_zero_idx(meta_element.fields) if meta_element.core_id and meta_element.core_id.index and not zero_index_exist: - return ["id"] if meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE else ["coreid"] + return [MetaDefaultFields.ID] if meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE \ + else [MetaDefaultFields.CORE_ID] else: return [] @@ -288,7 +287,7 @@ def _update_values(self, df_content, delta_df_content, keys, stat): :return: The updated content """ # Extract columns that need updating, excluding self.keys and id - non_update_column = ['id', 'coreid'] + non_update_column = list(MetaDefaultFields) non_update_column.extend(keys) update_columns = [i for i in delta_df_content.columns.to_list() if i not in non_update_column] @@ -430,12 +429,12 @@ def _extract_core_keys(self, core_content, keys): :return: A data frame indexed by the `id` column that contains the key elements for each record """ - columns = ['id'] if "id" in core_content.columns.tolist() else [] + columns = [MetaDefaultFields.ID] if MetaDefaultFields.ID in core_content.columns.tolist() else [] if all(key in core_content.columns for key in keys): columns.extend(keys) df = core_content[columns] - if "id" in core_content.columns.tolist(): - df.set_index('id', drop=True, inplace=True) + if MetaDefaultFields.ID in core_content.columns.tolist(): + 
df.set_index(MetaDefaultFields.ID, drop=True, inplace=True) else: raise ValueError(f"Keys does not exist in core content {''.join(keys)}") return df @@ -904,7 +903,7 @@ def extract_csv_content(self, csv_info: ContentData, # Use default keys if not provided if core_ext_type == CoreOrExtType.CORE: override_keys = {csv_info.type: csv_info.keys} if csv_info.keys and len(csv_info.keys) > 0 else None - keys = get_keys(type=csv_info.type, override_content_keys=override_keys) + keys = get_keys(class_type=csv_info.type, override_content_keys=override_keys) else: keys = self.core_content.keys core_id_field: str = "" diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 316ee72..9fbdb35 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -77,12 +77,12 @@ def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], core_type = core_content[core_filename] core_content = ContentData(data=[core_filename], type=core_type, csv_encoding=csv_encoding, - keys=get_keys(type=core_type, override_content_keys=content_keys)) + keys=get_keys(class_type=core_type, override_content_keys=content_keys)) ext_content = [] for ext_file, ext_type in ext_content_list.items(): ext_content.append(ContentData(data=[ext_file], type=ext_type, csv_encoding=csv_encoding, - keys=get_keys(type=ext_type, + keys=get_keys(class_type=ext_type, override_content_keys=content_keys))) DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, eml_content=eml_content) @@ -113,13 +113,13 @@ def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO] core_type = core_content[core_filename] core_content = ContentData(data=io.TextIOWrapper(zf.open(core_filename), encoding="utf-8"), type=core_type, csv_encoding=csv_encoding, - keys=get_keys(type=core_type, + keys=get_keys(class_type=core_type, override_content_keys=content_keys)) ext_content = [] for ext_file, 
ext_type in ext_content_list.items(): ext_content.append(ContentData(data=io.TextIOWrapper(zf.open(ext_file), encoding="utf-8"), type=ext_type, csv_encoding=csv_encoding, - keys=get_keys(type=ext_type, + keys=get_keys(class_type=ext_type, override_content_keys=content_keys))) DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, eml_content=eml_content) diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index bb3c41f..9465991 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -11,7 +11,7 @@ import re from dataclasses import dataclass, field from typing import Optional -from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms +from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms, MetaDefaultFields from enum import Enum @@ -225,13 +225,13 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): location = ET.SubElement(files, 'location') location.text = meta_elem_attrib.meta_element_type.file_name if meta_elem_attrib.core_id: - id_field = ET.SubElement(elem, 'id') \ + id_field = ET.SubElement(elem, MetaDefaultFields.ID) \ if meta_elem_attrib.meta_element_type.core_or_ext_type == 'core' \ - else ET.SubElement(elem, 'coreid') + else ET.SubElement(elem, MetaDefaultFields.CORE_ID) id_field.attrib['index'] = meta_elem_attrib.core_id.index for _, f in enumerate(meta_elem_attrib.fields): - if f.field_name not in ('id', 'coreid'): + if f.field_name not in list(MetaDefaultFields): field_elem = ET.SubElement(elem, "field") if f.index is not None: field_elem.attrib['index'] = f.index diff --git a/tests/__init__.py b/tests/__init__.py index e20f445..ae62bf9 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,7 +5,7 @@ import csv from dwcahandler import Eml from xml.dom.minidom import parseString -from dwcahandler import MetaDwCA +from dwcahandler import MetaDwCA, MetaDefaultFields def get_eml_content(): @@ -25,7 +25,7 @@ def 
make_fields(columns: list, term_uri: str, field_start: int = 0, core_id: str idx_start = field_start if field_start != -2 else 0 for idx, col in enumerate(columns): - if not (col in ["id", "coreid"]): + if not (col in list(MetaDefaultFields)): dwc_term_uri = "http://rs.tdwg.org/dwc/terms" if col == 'occurrenceID' else term_uri fields = fields + '\n' + f'' From a3c9b7d469f5eec256438b9fac5c9396333566f2 Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Thu, 20 Mar 2025 13:57:13 +1100 Subject: [PATCH 19/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - refactor code --- src/dwcahandler/dwca/dwca_factory.py | 68 ++++++++++++---------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 9fbdb35..54bbb59 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -27,11 +27,17 @@ def list_class_rowtypes() : print(f"{name}: {member.value}") @staticmethod - def get_contents_from_file_names(files: list) -> (dict[MetaElementTypes, str], dict[MetaElementTypes, str]): + def get_contents_from_file_names(files: list, csv_encoding: CSVEncoding, + content_keys: dict[MetaElementTypes, list] = None, zf: ZipFile = None) \ + -> (ContentData, list[ContentData]): """Find the core content and extension contents from a list of file paths. Core content will always be event if present, otherwise, occurrence content :param files: list of files + :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied + :param content_keys: optional dictionary of MetaElementTypes and key list + for eg. 
{MetaElementTypes.OCCURRENCE, ["occurrenceID"]} + :param zf: Zipfile pointer if using :return dict of core content type and file name and dict containing ext content type and file name """ def derive_type(file_list: list) -> dict[str, MetaElementTypes]: @@ -51,9 +57,23 @@ def derive_type(file_list: list) -> dict[str, MetaElementTypes]: core_filename = next(iter(core_file)) core_type = core_file[core_filename] ext_files = {k: v for k, v in contents.items() if v != core_type} - return core_file, ext_files - return None + core_data = [core_filename] if not zf else io.TextIOWrapper(zf.open(core_filename), encoding="utf-8") + core_content = ContentData(data=core_data, + type=core_type, csv_encoding=csv_encoding, + keys=get_keys(class_type=core_type, + override_content_keys=content_keys)) + ext_content = [] + for ext_file, ext_type in ext_files.items(): + ext_data = [ext_file] if not zf else io.TextIOWrapper(zf.open(ext_file), encoding="utf-8") + ext_content.append(ContentData(data=ext_data, + type=ext_type, csv_encoding=csv_encoding, + keys=get_keys(class_type=ext_type, + override_content_keys=content_keys))) + return core_content, ext_content + else: + raise ValueError("The core content cannot be determined. Please check filenames against the class type. " + "Use list_class_rowtypes to print the class types. ") """Perform various DwCA operations""" @staticmethod @@ -71,23 +91,9 @@ def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO], :param content_keys: optional dictionary of MetaElementTypes and key list for eg. 
{MetaElementTypes.OCCURRENCE, ["occurrenceID"]} """ - core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files) - if core_content: - core_filename = next(iter(core_content)) - core_type = core_content[core_filename] - - core_content = ContentData(data=[core_filename], type=core_type, csv_encoding=csv_encoding, - keys=get_keys(class_type=core_type, override_content_keys=content_keys)) - ext_content = [] - for ext_file, ext_type in ext_content_list.items(): - ext_content.append(ContentData(data=[ext_file], - type=ext_type, csv_encoding=csv_encoding, - keys=get_keys(class_type=ext_type, - override_content_keys=content_keys))) - DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, - eml_content=eml_content) - else: - raise ValueError("The core content cannot be determined. Please check filename in zip file") + core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files=files, csv_encoding=csv_encoding) + DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content_list, output_dwca=output_dwca, + eml_content=eml_content) @staticmethod def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO], @@ -107,24 +113,10 @@ def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO] """ with ZipFile(zip_file, 'r') as zf: files = zf.namelist() - core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files) - if core_content: - core_filename = next(iter(core_content)) - core_type = core_content[core_filename] - core_content = ContentData(data=io.TextIOWrapper(zf.open(core_filename), encoding="utf-8"), - type=core_type, csv_encoding=csv_encoding, - keys=get_keys(class_type=core_type, - override_content_keys=content_keys)) - ext_content = [] - for ext_file, ext_type in ext_content_list.items(): - ext_content.append(ContentData(data=io.TextIOWrapper(zf.open(ext_file), encoding="utf-8"), - type=ext_type, 
csv_encoding=csv_encoding, - keys=get_keys(class_type=ext_type, - override_content_keys=content_keys))) - DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca, - eml_content=eml_content) - else: - raise ValueError("The core content cannot be determined. Please check filename in zip file") + core_content, ext_content_list = DwcaHandler.get_contents_from_file_names(files=files, csv_encoding=csv_encoding, zf=zf) + DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content_list, output_dwca=output_dwca, + eml_content=eml_content) + zf.close() @staticmethod def create_dwca(core_csv: ContentData, From e0d8eed0122450a04d4531916a125823027994e6 Mon Sep 17 00:00:00 2001 From: Mahmoud Date: Thu, 20 Mar 2025 16:26:24 +1100 Subject: [PATCH 20/23] refactor types from namedTuple to class --- src/dwcahandler/dwca/__init__.py | 36 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index a59380b..390255f 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -19,22 +19,22 @@ from __future__ import annotations import io +import logging from collections import namedtuple from dataclasses import dataclass, field +from functools import wraps from typing import Optional, Union -import logging + import pandas as pd -from functools import wraps -CoreOrExtType = namedtuple("CoreOrExtType", ["CORE", "EXTENSION"])( - CORE="core", - EXTENSION="extension" -) -MetaDefaultFields = namedtuple("MetaDefaultFields", ["ID", "CORE_ID"])( - ID="id", - CORE_ID="coreid" -) +class CoreOrExtType(Enum): + CORE = "core" + EXTENSION = "extension" +@dataclass(frozen=True) +class MetaDefaultFields: + ID: str = "id" + CORE_ID: str = "coreid" # Default keys for content when creating dwca DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])( @@ -185,10 +185,13 @@ class Defaults: default_factory=lambda: 
{'LF': '\r\n', '\\t': '\t', '\\n': '\n'}) +from dwcahandler.dwca.dwca_meta import (MetaDwCA, MetaElementAttributes, + MetaElementInfo, MetaElementTypes, + get_meta_class_row_type) # Imports at end of file to allow classes to be used -from dwcahandler.dwca.terms import Terms, NsPrefix -from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA, - MetaElementAttributes, get_meta_class_row_type) +from dwcahandler.dwca.terms import NsPrefix, Terms + + @dataclass class ContentData: """A class describing the content data used for core and extension. @@ -223,8 +226,9 @@ def add_data(self, other_csv_file_type: ContentData): self.type = other_csv_file_type.type return False -from dwcahandler.dwca.eml import Eml +from enum import Enum + from dwcahandler.dwca.base_dwca import BaseDwca -from dwcahandler.dwca.core_dwca import Dwca, DfContent +from dwcahandler.dwca.core_dwca import DfContent, Dwca from dwcahandler.dwca.dwca_factory import DwcaHandler - +from dwcahandler.dwca.eml import Eml From 2abb2337938a96a8cefef863e163f3a438840f19 Mon Sep 17 00:00:00 2001 From: Mahmoud Date: Thu, 20 Mar 2025 16:37:55 +1100 Subject: [PATCH 21/23] fix an import package --- src/dwcahandler/dwca/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index 390255f..14843c4 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -22,6 +22,7 @@ import logging from collections import namedtuple from dataclasses import dataclass, field +from enum import Enum from functools import wraps from typing import Optional, Union @@ -226,7 +227,6 @@ def add_data(self, other_csv_file_type: ContentData): self.type = other_csv_file_type.type return False -from enum import Enum from dwcahandler.dwca.base_dwca import BaseDwca from dwcahandler.dwca.core_dwca import DfContent, Dwca From 9d9daca643122893ce35ae830450e344a5f2340b Mon Sep 17 00:00:00 2001 From: Mahmoud Date: 
Thu, 20 Mar 2025 17:00:12 +1100 Subject: [PATCH 22/23] fixed recursive import --- src/dwcahandler/dwca/__init__.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index 14843c4..ffceabc 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -186,13 +186,6 @@ class Defaults: default_factory=lambda: {'LF': '\r\n', '\\t': '\t', '\\n': '\n'}) -from dwcahandler.dwca.dwca_meta import (MetaDwCA, MetaElementAttributes, - MetaElementInfo, MetaElementTypes, - get_meta_class_row_type) -# Imports at end of file to allow classes to be used -from dwcahandler.dwca.terms import NsPrefix, Terms - - @dataclass class ContentData: """A class describing the content data used for core and extension. @@ -231,4 +224,9 @@ def add_data(self, other_csv_file_type: ContentData): from dwcahandler.dwca.base_dwca import BaseDwca from dwcahandler.dwca.core_dwca import DfContent, Dwca from dwcahandler.dwca.dwca_factory import DwcaHandler +from dwcahandler.dwca.dwca_meta import (MetaDwCA, MetaElementAttributes, + MetaElementInfo, MetaElementTypes, + get_meta_class_row_type) from dwcahandler.dwca.eml import Eml +# Imports at end of file to allow classes to be used +from dwcahandler.dwca.terms import NsPrefix, Terms From 8f8172b2ec80794973e7e90103c05c10111df09f Mon Sep 17 00:00:00 2001 From: Patricia Koh Date: Fri, 21 Mar 2025 11:22:33 +1100 Subject: [PATCH 23/23] https://github.com/AtlasOfLivingAustralia/dwcahandler/issues/20 - change CoreOrExtType to enum for type hinting (review feedback) --- src/dwcahandler/dwca/__init__.py | 26 +++++++++---------- src/dwcahandler/dwca/core_dwca.py | 42 ++++++++++++++++--------------- src/dwcahandler/dwca/dwca_meta.py | 24 +++++++++--------- src/dwcahandler/dwca/terms.py | 10 +++++--- tests/__init__.py | 4 +-- tests/test_write_dwca.py | 8 +++--- 6 files changed, 59 insertions(+), 55 deletions(-) diff --git 
a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index ffceabc..ea0cbe6 100644 --- a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -25,17 +25,13 @@ from enum import Enum from functools import wraps from typing import Optional, Union - +import logging import pandas as pd class CoreOrExtType(Enum): CORE = "core" EXTENSION = "extension" -@dataclass(frozen=True) -class MetaDefaultFields: - ID: str = "id" - CORE_ID: str = "coreid" # Default keys for content when creating dwca DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])( @@ -184,8 +180,17 @@ class Defaults: # Translation csv encoding values translate_table: dict = field(init=False, default_factory=lambda: {'LF': '\r\n', '\\t': '\t', '\\n': '\n'}) + MetaDefaultFields: namedtuple = namedtuple("MetaDefaultFields", ["ID", "CORE_ID"])( + ID="id", + CORE_ID="coreid" + ) + +# Imports at end of file to allow classes to be used +from dwcahandler.dwca.terms import Terms, NsPrefix +from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA, + MetaElementAttributes, get_meta_class_row_type) @dataclass class ContentData: """A class describing the content data used for core and extension. 
@@ -220,13 +225,8 @@ def add_data(self, other_csv_file_type: ContentData): self.type = other_csv_file_type.type return False - +from dwcahandler.dwca.eml import Eml from dwcahandler.dwca.base_dwca import BaseDwca -from dwcahandler.dwca.core_dwca import DfContent, Dwca +from dwcahandler.dwca.core_dwca import Dwca, DfContent from dwcahandler.dwca.dwca_factory import DwcaHandler -from dwcahandler.dwca.dwca_meta import (MetaDwCA, MetaElementAttributes, - MetaElementInfo, MetaElementTypes, - get_meta_class_row_type) -from dwcahandler.dwca.eml import Eml -# Imports at end of file to allow classes to be used -from dwcahandler.dwca.terms import NsPrefix, Terms + diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py index ec98d41..e9f6eec 100644 --- a/src/dwcahandler/dwca/core_dwca.py +++ b/src/dwcahandler/dwca/core_dwca.py @@ -21,7 +21,7 @@ from numpy import nan from pandas.errors import EmptyDataError from pandas.io import parsers -from dwcahandler.dwca import (BaseDwca, CoreOrExtType, MetaDefaultFields, CSVEncoding, +from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding, ContentData, Defaults, Eml, Terms, get_keys, MetaDwCA, MetaElementInfo, MetaElementTypes, MetaElementAttributes, Stat, record_diff_stat) @@ -93,9 +93,9 @@ def _update_core_ids(self, core_df) -> str: :param core_df: The data frame to generate identifiers for return id field """ - if MetaDefaultFields.ID not in core_df.columns.to_list(): - core_df.insert(0, MetaDefaultFields.ID, core_df.apply(lambda _: uuid.uuid4(), axis=1), False) - return MetaDefaultFields.ID + if self.defaults_prop.MetaDefaultFields.ID not in core_df.columns.to_list(): + core_df.insert(0, self.defaults_prop.MetaDefaultFields.ID, core_df.apply(lambda _: uuid.uuid4(), axis=1), False) + return self.defaults_prop.MetaDefaultFields.ID else: raise ValueError("core df should not contain id column") @@ -140,23 +140,23 @@ def _update_extension_ids(self, csv_content: pd.DataFrame, core_df_content: pd.D 
set(link_col).issubset(set(csv_content.index.names))): csv_content.reset_index(inplace=True, drop=True) - csv_content = csv_content.merge(core_df_content.loc[:, MetaDefaultFields.ID], + csv_content = csv_content.merge(core_df_content.loc[:, self.defaults_prop.MetaDefaultFields.ID], left_on=link_col, right_on=link_col, how='outer') - if MetaDefaultFields.ID in csv_content.columns.to_list(): - unmatched_content = csv_content[csv_content[MetaDefaultFields.ID].isnull()] - unmatched_content = unmatched_content.drop(columns=[MetaDefaultFields.ID]) + if self.defaults_prop.MetaDefaultFields.ID in csv_content.columns.to_list(): + unmatched_content = csv_content[csv_content[self.defaults_prop.MetaDefaultFields.ID].isnull()] + unmatched_content = unmatched_content.drop(columns=[self.defaults_prop.MetaDefaultFields.ID]) if len(unmatched_content) > 0: log.info("There are orphaned keys in extension file") pd.set_option("display.max_columns", 7) pd.set_option('display.max_colwidth', 15) pd.set_option('display.max_rows', 10) log.info("\n%s", unmatched_content) - csv_content = csv_content[~csv_content[MetaDefaultFields.ID].isnull()] - col = csv_content.pop(MetaDefaultFields.ID) + csv_content = csv_content[~csv_content[self.defaults_prop.MetaDefaultFields.ID].isnull()] + col = csv_content.pop(self.defaults_prop.MetaDefaultFields.ID) csv_content.insert(0, col.name, col) - csv_content.rename(columns={MetaDefaultFields.ID: ext_core_id_field}, inplace=True) + csv_content.rename(columns={self.defaults_prop.MetaDefaultFields.ID: ext_core_id_field}, inplace=True) return csv_content, ext_core_id_field else: raise ValueError("Something is not right. 
The core id failed to be created") @@ -207,8 +207,9 @@ def _find_fields_with_zero_idx(meta_element_fields: list): def _add_first_id_field_if_exists(meta_element: MetaElementAttributes): zero_index_exist = _find_fields_with_zero_idx(meta_element.fields) if meta_element.core_id and meta_element.core_id.index and not zero_index_exist: - return [MetaDefaultFields.ID] if meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE \ - else [MetaDefaultFields.CORE_ID] + return [self.defaults_prop.MetaDefaultFields.ID] if ( + meta_element.meta_element_type.core_or_ext_type == CoreOrExtType.CORE) \ + else [self.defaults_prop.MetaDefaultFields.CORE_ID] else: return [] @@ -287,7 +288,7 @@ def _update_values(self, df_content, delta_df_content, keys, stat): :return: The updated content """ # Extract columns that need updating, excluding self.keys and id - non_update_column = list(MetaDefaultFields) + non_update_column = list(self.defaults_prop.MetaDefaultFields) non_update_column.extend(keys) update_columns = [i for i in delta_df_content.columns.to_list() if i not in non_update_column] @@ -429,12 +430,13 @@ def _extract_core_keys(self, core_content, keys): :return: A data frame indexed by the `id` column that contains the key elements for each record """ - columns = [MetaDefaultFields.ID] if MetaDefaultFields.ID in core_content.columns.tolist() else [] + columns = [self.defaults_prop.MetaDefaultFields.ID] \ + if self.defaults_prop.MetaDefaultFields.ID in core_content.columns.tolist() else [] if all(key in core_content.columns for key in keys): columns.extend(keys) df = core_content[columns] - if MetaDefaultFields.ID in core_content.columns.tolist(): - df.set_index(MetaDefaultFields.ID, drop=True, inplace=True) + if self.defaults_prop.MetaDefaultFields.ID in core_content.columns.tolist(): + df.set_index(self.defaults_prop.MetaDefaultFields.ID, drop=True, inplace=True) else: raise ValueError(f"Keys does not exist in core content {''.join(keys)}") return df @@ -869,17 
+871,17 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N if not self.check_duplicates(keys_df, content.keys, error_file): log.error("Validation failed for %s %s content for duplicates keys %s", - content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + content.meta_info.core_or_ext_type.value, content.meta_info.type, content.keys) validation_content_success = False if not self._validate_columns(content): log.error("Validation failed for %s %s content for duplicate columns", - content.meta_info.core_or_ext_type, content.meta_info.type) + content.meta_info.core_or_ext_type.value, content.meta_info.type) validation_content_success = False if validation_content_success: log.info("Validation successful for %s %s content for unique keys %s", - content.meta_info.core_or_ext_type, content.meta_info.type, content.keys) + content.meta_info.core_or_ext_type.value, content.meta_info.type, content.keys) else: validation_success = False diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 9465991..52ed5c3 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -11,7 +11,7 @@ import re from dataclasses import dataclass, field from typing import Optional -from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms, MetaDefaultFields +from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms, Defaults from enum import Enum @@ -72,7 +72,7 @@ class MetaElementAttributes: @dataclass class MetaDwCA: """Complete Metadata for a DwCA including dataset metadata and schema information""" - eml_xml_filename: str = field(default='eml.xml') + eml_xml_filename: str = field(default=Defaults.eml_xml_filename) dwca_meta: ET.Element = field(init=False) meta_elements: list[MetaElementAttributes] = field(default_factory=list, init=False) @@ -89,10 +89,10 @@ def extract_field_attr_value(field_elm, attrib): fields = node_elm.findall(f'{ns}field') id_field = [] - if 
core_or_ext_type == 'core': - id_field = node_elm.findall(f'{ns}id') + if core_or_ext_type == CoreOrExtType.CORE: + id_field = node_elm.findall(f'{ns}{Defaults.MetaDefaultFields.ID}') else: - id_field = node_elm.findall(f'{ns}coreid') + id_field = node_elm.findall(f'{ns}{Defaults.MetaDefaultFields.CORE_ID}') file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text meta_element_info = MetaElementInfo( core_or_ext_type=core_or_ext_type, @@ -138,11 +138,11 @@ def read_meta_file(self, meta_file): tree = ET.parse(meta_file) root = tree.getroot() ns = self._get_namespace(root) - node_elm = root.find(f'{ns}{CoreOrExtType.CORE}') + node_elm = root.find(f"{ns}{CoreOrExtType.CORE.value}") self.meta_elements = [self.__extract_meta_info(ns, node_elm, CoreOrExtType.CORE)] self.meta_elements.extend( [self.__extract_meta_info(ns, ne, CoreOrExtType.EXTENSION) - for ne in root.findall(f'{ns}{CoreOrExtType.EXTENSION}')]) + for ne in root.findall(f"{ns}{CoreOrExtType.EXTENSION.value}")]) def remove_meta_elements(self, exts_to_remove): """Remove extension files from the metadata @@ -211,7 +211,7 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): :param meta_elem_attrib: The meta information for the row """ - elem = ET.SubElement(self.dwca_meta, meta_elem_attrib.meta_element_type.core_or_ext_type) + elem = ET.SubElement(self.dwca_meta, meta_elem_attrib.meta_element_type.core_or_ext_type.value) elem.attrib['encoding'] = meta_elem_attrib.meta_element_type.charset_encoding elem.attrib['rowType'] = meta_elem_attrib.meta_element_type.type.value elem.attrib['fieldsTerminatedBy'] = meta_elem_attrib.meta_element_type.csv_encoding.csv_delimiter @@ -225,13 +225,13 @@ def _build_meta_xml(self, meta_elem_attrib: MetaElementAttributes): location = ET.SubElement(files, 'location') location.text = meta_elem_attrib.meta_element_type.file_name if meta_elem_attrib.core_id: - id_field = ET.SubElement(elem, MetaDefaultFields.ID) \ - if 
meta_elem_attrib.meta_element_type.core_or_ext_type == 'core' \ - else ET.SubElement(elem, MetaDefaultFields.CORE_ID) + id_field = ET.SubElement(elem, Defaults.MetaDefaultFields.ID) \ + if meta_elem_attrib.meta_element_type.core_or_ext_type == CoreOrExtType.CORE \ + else ET.SubElement(elem, Defaults.MetaDefaultFields.CORE_ID) id_field.attrib['index'] = meta_elem_attrib.core_id.index for _, f in enumerate(meta_elem_attrib.fields): - if f.field_name not in list(MetaDefaultFields): + if f.field_name not in list(Defaults.MetaDefaultFields): field_elem = ET.SubElement(elem, "field") if f.index is not None: field_elem.attrib['index'] = f.index diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py index b106d6e..c08a8bd 100644 --- a/src/dwcahandler/dwca/terms.py +++ b/src/dwcahandler/dwca/terms.py @@ -16,7 +16,7 @@ def absolute_file_paths(directory): - """Convert files in a directory into absolute paths and return + """Convert data in a directory into absolute paths and return as a generator :param directory: The directory to scan. 
@@ -71,7 +71,7 @@ class Terms: GBIF_EXT = "https://rs.gbif.org/extensions.json" - GBIF_REGISTERED_EXTENSION = [e for e in GbifRegisteredExt] + GBIF_REGISTERED_EXTENSION = pd.DataFrame(columns=["prefix", "identifier", "namespace", "issued_date"])#[e for e in GbifRegisteredExt] DWC_SOURCE_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv" @@ -267,7 +267,7 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame log.info("Current class and terms") - exclude_update_prefixes = [NsPrefix.DC.value] + exclude_update_prefixes = [NsPrefix.DC.value, NsPrefix.DWC.value] terms = Terms() print(terms.class_df.groupby(["prefix"]).agg( class_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") @@ -277,7 +277,7 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame )) terms.class_df = terms.class_df[terms.class_df.prefix.isin(exclude_update_prefixes)] terms.terms_df = terms.terms_df[terms.terms_df.prefix.isin(exclude_update_prefixes)] - terms.update_dwc_terms() + #terms.update_dwc_terms() terms.update_gbif_ext() terms.class_df = __sort_values(terms.class_df, "class") terms.terms_df = __sort_values(terms.terms_df, "term") @@ -291,3 +291,5 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame term_prefix_count=pd.NamedAgg(column="prefix", aggfunc="count") )) return terms.terms_df, terms.class_df + +#Terms.update_terms() \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py index ae62bf9..5f89f53 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,7 +5,7 @@ import csv from dwcahandler import Eml from xml.dom.minidom import parseString -from dwcahandler import MetaDwCA, MetaDefaultFields +from dwcahandler import MetaDwCA, Defaults def get_eml_content(): @@ -25,7 +25,7 @@ def make_fields(columns: list, term_uri: str, field_start: int = 0, core_id: str idx_start = field_start if field_start != -2 else 0 for idx, col in 
enumerate(columns): - if not (col in list(MetaDefaultFields)): + if not (col in list(Defaults.MetaDefaultFields)): dwc_term_uri = "http://rs.tdwg.org/dwc/terms" if col == 'occurrenceID' else term_uri fields = fields + '\n' + f'' diff --git a/tests/test_write_dwca.py b/tests/test_write_dwca.py index 6920fc0..5898452 100644 --- a/tests/test_write_dwca.py +++ b/tests/test_write_dwca.py @@ -51,7 +51,7 @@ def test_generate_dwca_without_ext(self): root = tree.getroot() ns = _get_namespace(root) assert ns == "{http://rs.tdwg.org/dwc/text/}" - core_node = root.find(f'{ns}{CoreOrExtType.CORE}') + core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}') assert core_node is not None fields = core_node.findall(f'{ns}field') term_fields = [f.attrib.get('term') for f in fields] @@ -90,7 +90,7 @@ def test_generate_dwca_with_ext(self): root = tree.getroot() ns = _get_namespace(root) assert ns == "{http://rs.tdwg.org/dwc/text/}" - core_node = root.find(f'{ns}{CoreOrExtType.CORE}') + core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}') assert core_node is not None fields = core_node.findall(f'{ns}field') term_fields = [f.attrib.get('term') for f in fields] @@ -99,7 +99,7 @@ def test_generate_dwca_with_ext(self): assert any(sample_col in f for f in term_fields) core_file = core_node.find(f'{ns}files').find(f'{ns}location').text - ext_node = root.find(f'{ns}{CoreOrExtType.EXTENSION}') + ext_node = root.find(f'{ns}{CoreOrExtType.EXTENSION.value}') assert ext_node is not None fields = ext_node.findall(f'{ns}field') term_fields = [f.attrib.get('term') for f in fields] @@ -150,7 +150,7 @@ def test_generate_dwca_in_memory(self): root = tree.getroot() ns = _get_namespace(root) assert ns == "{http://rs.tdwg.org/dwc/text/}" - core_node = root.find(f'{ns}{CoreOrExtType.CORE}') + core_node = root.find(f'{ns}{CoreOrExtType.CORE.value}') assert core_node is not None fields = core_node.findall(f'{ns}field') term_fields = [f.attrib.get('term') for f in fields]