#20 - Provide convenient helper methods for creating dwca

patkyn · patkyn · commit 533c6d0ca0c0 · 2025-03-18T11:20:07.000+11:00
diff --git a/README.md b/README.md
@@ -9,9 +9,6 @@ ALA receive different forms of data from various data providers in the form of C
 
 The operations provided by dwcahandler includes creating a dwca from csv/text file, merge 2 dwcas, delete records in dwca and perform core key validations like testing duplicates of one or more keys, empty and duplicate keys.  
 
-The module uses and maintain the standard dwc terms from a point in time versioned copy of https://dwc.tdwg.org/terms/ and extensions like https://rs.gbif.org/extension/gbif/1.0/multimedia.xml. 
-
-
 ### Technologies
 
 This package is developed in Python. Tested with Python 3.12, 3.11, 3.10 and 3.9
@@ -76,6 +73,8 @@ print(df_terms, df_class)
 * Listed in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv)
 * Used in MetaElementTypes class enum name:
 ```python 
+from dwcahandler import MetaElementTypes
+
 MetaElementTypes.OCCURRENCE
 MetaElementTypes.MULTIMEDIA
 ```
@@ -137,7 +136,58 @@ eml = Eml(dataset_name='Test Dataset',
           rights="test rights")
 
 DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip')
+```
+&nbsp;
+* Create a Darwin Core Archive from csv files in a zip file.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
+* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding as csv_encoding.
+```python
+from dwcahandler import DwcaHandler
+from dwcahandler import Eml
+
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
+
+DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip",  eml_content=eml, output_dwca='/tmp/dwca.zip')
+```
+&nbsp;
+* Convenient helper function to create a Darwin Core Archive from csv files in a zip file.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
+* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding as csv_encoding.
+```python
+from dwcahandler import DwcaHandler
+from dwcahandler import Eml
+
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
+
+DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip",  eml_content=eml, output_dwca='/tmp/dwca.zip')
+```
+&nbsp;
+* Convenient helper function to create a Darwin Core Archive from a list of csv files.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
+* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding as csv_encoding.
+```python
+from dwcahandler import DwcaHandler
+from dwcahandler import Eml
+
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
 
+DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.txt", "/tmp/occurrence.txt", "/tmp/measurement_or_fact.txt"], 
+                                       eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
 * Merge Darwin Core Archive
diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py
@@ -18,6 +18,7 @@
 """
 from __future__ import annotations
 
+import io
 from collections import namedtuple
 from dataclasses import dataclass, field
 from typing import Optional, Union
@@ -30,6 +31,25 @@
     EXTENSION="extension"
 )
 
+# Default key column per content type, used when the caller supplies no keys while creating a dwca
+DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])(
+    EVENT="eventID",
+    OCCURRENCE="occurrenceID"
+)
+
+def get_keys(type: MetaElementTypes, override_content_keys: Optional[dict[MetaElementTypes, list]] = None):
+    """
+    Return the key columns for a content type, preferring caller supplied keys over the defaults.
+
+    :param type: type of content (the name shadows the builtin but is kept for keyword callers)
+    :param override_content_keys: optional mapping of content type to its list of keys
+    :return: the non-empty override keys for the type if given, else the default key in a list,
+             or an empty list when the type has no default key
+    """
+    keys = override_content_keys.get(type) if override_content_keys else None
+    if keys:
+        return keys
+    return [getattr(DefaultKeys, type.name)] if type.name in DefaultKeys._fields else []
 
 @dataclass
 class CSVEncoding:
@@ -168,7 +188,7 @@ class Defaults:
 class CsvFileType:
     """A description of a CSV file in a DwCA
     """
-    files: Union[list[str], pd.DataFrame]  # can accept more than one file or a dataframe
+    files: Union[list[str], pd.DataFrame, io.TextIOWrapper]  # can accept more than one file or a dataframe
     type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,...
     keys: Optional[list] = None  # must be supplied for csv extensions to link extension records to core record
     # when creating dwca. for core other than occurrence, this neeeds to be supplied as key.
diff --git a/src/dwcahandler/dwca/base_dwca.py b/src/dwcahandler/dwca/base_dwca.py
@@ -127,7 +127,7 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un
 
     def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
                     ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
-                    eml_content: Union[str, Eml] = '', additional_validation_on_content: list[CsvFileType] = None):
+                    eml_content: Union[str, Eml] = ''):
         """Create a dwca given the contents of core and extensions and eml content
 
         :param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca
@@ -136,7 +136,6 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
                               extensions of the dwca if supplied
         :param validate_content: whether to validate the contents
         :param eml_content: eml content in string or a filled Eml object
-        :param additional_validation_on_content: additional validation to perform
         """
         if ext_csv_list is None:
             ext_csv_list = []
@@ -150,13 +149,16 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
             if image_ext:
                 ext_csv_list.append(image_ext)
 
+        content_to_validate = {}
         for ext in ext_csv_list:
+            if ext.keys and len(ext.keys) > 0:
+                content_to_validate[ext.type] = ext.keys
             self.extract_csv_content(csv_info=ext, core_ext_type=CoreOrExtType.EXTENSION,
                                      build_coreid_for_ext=True)
 
         self.fill_additional_info()
 
-        if validate_content and not self.validate_content(additional_validation_on_content):
+        if validate_content and not self.validate_content(content_to_validate):
             raise SystemExit(Exception("Some validations error found. Dwca is not created."))
 
         self.generate_eml(eml_content)
@@ -191,7 +193,7 @@ def validate_dwca(self, content_keys: dict, error_file: str):
            If additional checks required in another content, supply it as content_keys
 
         :param content_keys: a dictionary of class type and the key
-                             for eg. {MetaElementTypes.OCCURRENCE, "occurrenceId"}
+                             for eg. {MetaElementTypes.OCCURRENCE: "occurrenceID"}
         :param error_file: optional error_file for the errored data
         """
         self.extract_dwca()
diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
@@ -22,7 +22,7 @@
 from pandas.errors import EmptyDataError
 from pandas.io import parsers
 from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding,
-                              CsvFileType, Defaults, Eml, Terms,
+                              CsvFileType, Defaults, Eml, Terms, get_keys,
                               MetaDwCA, MetaElementInfo, MetaElementTypes,
                               MetaElementAttributes, Stat, record_diff_stat)
 
@@ -350,6 +350,7 @@ def set_keys(self, keys: dict = None):
                     key_list = [v] if isinstance(v, str) else v
                     col_term = []
                     for a_key in key_list:
+                        # a_key may be in url form, for eg: http://rs.gbif.org/terms/1.0/gbifID, so extract its term
                         if a_key not in dwca_content.df_content.columns.tolist():
                             col_term.append(Terms.extract_term(a_key))
                         else:
@@ -893,31 +894,17 @@ def extract_csv_content(self, csv_info: CsvFileType,
         :param core_ext_type: Whether this is a core or extension content frame
         :param build_coreid_for_ext: indicator to build id and core id to support dwca with extension
         """
-        def __get_default_core_key(core_sv_info: CsvFileType):
-            """Look for a column in a CSV file
-
-            :param core_sv_info: The CSV file
-            :return: default key if csv_info.keys not provided.
-                     Default key is eventID for EVENT type and occurrenceID for occurrence type
-            """
-            if not core_sv_info.keys or len(core_sv_info.keys) == 0:
-                if core_sv_info.type == MetaElementTypes.EVENT:
-                    return ["eventID"]
-                elif core_sv_info.type == MetaElementTypes.OCCURRENCE:
-                    return ["occurrenceID"]
-                else:
-                    raise ValueError("Keys need to be set for core content")
-            elif len(core_sv_info.keys) > 0:
-                return core_sv_info.keys
-
-        if isinstance(csv_info.files, pd.DataFrame):
+        if isinstance(csv_info.files, pd.DataFrame) :
             csv_content = csv_info.files
+        elif isinstance(csv_info.files, io.TextIOWrapper):
+            csv_content = self._read_csv(csv_info.files)
         else:
             csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding)
 
-        # Use default occurrenceID if not provided
+        # Use default keys if not provided
         if core_ext_type == CoreOrExtType.CORE:
-            keys = __get_default_core_key(csv_info)
+            override_keys = {csv_info.type: csv_info.keys} if csv_info.keys and len(csv_info.keys) > 0 else None
+            keys = get_keys(type=csv_info.type, override_content_keys=override_keys)
         else:
             keys = self.core_content.keys
         core_id_field: str = ""
@@ -945,6 +932,7 @@ def __get_default_core_key(core_sv_info: CsvFileType):
             content.keys = keys
             self.core_content = content
         else:
+            content.keys = csv_info.keys
             self.ext_content.append(content)
 
     def _to_csv(self, df: pd.DataFrame, meta_info: MetaElementInfo,
diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py
@@ -2,12 +2,14 @@
 Module contains factory class for Dwca. This is used to decide the type of darwin core class to perform the operation.
 
 """
-
+import io
 import logging
 from typing import Union
 import pandas as pd
-from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes
+from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes, CSVEncoding, get_keys
 from io import BytesIO
+from pathlib import Path
+from zipfile import ZipFile
 
 logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
 log = logging.getLogger("DwcaFactoryManager")
@@ -24,7 +26,104 @@ def list_class_rowtypes() :
         for name, member in MetaElementTypes.__members__.items():
             print(f"{name}: {member.value}")
 
+    @staticmethod
+    def get_contents_from_file_names(files: list) -> tuple[dict[str, MetaElementTypes], dict[str, MetaElementTypes]]:
+        """Find the core content and extension contents from a list of file paths.
+        Core content will always be event if present, otherwise, occurrence content.
+        A file is recognised when its upper-cased name stem equals a MetaElementTypes member name;
+        other files are ignored.
+
+        :param files: list of file paths or names
+        :return: tuple of (core files, extension files), each a dict of file name to MetaElementTypes.
+                 Both dicts are empty when no event or occurrence file is found,
+                 so callers can always unpack the result.
+        """
+        known_types = MetaElementTypes.__members__
+
+        def derive_type(file_list: list) -> dict[str, MetaElementTypes]:
+            file_types = {}
+            for file in file_list:
+                if (filename := Path(file).stem.upper()) in known_types:
+                    file_types[file] = known_types[filename]
+            return file_types
+
+        contents = derive_type(files)
+
+        # Event takes precedence over occurrence as the core
+        for core_type in (MetaElementTypes.EVENT, MetaElementTypes.OCCURRENCE):
+            core_file = {k: v for k, v in contents.items() if v == core_type}
+            if core_file:
+                ext_files = {k: v for k, v in contents.items() if v != core_type}
+                return core_file, ext_files
+
+        # Returning a bare None here would break callers that unpack two values
+        return {}, {}
+
     """Perform various DwCA operations"""
+    @staticmethod
+    def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO],
+                                   eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(),
+                                   content_keys: dict[MetaElementTypes, list] = None):
+        """Create a suitable DwCA from a list of CSV files
+
+        :param files: list of paths to the csv/txt files; class row types are derived from the file names
+        :param output_dwca: Where to place the resulting Dwca
+        :param eml_content: eml content in string or Eml class
+        :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied
+        :param content_keys: optional dictionary of MetaElementTypes and key list
+                                      for eg. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
+        :raises ValueError: if no event or occurrence file is found to be used as the core
+        """
+        # The lookup may hand back None instead of a pair, so guard before unpacking
+        contents = DwcaHandler.get_contents_from_file_names(files)
+        core_files, ext_files = contents if contents else ({}, {})
+        if not core_files:
+            raise ValueError("The core content cannot be determined. Please check the file names in the file list")
+
+        def to_csv_file_type(file_name: str, content_type: MetaElementTypes) -> CsvFileType:
+            return CsvFileType(files=[file_name], type=content_type, csv_encoding=csv_encoding,
+                               keys=get_keys(type=content_type, override_content_keys=content_keys))
+
+        # Only the first matching core file is used as the core content
+        core_filename = next(iter(core_files))
+        core_content = to_csv_file_type(core_filename, core_files[core_filename])
+        ext_content = [to_csv_file_type(ext_file, ext_type) for ext_file, ext_type in ext_files.items()]
+        DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
+                                eml_content=eml_content)
+
+    @staticmethod
+    def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO],
+                                     eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(),
+                                     content_keys: dict[MetaElementTypes, list] = None):
+        """Create a suitable DwCA from the csv/txt files contained in a zip file
+
+        :param zip_file: Zip file containing txt files
+        :param output_dwca: Where to place the resulting Dwca
+        :param eml_content: eml content in string or Eml class
+        :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied
+        :param content_keys: optional dictionary of class type and the key
+                             for eg. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
+        :raises ValueError: if no event or occurrence file is found to be used as the core
+        """
+        from contextlib import ExitStack
+        # ExitStack closes every text stream opened on the zip members once the dwca is created
+        with ZipFile(zip_file, 'r') as zf, ExitStack() as streams:
+            # The lookup may hand back None instead of a pair, so guard before unpacking
+            contents = DwcaHandler.get_contents_from_file_names(zf.namelist())
+            core_files, ext_files = contents if contents else ({}, {})
+            if not core_files:
+                raise ValueError("The core content cannot be determined. Please check filename in zip file")
+
+            def to_csv_file_type(member: str, content_type: MetaElementTypes) -> CsvFileType:
+                stream = streams.enter_context(io.TextIOWrapper(zf.open(member), encoding="utf-8"))
+                return CsvFileType(files=stream, type=content_type, csv_encoding=csv_encoding,
+                                   keys=get_keys(type=content_type, override_content_keys=content_keys))
+
+            core_filename = next(iter(core_files))
+            core_content = to_csv_file_type(core_filename, core_files[core_filename])
+            ext_content = [to_csv_file_type(ext_file, ext_type) for ext_file, ext_type in ext_files.items()]
+            DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
+                                    eml_content=eml_content)
 
     @staticmethod
     def create_dwca(core_csv: CsvFileType,
@@ -75,14 +174,15 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
                                                  validate_delta=validate_delta_content)
 
     @staticmethod
-    def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
+    def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, error_file: str = None):
         """Test a dwca for consistency
 
         :param dwca_file: The path to the DwCA
-        :param keys_lookup: The keys that identify a unique record
+        :param content_keys: a dictionary of class type and the key
+                             for eg. {MetaElementTypes.OCCURRENCE: "occurrenceID"}
         :param error_file: The file to write errors to. If None, errors are logged
         """
-        return Dwca(dwca_file_loc=dwca_file).validate_dwca(keys_lookup, error_file)
+        return Dwca(dwca_file_loc=dwca_file).validate_dwca(content_keys, error_file)
 
     @staticmethod
     def validate_file(csv_file: CsvFileType, error_file: str = None):
diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py
@@ -255,7 +255,8 @@ def update_terms():
         def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame:
             """
             Make sure dc and dwc prefixes stay on top
-            :param df: dataframe
+            :param df_to_sort: dataframe to be sorted
+            :param sorting_column: other column to sort
             :return: sorted dataFrame
             """
             df_to_sort = df_to_sort.sort_values(by=["prefix", sorting_column], key=lambda x: x.str.lower())