AtlasOfLivingAustralia · patkyn · Feb 19, 2025 · Feb 19, 2025 · Feb 20, 2025 · Mar 4, 2025
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -2,7 +2,7 @@ name: Run test
 
 on:
   push:
-    branches: [ "main", "develop" ]
+    branches: [ "main", "develop", "feature/events" ]
   pull_request:
     branches: [ "main", "develop" ]
 

diff --git a/README.md b/README.md
@@ -9,9 +9,6 @@ ALA receive different forms of data from various data providers in the form of C
 
 The operations provided by dwcahandler includes creating a dwca from csv/text file, merge 2 dwcas, delete records in dwca and perform core key validations like testing duplicates of one or more keys, empty and duplicate keys.  
 
-The module uses and maintain the standard dwc terms from a point in time versioned copy of https://dwc.tdwg.org/terms/ and extensions like https://rs.gbif.org/extension/gbif/1.0/multimedia.xml. 
-
-
 ### Technologies
 
 This package is developed in Python. Tested with Python 3.12, 3.11, 3.10 and 3.9
@@ -58,26 +55,62 @@ To install published package from testpypi
 pip install -i https://test.pypi.org/simple/ dwcahandler
 ```
 &nbsp;
+### Supported extensions that have been tested in ALA:
+* Standard Darwin Core Terms and Class
+* Simple Multimedia https://rs.gbif.org/extension/gbif/1.0/multimedia.xml
+* Extended Measurement Or Fact http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
+
+#### Terms
+* Terms are listed in [terms.csv](src/dwcahandler/dwca/terms/terms.csv)
+```python
+from dwcahandler import DwcaHandler
+
+df_terms, df_class = DwcaHandler.list_terms()
+print(df_terms, df_class)
+```
+
+#### Class
+* Listed in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv)
+* Each class is referenced by its MetaElementTypes enum name:
+```python 
+from dwcahandler import MetaElementTypes
+
+MetaElementTypes.OCCURRENCE
+MetaElementTypes.MULTIMEDIA
+```
+
+To list all the class rowtypes
+```python
+from dwcahandler import DwcaHandler
+
+DwcaHandler.list_class_rowtypes()
+```
+&nbsp;
 ### Examples of dwcahandler usages:
 
-* Create Darwin Core Archive from csv file
+* Create Darwin Core Archive from csv file. 
+* Keys are used as the id/core id for a Dwca with extensions and must be supplied in the data for both the core and the extensions
+* Validation is performed to make sure that the keys are unique in the core of the Dwca
+* If keys are not provided, the default key is occurrenceID
+* If multiple keys are supplied, the resulting dwca will contain a generated id/core id
 * In creating a dwca with multimedia extension, provide format and type values in the Simple Multimedia extension, otherwise, dwcahandler will attempt to fill these info by guessing the mimetype from url.
 
 ```python
 from dwcahandler import CsvFileType
 from dwcahandler import DwcaHandler
+from dwcahandler import MetaElementTypes
 from dwcahandler import Eml
 
-core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type='occurrence', keys=['occurrenceID'])
-ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia', keys=['occurrenceID'])]
+core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
+ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)]
 
 eml = Eml(dataset_name='Test Dataset',
           description='Dataset description',
           license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
           citation="test citation",
           rights="test rights")
 
-DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=eml, output_dwca_path='/tmp/dwca.zip')
+DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
 * Create Darwin Core Archive from pandas dataframe
@@ -86,75 +119,95 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em
 ```python
 from dwcahandler import DwcaHandler
 from dwcahandler.dwca import CsvFileType
+from dwcahandler import MetaElementTypes
 from dwcahandler import Eml
 import pandas as pd
 
 core_df = pd.read_csv("/tmp/occurrence.csv")
-core_frame = CsvFileType(files=core_df, type='occurrence', keys=['occurrenceID'])
+core_frame = CsvFileType(files=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
 
 ext_df = pd.read_csv("/tmp/multimedia.csv")
-ext_frame = [CsvFileType(files=ext_df, type='multimedia', keys=['occurrenceID'])]
+ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA)]
 
 eml = Eml(dataset_name='Test Dataset',
           description='Dataset description',
           license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
           citation="test citation",
           rights="test rights")
 
-DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca_path='/tmp/dwca.zip')
-
+DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
-* Merge Darwin Core Archive
+* Create Darwin Core Archive from csv files in a zip file.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content
+* The delimiter for txt files is comma by default. For tab-delimited files, supply CSVEncoding
 ```python
 from dwcahandler import DwcaHandler
+from dwcahandler import Eml
 
-DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip',
-                       output_dwca_path='/tmp/new-dwca.zip', 
-                       keys_lookup={'occurrence':'occurrenceID'})
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
+
+DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip",  eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
-* Delete Rows from core file in Darwin Core Archive
+* Convenient helper function to create Darwin Core Archive from csv files in a zip file.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content
+* The delimiter for txt files is comma by default. For tab-delimited files, supply CSVEncoding
 ```python
-from dwcahandler import CsvFileType
 from dwcahandler import DwcaHandler
+from dwcahandler import Eml
 
-delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type='occurrence', keys=['occurrenceID'])
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
 
-DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip',
-                           records_to_delete=delete_csv, 
-                           output_dwca_path='/tmp/new-dwca.zip')
+DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip",  eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
-* List darwin core terms that is supported in dwcahandler package
+* Convenient helper function to create Darwin Core Archive from a list of csv files.
+* Class row types are determined by the file names of the csvs.
+* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content
+* The delimiter for txt files is comma by default. For tab-delimited files, supply CSVEncoding
 ```python
 from dwcahandler import DwcaHandler
+from dwcahandler import Eml
+
+eml = Eml(dataset_name='Test Dataset',
+          description='Dataset description',
+          license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
+          citation="test citation",
+          rights="test rights")
 
-df = DwcaHandler.list_dwc_terms()
-print(df)
+DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.txt", "/tmp/occurrence.txt", "/tmp/measurement_or_fact.txt"], 
+                                       eml_content=eml, output_dwca='/tmp/dwca.zip')
 ```
 &nbsp;
-* Other usages may include subclassing the dwca class, modifying the core dataframe content and rebuilding the dwca.
+* Merge Darwin Core Archive
+```python
+from dwcahandler import DwcaHandler, MetaElementTypes
+
+DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip',
+                       output_dwca='/tmp/new-dwca.zip', 
+                       keys_lookup={MetaElementTypes.OCCURRENCE:'occurrenceID'})
+```
+&nbsp;
+* Delete Rows from core file in Darwin Core Archive
 ```python
-from dwcahandler import Dwca
-
-class DerivedDwca(Dwca):
-    """
-    Derived class to perform other custom operations that is not included as part of the core operations
-    """
-    def drop_columns(self):
-        """
-        Drop existing column in the core content
-        """
-        self.core_content.df_content.drop(columns=['column1', 'column2'], inplace=True)
-        self._update_meta_fields(self.core_content)
-
-
-dwca = DerivedDwca(dwca_file_loc='/tmp/dwca.zip')
-dwca.extract_dwca()
-dwca.drop_columns()
-dwca.generate_eml()
-dwca.generate_meta()
-dwca.write_dwca('/tmp/newdwca.zip')
-
-```
+from dwcahandler import CsvFileType
+from dwcahandler import DwcaHandler, MetaElementTypes
+
+delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
+
+DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip',
+                           records_to_delete=delete_csv, 
+                           output_dwca='/tmp/new-dwca.zip')
+```
+&nbsp;
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dwcahandler"
-version = "0.4.0"
+version = "1.0.0b1"
 description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
 authors = ["Atlas of Living Australia data team <support@ala.org.au>"]
 maintainers = ["Atlas of Living Australia data team <support@ala.org.au>"]
@@ -19,7 +19,7 @@ metapype = "^0.0.26"
 flake8 = "^7.1.1"
 
 [tool.poetry.scripts]
-update-dwc-terms = "dwcahandler.scripts.update_dwc_terms:update_terms"
+update-terms = "dwcahandler.scripts.update_terms:update_terms"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py
@@ -16,6 +16,9 @@
 the (usually Darwin Core) terms that each column contains.
 
 """
+from __future__ import annotations
+
+import io
 from collections import namedtuple
 from dataclasses import dataclass, field
 from typing import Optional, Union
@@ -28,6 +31,25 @@
     EXTENSION="extension"
 )
 
+# Default keys for content when creating dwca
+DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])(
+    EVENT = "eventID",
+    OCCURRENCE = "occurrenceID"
+)
+
+def get_keys(type: MetaElementTypes, override_content_keys: dict[[MetaElementTypes, list]] = None):
+    """
+    # If override_content_keys not supplied, return the default keys based on content type
+    :param type:  type of content
+    :param override_content_keys: given content keys
+    :return: the list of keys for the content
+    """
+    if override_content_keys:
+        for content_type, keys in override_content_keys.items():
+            if type == content_type and keys and len(keys) > 0:
+                return keys
+    defaults = DefaultKeys._asdict()
+    return [defaults[type.name]] if type.name in defaults.keys() else []
 
 @dataclass
 class CSVEncoding:
@@ -60,23 +82,6 @@ def __convert_values(self, v):
         return translate_table[v] if v in translate_table.keys() else v
 
 
-@dataclass
-class CsvFileType:
-    """A description of a CSV file in a DwCA
-    """
-    files: Union[list[str], pd.DataFrame]  # can accept more than one file or a dataframe
-    type: str  # 'occurrence', 'taxon', 'event', multimedia,...
-    keys: Optional[list] = None  # must be supplied for csv extensions to link extension records to core record
-    # when creating dwca. for core other than occurrence, this neeeds to be supplied as key.
-    # column keys lookup in core or extension for delete records
-    associated_files_loc: Optional[str] = None  # in case there are associated media that need to be packaged in dwca
-    csv_encoding: CSVEncoding = field(
-        default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"',
-                                            csv_escape_char='"'))
-    # delimiter: Optional[str] = None
-    # file delimiter type when reading the csv. if not supplied, the collectory setting delimiter is read in for the dr
-
-
 class Stat:
     """Record statistics for a DwCA"""
     start_record_count: int = 0
@@ -176,8 +181,43 @@ class Defaults:
 
 
 # Imports at end of file to allow classes to be used
-from dwcahandler.dwca.terms import Terms
-from dwcahandler.dwca.dwca_meta import Element, MetaElementTypes, MetaElementInfo, MetaDwCA
+from dwcahandler.dwca.terms import Terms, NsPrefix
+from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA,
+                                        MetaElementAttributes, get_meta_class_row_type)
+@dataclass
+class CsvFileType:
+    """A description of a CSV file in a DwCA
+    """
+    files: Union[list[str], pd.DataFrame, io.TextIOWrapper]  # can accept more than one file or a dataframe
+    type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,...
+    keys: Optional[list] = None  # must be supplied for csv extensions to link extension records to core record
+    # when creating dwca. for core other than occurrence, this neeeds to be supplied as key.
+    # column keys lookup in core or extension for delete records
+    associated_files_loc: Optional[str] = None  # in case there are associated media that need to be packaged in dwca
+    csv_encoding: CSVEncoding = field(
+        default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"',
+                                            csv_escape_char='"'))
+
+    def check_for_empty(self, include_keys = True):
+        if self.files and len(self.files) > 0 and \
+                self.type and isinstance(self.type, MetaElementTypes) and \
+                (not include_keys or include_keys and self.keys and len(self.keys) > 0):
+            return True
+        return False
+
+    def add_data(self, other_csv_file_type: CsvFileType):
+        if self.type and self.type == other_csv_file_type.type:
+            if isinstance(self.files, pd.DataFrame) and isinstance(other_csv_file_type.files, pd.DataFrame):
+                self.files = pd.concat([self.files, other_csv_file_type.files], ignore_index=False)
+                return True
+            elif isinstance(self.files, list) and isinstance(other_csv_file_type.files, list):
+                self.files.append(other_csv_file_type.files)
+                return True
+        elif not self.type:
+            self.files = other_csv_file_type.files
+            self.type = other_csv_file_type.type
+        return False
+
 from dwcahandler.dwca.eml import Eml
 from dwcahandler.dwca.base_dwca import BaseDwca
 from dwcahandler.dwca.core_dwca import Dwca, DfContent