Release v0.4.0

patkyn · web-flow · commit 56cbab2acb9d · 2025-01-30T11:33:46.000+11:00
Release v0.4.0
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,30 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/python
+{
+	"name": "Python 3",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm",
+	
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	"features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" },
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Use 'postCreateCommand' to run commands after the container is created.
+	"postCreateCommand": "poetry install",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Connect as the non-root "vscode" user. More info: https://aka.ms/dev-containers-non-root.
+	"remoteUser": "vscode",
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-python.python",
+				"vector-of-bool.gitflow"
+			]
+		}
+	}
+}
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: [ubuntu-latest, macos-12, Windows-latest]
+        os: [ubuntu-latest, macos-15, Windows-latest]
         python:
           - "3.9"
           - "3.10"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dwcahandler"
-version = "0.3.0"
+version = "0.4.0"
 description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
 authors = ["Atlas of Living Australia data team <support@ala.org.au>"]
 maintainers = ["Atlas of Living Australia data team <support@ala.org.au>"]
@@ -26,4 +26,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.pytest.ini_options]
-pythonpath = "src"
+pythonpath = "src"
diff --git a/src/dwcahandler/dwca/core_dwca.py b/src/dwcahandler/dwca/core_dwca.py
@@ -207,6 +207,10 @@ def convert_values(v):
                 csv_file_name = meta_elm.meta_element_type.file_name
                 with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file:
                     dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None]
+                    duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1]
+                    if len(duplicates) > 0:
+                        raise ValueError(f"Duplicate columns {duplicates} specified in the "
+                                         f"metadata for {csv_file_name}")
                     csv_encoding = {key: convert_values(value) for key, value in
                                     asdict(meta_elm.meta_element_type.csv_encoding).items()}
                     csv_content = self._read_csv(
@@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
         """
 
         def report_error(content, keys, message, condition, error_file=None):
-            log.error("%s found in keys %s", message, keys)
-            log.error("\n%s count\n%s", message, condition.sum())
-            log.error("\n%s", content.loc[condition.values, keys].index.tolist())
-            if error_file:
-                content.loc[condition.values, keys].to_csv(error_file, index=False)
+            content.loc[condition.values, keys].to_csv(error_file, index=False)
 
         checks_status: bool = True
         if len(keys) > 0:
             empty_values_condition = content_keys_df.isnull()
             if empty_values_condition.values.any():
-                report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
+                log.error("Empty values found in %s. Total rows affected: %s", keys,
+                          empty_values_condition.sum().sum())
+                log.error("Empty values found in dataframe row: %s",
+                          content_keys_df.index[empty_values_condition.all(axis=1)].tolist())
+                if error_file:
+                    report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
                 checks_status = False
 
             # check case-insensitive duplicates
@@ -846,8 +851,11 @@ def to_lower(df):
             df_keys = to_lower(content_keys_df)
             duplicate_condition = df_keys.duplicated(keep='first')
             if duplicate_condition.values.any():
-                report_error(content_keys_df, keys, "Duplicate Values",
-                             duplicate_condition, error_file)
+                log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum())
+                log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack()))
+                if error_file:
+                    report_error(content_keys_df, keys, "Duplicate Values",
+                                 duplicate_condition, error_file)
                 checks_status = False
 
         return checks_status
@@ -888,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
 
         - No duplicate record keys
         - Valid columns
-        - No duplicate columns
 
         :param error_file: A file to record errors
         :return: True if the DwCA is valid, False otherwise
@@ -907,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
             if not self._validate_columns(content):
                 return False
 
-            dup_cols = self._find_duplicate_columns(content)
-            if len(dup_cols) > 0:
-                return False
-
         return True
 
     def extract_csv_content(self, csv_info: CsvFileType,
@@ -1062,9 +1065,16 @@ def _read_csv(self,
                 ret_val.dropna(how="all", inplace=True)
                 log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file)
 
+                # Strip column header spaces
+                ret_val.rename(str.strip, axis = 'columns', inplace=True)
+
             return ret_val
 
         except EmptyDataError:
-            log.error(f"The expected columns: %s are not present in the {csv_file}. "
-                      f"The file may be empty", ','.join(columns))
+            if columns:
+                log.error(f"The file may be empty {csv_file}")
+            else:
+                log.error(f"The expected columns: %s are not present in the {csv_file}. "
+                          f"The file may be empty", ','.join(columns))
+
             return pd.DataFrame()
diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py
@@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
                                                  regen_ids=regen_ids, validate_delta=validate_delta_content)
 
     @staticmethod
-    def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
+    def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
         """Test a dwca for consistency
 
         :param dwca_file: The path to the DwCA, or a BytesIO object containing it
diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py
@@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type):
         def extract_field_attr_value(field_elm, attrib):
             return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None
 
+        def __find_id_in_fields(local_fields, id_field):
+            index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0"
+            return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None)
+
         fields = node_elm.findall(f'{ns}field')
+        id_field = []
+        if core_or_ext_type == 'core':
+            id_field = node_elm.findall(f'{ns}id')
+        else:
+            id_field = node_elm.findall(f'{ns}coreid')
         file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
         meta_element_info = MetaElementInfo(
             core_or_ext_type=core_or_ext_type,
@@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib):
             charset_encoding=node_elm.attrib['encoding'],
             file_name=file_name)
         # set first field with index 0 if it's not present in list of fields
-        if fields[0].attrib['index'] != '0':
+        field_elm = __find_id_in_fields(fields, id_field)
+        if field_elm is None and len(id_field) > 0:
             if CoreOrExtType.CORE == core_or_ext_type:
                 field_list = [Field(index=0, field_name="id")]
             else:
diff --git a/tests/input_files/dwca/dwca-sample1/meta.xml b/tests/input_files/dwca/dwca-sample1/meta.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.txt</location>
+    </files>
+    <id index="0" />
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="6" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
+    <field index="7" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+    <field index="8" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample1/occurrence.txt b/tests/input_files/dwca/dwca-sample1/occurrence.txt
@@ -0,0 +1,6 @@
+id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord
+1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen
+2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation
+3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen
+4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation
+5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen
diff --git a/tests/input_files/dwca/dwca-sample2/meta.xml b/tests/input_files/dwca/dwca-sample2/meta.xml
@@ -0,0 +1,15 @@
+<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
+    <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
+        <files>
+            <location>occurrence.txt</location>
+        </files>
+        <id index="0" />
+        <field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+        <field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
+        <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+        <field index="2" term="http://purl.org/dc/terms/scientificName"/>
+        <field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
+        <field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
+        <field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
+    </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample2/occurrence.txt b/tests/input_files/dwca/dwca-sample2/occurrence.txt
@@ -0,0 +1,3 @@
+gbifID	occurrenceID    scientificName	decimalLatitude	decimalLongitude	basisOfRecord
+1	occ1	Euphorbia paralias	-36.00000	150.5678	Observations
+2	occ2	Acaciella angustissima	-20.0000	145.1234	Observations
diff --git a/tests/input_files/dwca/dwca-sample3/meta.xml b/tests/input_files/dwca/dwca-sample3/meta.xml
@@ -0,0 +1,15 @@
+<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
+    <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
+        <files>
+            <location>occurrence.txt</location>
+        </files>
+        <id index="0" />
+        <field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
+        <field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
+        <field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
+        <field index="2" term="http://purl.org/dc/terms/scientificName"/>
+        <field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
+        <field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
+        <field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
+    </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample3/occurrence.txt b/tests/input_files/dwca/dwca-sample3/occurrence.txt
@@ -0,0 +1,4 @@
+gbifID	occurrenceID    scientificName	decimalLatitude	decimalLongitude	basisOfRecord
+1		Euphorbia paralias	-36.00000	150.5678	Observations
+2	occ2	Acaciella angustissima	-20.0000	145.1234	Observations
+3	occ3	Acaciella angustissima	-20.0000	145.1234	Observations
diff --git a/tests/input_files/dwca/dwca-sample4/meta.xml b/tests/input_files/dwca/dwca-sample4/meta.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.csv</location>
+    </files>
+    <id index="3" />
+    <field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample4/occurrence.csv b/tests/input_files/dwca/dwca-sample4/occurrence.csv
@@ -0,0 +1,13 @@
+eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
+2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144
+2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145
+2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059
+2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888
+2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654
+2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974
+2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354
+2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848
+2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159
+2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246
+2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468
+2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115
diff --git a/tests/input_files/dwca/dwca-sample5/meta.xml b/tests/input_files/dwca/dwca-sample5/meta.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archive xmlns="http://rs.tdwg.org/dwc/text/">
+  <core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
+    <files>
+      <location>occurrence.csv</location>
+    </files>
+    <field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
+    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
+    <field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
+    <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="4" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
+    <field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
+  </core>
+</archive>
diff --git a/tests/input_files/dwca/dwca-sample5/occurrence.csv b/tests/input_files/dwca/dwca-sample5/occurrence.csv
@@ -0,0 +1,13 @@
+eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
+2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144
+2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145
+2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059
+2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888
+2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654
+2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974
+2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354
+2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848
+2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159
+2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246
+2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468
+2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115
diff --git a/tests/input_files/sample/multimedia_header_with_space.csv b/tests/input_files/sample/multimedia_header_with_space.csv
@@ -0,0 +1,3 @@
+ catalogNumber ,identifier, format ,type
+C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage
+C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage
diff --git a/tests/input_files/sample/occ_header_with_space.csv b/tests/input_files/sample/occ_header_with_space.csv
@@ -0,0 +1,4 @@
+  catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude
+C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000
+C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000
+C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059
diff --git a/tests/test_create_core_and_ext_content.py b/tests/test_create_core_and_ext_content.py
diff --git a/tests/test_validate_dwca.py b/tests/test_validate_dwca.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord`
	`2`	`+1 occ1 Euphorbia paralias -36.00000 150.5678 Observations`
	`3`	`+2 occ2 Acaciella angustissima -20.0000 145.1234 Observations`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+ catalogNumber ,identifier, format ,type`
	`2`	`+C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage`
	`3`	`+C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage`