Skip to content

Commit 56cbab2

Browse files
authored
Release v0.4.0
Release v0.4.0
2 parents 836b6ef + d7bfd60 commit 56cbab2

20 files changed

+330
-20
lines changed

.devcontainer/devcontainer.json

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
2+
// README at: https://github.com/devcontainers/templates/tree/main/src/python
3+
{
4+
"name": "Python 3",
5+
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
6+
"image": "mcr.microsoft.com/devcontainers/python:1-3.9-bookworm",
7+
8+
// Features to add to the dev container. More info: https://containers.dev/features.
9+
"features": { "ghcr.io/devcontainers-contrib/features/poetry": "latest" },
10+
11+
// Use 'forwardPorts' to make a list of ports inside the container available locally.
12+
// "forwardPorts": [],
13+
14+
// Use 'postCreateCommand' to run commands after the container is created.
15+
"postCreateCommand": "poetry install",
16+
17+
// Configure tool-specific properties.
18+
// "customizations": {},
19+
20+
// Connect as the non-root "vscode" user (change to "root" to connect as root instead). More info: https://aka.ms/dev-containers-non-root.
21+
"remoteUser": "vscode",
22+
"customizations": {
23+
"vscode": {
24+
"extensions": [
25+
"ms-python.python",
26+
"vector-of-bool.gitflow"
27+
]
28+
}
29+
}
30+
}

.github/workflows/run-tests.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
strategy:
1616
fail-fast: true
1717
matrix:
18-
os: [ubuntu-latest, macos-12, Windows-latest]
18+
os: [ubuntu-latest, macos-15, Windows-latest]
1919
python:
2020
- "3.9"
2121
- "3.10"

pyproject.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "dwcahandler"
3-
version = "0.3.0"
3+
version = "0.4.0"
44
description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns"
55
authors = ["Atlas of Living Australia data team <support@ala.org.au>"]
66
maintainers = ["Atlas of Living Australia data team <support@ala.org.au>"]
@@ -26,4 +26,4 @@ requires = ["poetry-core"]
2626
build-backend = "poetry.core.masonry.api"
2727

2828
[tool.pytest.ini_options]
29-
pythonpath = "src"
29+
pythonpath = "src"

src/dwcahandler/dwca/core_dwca.py

+25-15
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def convert_values(v):
207207
csv_file_name = meta_elm.meta_element_type.file_name
208208
with io.TextIOWrapper(zf.open(csv_file_name), encoding="utf-8") as csv_file:
209209
dwc_headers = [f.field_name for f in meta_elm.fields if f.index is not None]
210+
duplicates = [i for i in set(dwc_headers) if dwc_headers.count(i) > 1]
211+
if len(duplicates) > 0:
212+
raise ValueError(f"Duplicate columns {duplicates} specified in the "
213+
f"metadata for {csv_file_name}")
210214
csv_encoding = {key: convert_values(value) for key, value in
211215
asdict(meta_elm.meta_element_type.csv_encoding).items()}
212216
csv_content = self._read_csv(
@@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
825829
"""
826830

827831
def report_error(content, keys, message, condition, error_file=None):
828-
log.error("%s found in keys %s", message, keys)
829-
log.error("\n%s count\n%s", message, condition.sum())
830-
log.error("\n%s", content.loc[condition.values, keys].index.tolist())
831-
if error_file:
832-
content.loc[condition.values, keys].to_csv(error_file, index=False)
832+
content.loc[condition.values, keys].to_csv(error_file, index=False)
833833

834834
checks_status: bool = True
835835
if len(keys) > 0:
836836
empty_values_condition = content_keys_df.isnull()
837837
if empty_values_condition.values.any():
838-
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
838+
log.error("Empty values found in %s. Total rows affected: %s", keys,
839+
empty_values_condition.sum().sum())
840+
log.error("Empty values found in dataframe row: %s",
841+
content_keys_df.index[empty_values_condition.all(axis=1)].tolist())
842+
if error_file:
843+
report_error(content_keys_df, keys, "Empty Values", empty_values_condition)
839844
checks_status = False
840845

841846
# check case-insensitive duplicates
@@ -846,8 +851,11 @@ def to_lower(df):
846851
df_keys = to_lower(content_keys_df)
847852
duplicate_condition = df_keys.duplicated(keep='first')
848853
if duplicate_condition.values.any():
849-
report_error(content_keys_df, keys, "Duplicate Values",
850-
duplicate_condition, error_file)
854+
log.error(f"Duplicate %s found. Total rows affected: %s", keys, duplicate_condition.sum())
855+
log.error("Duplicate values: %s", pd.unique(content_keys_df[duplicate_condition].stack()))
856+
if error_file:
857+
report_error(content_keys_df, keys, "Duplicate Values",
858+
duplicate_condition, error_file)
851859
checks_status = False
852860

853861
return checks_status
@@ -888,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
888896
889897
- No duplicate record keys
890898
- Valid columns
891-
- No duplicate columns
892899
893900
:param error_file: A file to record errors
894901
:return: True if the DwCA is valid, False otherwise
@@ -907,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
907914
if not self._validate_columns(content):
908915
return False
909916

910-
dup_cols = self._find_duplicate_columns(content)
911-
if len(dup_cols) > 0:
912-
return False
913-
914917
return True
915918

916919
def extract_csv_content(self, csv_info: CsvFileType,
@@ -1062,9 +1065,16 @@ def _read_csv(self,
10621065
ret_val.dropna(how="all", inplace=True)
10631066
log.debug("Extracted %d rows from csv %s", len(ret_val), csv_file)
10641067

1068+
# Strip column header spaces
1069+
ret_val.rename(str.strip, axis = 'columns', inplace=True)
1070+
10651071
return ret_val
10661072

10671073
except EmptyDataError:
1068-
log.error(f"The expected columns: %s are not present in the {csv_file}. "
1069-
f"The file may be empty", ','.join(columns))
1074+
if columns:
1075+
log.error(f"The file may be empty {csv_file}")
1076+
else:
1077+
log.error(f"The expected columns: %s are not present in the {csv_file}. "
1078+
f"The file may be empty", ','.join(columns))
1079+
10701080
return pd.DataFrame()

src/dwcahandler/dwca/dwca_factory.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
8282
regen_ids=regen_ids, validate_delta=validate_delta_content)
8383

8484
@staticmethod
85-
def validate_dwca(dwca_file: str, keys_lookup: dict = None, error_file: str = None):
85+
def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
8686
"""Test a dwca for consistency
8787
8888
:param dwca_file: The path to the DwCA, or a BytesIO object containing the DwCA

src/dwcahandler/dwca/dwca_meta.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,16 @@ def __extract_meta_info(self, ns, node_elm, core_or_ext_type):
143143
def extract_field_attr_value(field_elm, attrib):
144144
return field_elm.attrib.get(attrib) if field_elm.attrib.get(attrib) else None
145145

146+
def __find_id_in_fields(local_fields, id_field):
147+
index_number = id_field[0].attrib["index"] if len(id_field) > 0 else "0"
148+
return next((item for item in local_fields if "index" in item.attrib and item.attrib["index"]==index_number), None)
149+
146150
fields = node_elm.findall(f'{ns}field')
151+
id_field = []
152+
if core_or_ext_type == 'core':
153+
id_field = node_elm.findall(f'{ns}id')
154+
else:
155+
id_field = node_elm.findall(f'{ns}coreid')
147156
file_name = node_elm.find(f'{ns}files').find(f'{ns}location').text
148157
meta_element_info = MetaElementInfo(
149158
core_or_ext_type=core_or_ext_type,
@@ -157,7 +166,8 @@ def extract_field_attr_value(field_elm, attrib):
157166
charset_encoding=node_elm.attrib['encoding'],
158167
file_name=file_name)
159168
# set first field with index 0 if it's not present in list of fields
160-
if fields[0].attrib['index'] != '0':
169+
field_elm = __find_id_in_fields(fields, id_field)
170+
if field_elm is None and len(id_field) > 0:
161171
if CoreOrExtType.CORE == core_or_ext_type:
162172
field_list = [Field(index=0, field_name="id")]
163173
else:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<archive xmlns="http://rs.tdwg.org/dwc/text/">
3+
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
4+
<files>
5+
<location>occurrence.txt</location>
6+
</files>
7+
<id index="0" />
8+
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
9+
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
10+
<field index="3" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
11+
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
12+
<field index="5" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
13+
<field index="6" term="http://rs.tdwg.org/dwc/terms/recordedBy"/>
14+
<field index="7" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
15+
<field index="8" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
16+
</core>
17+
</archive>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
id,occurrenceID,scientificName,decimalLatitude,decimalLongitude,eventDate,recordedBy,geodeticDatum,basisOfRecord
2+
1,1001,SpeciesA,12.34,-56.78,2023-01-01,John Doe,WGS84,PreservedSpecimen
3+
2,1002,SpeciesB,-34.56,78.90,2023-02-15,Jane Smith,WGS84,HumanObservation
4+
3,1003,SpeciesC,0.123,45.678,2023-03-20,Bob Johnson,WGS84,FossilSpecimen
5+
4,1004,SpeciesD,-23.456,-12.345,2023-04-10,Alice Brown,WGS84,MachineObservation
6+
5,1005,SpeciesE,89.012,-67.890,2023-05-25,Charlie White,WGS84,PreservedSpecimen
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
2+
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
3+
<files>
4+
<location>occurrence.txt</location>
5+
</files>
6+
<id index="0" />
7+
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
8+
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
9+
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
10+
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
11+
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
12+
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
13+
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
14+
</core>
15+
</archive>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
2+
1 occ1 Euphorbia paralias -36.00000 150.5678 Observations
3+
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="metadata.xml">
2+
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
3+
<files>
4+
<location>occurrence.txt</location>
5+
</files>
6+
<id index="0" />
7+
<field default="WGS84" term="http://rs.tdwg.org/dwc/terms/geodeticDatum"/>
8+
<field index="0" term="http://rs.gbif.org/terms/1.0/gbifID"/>
9+
<field index="1" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
10+
<field index="2" term="http://purl.org/dc/terms/scientificName"/>
11+
<field index="3" term="http://purl.org/dc/terms/decimalLatitude"/>
12+
<field index="4" term="http://purl.org/dc/terms/decimalLongitude"/>
13+
<field index="5" term="http://purl.org/dc/terms/basisOfRecord"/>
14+
</core>
15+
</archive>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
gbifID occurrenceID scientificName decimalLatitude decimalLongitude basisOfRecord
2+
1 Euphorbia paralias -36.00000 150.5678 Observations
3+
2 occ2 Acaciella angustissima -20.0000 145.1234 Observations
4+
3 occ3 Acaciella angustissima -20.0000 145.1234 Observations
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<archive xmlns="http://rs.tdwg.org/dwc/text/">
3+
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
4+
<files>
5+
<location>occurrence.csv</location>
6+
</files>
7+
<id index="3" />
8+
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
9+
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
10+
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
11+
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
12+
<field index="4" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
13+
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
14+
</core>
15+
</archive>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2+
2015-07-18T12:58:00+00:00,Human Observation,Species A,014800,-30.00000,144
3+
2015-07-18T12:58:31+00:00,Human Observation,Species B,,-31.00000,145
4+
2015-07-18T18:16:52+00:00,Human Observation,Species C,014824,-32.00000,100.828059
5+
2015-07-19T04:28:19+00:00,Human Observation,Species D,014823,-33.00000,101.820888
6+
2015-07-19T18:29:25+00:00,Human Observation,Species A1,014822,-34.00000,102.821654
7+
2015-07-20T18:03:12+00:00,Human Observation,Species A2,014821,-35.00000,104.999974
8+
2015-07-21T18:06:58+00:00,Human Observation,Species A3,014802,-34.00000,120.889354
9+
2015-07-22T04:42:47+00:00,Human Observation,Species B1,014800,-36.00000,150.308848
10+
2015-07-22T17:54:18+00:00,Human Observation,Species B2,014800,-30.00000,146.240159
11+
2015-07-22T23:09:51+00:00,Human Observation,Species C1,014799,-31.00000,150.783246
12+
2015-07-23T17:37:26+00:00,Human Observation,Species D,014798,-40.00000,150.823468
13+
2015-07-24T13:10:00+00:00,Human Observation,Species E,014823,-28.00000,115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<archive xmlns="http://rs.tdwg.org/dwc/text/">
3+
<core rowType="http://rs.tdwg.org/dwc/terms/Occurrence" encoding="utf-8" fieldsTerminatedBy="," linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1">
4+
<files>
5+
<location>occurrence.csv</location>
6+
</files>
7+
<field index="0" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
8+
<field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
9+
<field index="2" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
10+
<field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
11+
<field index="4" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
12+
<field index="5" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
13+
</core>
14+
</archive>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
eventDate,basisOfRecord,scientificName,catalogNumber,decimalLatitude,decimalLongitude
2+
2015-07-18T12:58:00+00:00,Human Observation,Species A,014826,-30,144
3+
2015-07-18T12:58:31+00:00,Human Observation,Species A1,014825,-31.1111,145
4+
2015-07-18T18:16:52+00:00,Human Observation,Species A2,014824,-32.085431,100.828059
5+
2015-07-19T04:28:19+00:00,Human Observation,Species A3,014823,-33.097233,101.820888
6+
2015-07-19T18:29:25+00:00,Human Observation,Species B1,014822,-34.099936,102.821654
7+
2015-07-20T18:03:12+00:00,Human Observation,Species B2,014821,-35.893671,104.999974
8+
2015-07-21T18:06:58+00:00,Human Observation,Species C,014820,-34.113747,120.889354
9+
2015-07-22T04:42:47+00:00,Human Observation,Species C2,014810,-36,144.308848
10+
2015-07-22T17:54:18+00:00,Human Observation,Species C3,014800,-30.440251,146.240159
11+
2015-07-22T23:09:51+00:00,Human Observation,Species D,014799,-31.547195,150.783246
12+
2015-07-23T17:37:26+00:00,Human Observation,Species D1,-40.481117,150.823468
13+
2015-07-24T13:10:00+00:00,Human Observation,Species D2,014792,-28,115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
catalogNumber ,identifier, format ,type
2+
C4,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XXX,image/jpeg,StillImage
3+
C5,https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=XYZ,image/jpeg,StillImage
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
catalogNumber ,basisOfRecord,scientificName ,license,decimalLatitude,decimalLongitude
2+
C3,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-10.0000,120.0000
3+
C4,Human Observation,Ageratina adenophora,CC-BY 4.0 (Int),-11.1111,125.0000
4+
C5,Human Observation,Delairea odorata,CC-BY 4.0 (Int),-12.085431,130.828059

0 commit comments

Comments
 (0)