Skip to content

Commit 533c6d0

Browse files
committed
#20 - Provide convenient helper methods for creating dwca
1 parent 0d7b66e commit 533c6d0

File tree

6 files changed

+196
-35
lines changed

6 files changed

+196
-35
lines changed

README.md

+53-3
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@ ALA receive different forms of data from various data providers in the form of C
99

1010
The operations provided by dwcahandler include creating a dwca from csv/text files, merging two dwcas, deleting records in a dwca, and performing core key validations such as testing for empty keys and for duplicates of one or more keys.
1111

12-
The module uses and maintain the standard dwc terms from a point in time versioned copy of https://dwc.tdwg.org/terms/ and extensions like https://rs.gbif.org/extension/gbif/1.0/multimedia.xml.
13-
14-
1512
### Technologies
1613

1714
This package is developed in Python. Tested with Python 3.12, 3.11, 3.10 and 3.9
@@ -76,6 +73,8 @@ print(df_terms, df_class)
7673
* Listed in [class-rowtype.csv](src/dwcahandler/dwca/terms/class-rowtype.csv)
7774
* Used in MetaElementTypes class enum name:
7875
```python
76+
from dwcahandler import MetaElementTypes
77+
7978
MetaElementTypes.OCCURRENCE
8079
MetaElementTypes.MULTIMEDIA
8180
```
@@ -137,7 +136,58 @@ eml = Eml(dataset_name='Test Dataset',
137136
rights="test rights")
138137

139138
DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip')
139+
```
140+
 
141+
* Create Darwin Core Archive from csv files in a zip file.
142+
* Class row types are determined by the file names of the csvs.
143+
* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
144+
* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding.
145+
```python
146+
from dwcahandler import DwcaHandler
147+
from dwcahandler import Eml
148+
149+
eml = Eml(dataset_name='Test Dataset',
150+
description='Dataset description',
151+
license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
152+
citation="test citation",
153+
rights="test rights")
154+
155+
DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip')
156+
```
157+
 
158+
* Convenient helper function to create Darwin Core Archive from csv files in a zip file.
159+
* Class row types are determined by the file names of the csvs.
160+
* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
161+
* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding.
162+
```python
163+
from dwcahandler import DwcaHandler
164+
from dwcahandler import Eml
165+
166+
eml = Eml(dataset_name='Test Dataset',
167+
description='Dataset description',
168+
license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
169+
citation="test citation",
170+
rights="test rights")
171+
172+
DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip')
173+
```
174+
 
175+
* Convenient helper function to create Darwin Core Archive from a list of csv files.
176+
* Class row types are determined by the file names of the csvs.
177+
* If no content keys are provided, the default keys are eventID for event content and occurrenceID for occurrence content.
178+
* The delimiter for txt files is a comma by default. For tab-delimited files, supply a CSVEncoding.
179+
```python
180+
from dwcahandler import DwcaHandler
181+
from dwcahandler import Eml
182+
183+
eml = Eml(dataset_name='Test Dataset',
184+
description='Dataset description',
185+
license='Creative Commons Attribution (International) (CC-BY 4.0 (Int) 4.0)',
186+
citation="test citation",
187+
rights="test rights")
140188

189+
DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.txt", "/tmp/occurrence.txt", "/tmp/measurement_or_fact.txt"],
190+
eml_content=eml, output_dwca='/tmp/dwca.zip')
141191
```
142192
 
143193
* Merge Darwin Core Archive

src/dwcahandler/dwca/__init__.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""
1919
from __future__ import annotations
2020

21+
import io
2122
from collections import namedtuple
2223
from dataclasses import dataclass, field
2324
from typing import Optional, Union
@@ -30,6 +31,25 @@
3031
EXTENSION="extension"
3132
)
3233

34+
# Default key column for each content type when creating a dwca.
# The field names must match MetaElementTypes member names, because
# get_keys() looks the default up by the member's ``name``.
DefaultKeys = namedtuple("DefaultKeys", ["EVENT", "OCCURRENCE"])(
    EVENT="eventID",
    OCCURRENCE="occurrenceID",
)


def get_keys(type: MetaElementTypes,
             override_content_keys: Optional[dict[MetaElementTypes, list]] = None) -> list:
    """Return the key columns to use for a content type.

    If ``override_content_keys`` holds a non-empty key list for ``type``, that list
    is returned. Otherwise the default key for the content type is returned
    (see ``DefaultKeys``), or an empty list if the type has no default key.

    :param type: type of content. NOTE: the name shadows the ``type`` builtin but is
                 kept because callers pass it by keyword.
    :param override_content_keys: optional dictionary of content type and key list,
                                  e.g. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
    :return: the list of keys for the content
    """
    for content_type, keys in (override_content_keys or {}).items():
        # An empty or missing key list does not override the default
        if type == content_type and keys:
            return keys

    default_key = DefaultKeys._asdict().get(type.name)
    return [default_key] if default_key is not None else []
3353

3454
@dataclass
3555
class CSVEncoding:
@@ -168,7 +188,7 @@ class Defaults:
168188
class CsvFileType:
169189
"""A description of a CSV file in a DwCA
170190
"""
171-
files: Union[list[str], pd.DataFrame] # can accept more than one file or a dataframe
191+
files: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one file or a dataframe
172192
type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,...
173193
keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record
174194
# when creating dwca. For core other than occurrence, this needs to be supplied as key.

src/dwcahandler/dwca/base_dwca.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un
127127

128128
def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
129129
ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
130-
eml_content: Union[str, Eml] = '', additional_validation_on_content: list[CsvFileType] = None):
130+
eml_content: Union[str, Eml] = ''):
131131
"""Create a dwca given the contents of core and extensions and eml content
132132
133133
:param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca
@@ -136,7 +136,6 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
136136
extensions of the dwca if supplied
137137
:param validate_content: whether to validate the contents
138138
:param eml_content: eml content in string or a filled Eml object
139-
:param additional_validation_on_content: additional validation to perform
140139
"""
141140
if ext_csv_list is None:
142141
ext_csv_list = []
@@ -150,13 +149,16 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
150149
if image_ext:
151150
ext_csv_list.append(image_ext)
152151

152+
content_to_validate = {}
153153
for ext in ext_csv_list:
154+
if ext.keys and len(ext.keys) > 0:
155+
content_to_validate[ext.type] = ext.keys
154156
self.extract_csv_content(csv_info=ext, core_ext_type=CoreOrExtType.EXTENSION,
155157
build_coreid_for_ext=True)
156158

157159
self.fill_additional_info()
158160

159-
if validate_content and not self.validate_content(additional_validation_on_content):
161+
if validate_content and not self.validate_content(content_to_validate):
160162
raise SystemExit(Exception("Some validations error found. Dwca is not created."))
161163

162164
self.generate_eml(eml_content)
@@ -191,7 +193,7 @@ def validate_dwca(self, content_keys: dict, error_file: str):
191193
If additional checks required in another content, supply it as content_keys
192194
193195
:param content_keys: a dictionary of class type and the key
194-
for eg. {MetaElementTypes.OCCURRENCE, "occurrenceId"}
196+
e.g. {MetaElementTypes.OCCURRENCE: "occurrenceID"}
195197
:param error_file: optional error_file for the errored data
196198
"""
197199
self.extract_dwca()

src/dwcahandler/dwca/core_dwca.py

+9-21
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pandas.errors import EmptyDataError
2323
from pandas.io import parsers
2424
from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding,
25-
CsvFileType, Defaults, Eml, Terms,
25+
CsvFileType, Defaults, Eml, Terms, get_keys,
2626
MetaDwCA, MetaElementInfo, MetaElementTypes,
2727
MetaElementAttributes, Stat, record_diff_stat)
2828

@@ -350,6 +350,7 @@ def set_keys(self, keys: dict = None):
350350
key_list = [v] if isinstance(v, str) else v
351351
col_term = []
352352
for a_key in key_list:
353+
# this is in case a_key is url form for eg: http://rs.gbif.org/terms/1.0/gbifID
353354
if a_key not in dwca_content.df_content.columns.tolist():
354355
col_term.append(Terms.extract_term(a_key))
355356
else:
@@ -893,31 +894,17 @@ def extract_csv_content(self, csv_info: CsvFileType,
893894
:param core_ext_type: Whether this is a core or extension content frame
894895
:param build_coreid_for_ext: indicator to build id and core id to support dwca with extension
895896
"""
896-
def __get_default_core_key(core_sv_info: CsvFileType):
897-
"""Look for a column in a CSV file
898-
899-
:param core_sv_info: The CSV file
900-
:return: default key if csv_info.keys not provided.
901-
Default key is eventID for EVENT type and occurrenceID for occurrence type
902-
"""
903-
if not core_sv_info.keys or len(core_sv_info.keys) == 0:
904-
if core_sv_info.type == MetaElementTypes.EVENT:
905-
return ["eventID"]
906-
elif core_sv_info.type == MetaElementTypes.OCCURRENCE:
907-
return ["occurrenceID"]
908-
else:
909-
raise ValueError("Keys need to be set for core content")
910-
elif len(core_sv_info.keys) > 0:
911-
return core_sv_info.keys
912-
913-
if isinstance(csv_info.files, pd.DataFrame):
897+
if isinstance(csv_info.files, pd.DataFrame) :
914898
csv_content = csv_info.files
899+
elif isinstance(csv_info.files, io.TextIOWrapper):
900+
csv_content = self._read_csv(csv_info.files)
915901
else:
916902
csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding)
917903

918-
# Use default occurrenceID if not provided
904+
# Use default keys if not provided
919905
if core_ext_type == CoreOrExtType.CORE:
920-
keys = __get_default_core_key(csv_info)
906+
override_keys = {csv_info.type: csv_info.keys} if csv_info.keys and len(csv_info.keys) > 0 else None
907+
keys = get_keys(type=csv_info.type, override_content_keys=override_keys)
921908
else:
922909
keys = self.core_content.keys
923910
core_id_field: str = ""
@@ -945,6 +932,7 @@ def __get_default_core_key(core_sv_info: CsvFileType):
945932
content.keys = keys
946933
self.core_content = content
947934
else:
935+
content.keys = csv_info.keys
948936
self.ext_content.append(content)
949937

950938
def _to_csv(self, df: pd.DataFrame, meta_info: MetaElementInfo,

src/dwcahandler/dwca/dwca_factory.py

+105-5
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
Module contains factory class for Dwca. This is used to decide the type of darwin core class to perform the operation.
33
44
"""
5-
5+
import io
66
import logging
77
from typing import Union
88
import pandas as pd
9-
from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes
9+
from dwcahandler.dwca import CsvFileType, Dwca, Terms, Eml, MetaElementTypes, CSVEncoding, get_keys
1010
from io import BytesIO
11+
from pathlib import Path
12+
from zipfile import ZipFile
1113

1214
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
1315
log = logging.getLogger("DwcaFactoryManager")
@@ -24,7 +26,104 @@ def list_class_rowtypes() :
2426
for name, member in MetaElementTypes.__members__.items():
2527
print(f"{name}: {member.value}")
2628

29+
@staticmethod
def get_contents_from_file_names(files: list) -> tuple[Union[dict[str, MetaElementTypes], None],
                                                       dict[str, MetaElementTypes]]:
    """Find the core content and extension contents from a list of file paths.

    The class row type of each file is derived from its file name: the stem
    (file name without directory and extension) is upper-cased and matched
    against the MetaElementTypes member names. Files matching no member are ignored.

    Core content will always be event if present, otherwise, occurrence content.

    :param files: list of file paths or file names
    :return: tuple of (core, extensions). Each is a dictionary of file path and its
             MetaElementTypes member. When there is neither an event nor an occurrence
             file, core is None and extensions is an empty dictionary.
    """
    # Look the enum members up once instead of rebuilding the mapping for every file
    row_types = MetaElementTypes.__members__

    contents = {}
    for file in files:
        type_name = Path(file).stem.upper()
        if type_name in row_types:
            contents[file] = row_types[type_name]

    # Event takes precedence over occurrence as the core
    for core_type in (MetaElementTypes.EVENT, MetaElementTypes.OCCURRENCE):
        core_file = {name: row_type for name, row_type in contents.items() if row_type == core_type}
        if core_file:
            ext_files = {name: row_type for name, row_type in contents.items() if row_type != core_type}
            return core_file, ext_files

    # Always return a pair: callers unpack the result into two names, so a bare
    # None would fail with TypeError before they can report the missing core.
    return None, {}
61+
2762
"""Perform various DwCA operations"""
63+
@staticmethod
def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO],
                               eml_content: Union[str, Eml] = '',
                               csv_encoding: Union[CSVEncoding, None] = None,
                               content_keys: dict[MetaElementTypes, list] = None):
    """Create a suitable DwCA from a list of CSV files.

    The core and extension contents are worked out from the file names by
    get_contents_from_file_names.

    :param files: list of paths to the csv/txt files
    :param output_dwca: Where to place the resulting Dwca
    :param eml_content: eml content in string or Eml class
    :param csv_encoding: delimiter for txt file. A default CSVEncoding() is used if not supplied
    :param content_keys: optional dictionary of MetaElementTypes and key list
                         e.g. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
    :raises ValueError: if the core content cannot be determined from the file names
    """
    # Build the default per call rather than sharing one instance created at definition time
    if csv_encoding is None:
        csv_encoding = CSVEncoding()

    # Tolerate a None result so a missing core is reported as ValueError, not as an unpacking TypeError
    contents = DwcaHandler.get_contents_from_file_names(files)
    core_files, ext_files = contents if contents else (None, {})
    if not core_files:
        raise ValueError("The core content cannot be determined. Please check the file names in the list")

    core_filename = next(iter(core_files))
    core_type = core_files[core_filename]
    core_content = CsvFileType(files=[core_filename], type=core_type, csv_encoding=csv_encoding,
                               keys=get_keys(type=core_type, override_content_keys=content_keys))
    ext_content = [CsvFileType(files=[ext_file], type=ext_type, csv_encoding=csv_encoding,
                               keys=get_keys(type=ext_type, override_content_keys=content_keys))
                   for ext_file, ext_type in ext_files.items()]

    DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
                            eml_content=eml_content)
93+
94+
@staticmethod
def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO],
                                 eml_content: Union[str, Eml] = '',
                                 csv_encoding: Union[CSVEncoding, None] = None,
                                 content_keys: dict[MetaElementTypes, list] = None):
    """Create a suitable DwCA from the csv/txt files contained in a zip file.

    The core and extension contents are worked out from the names of the zip
    entries by get_contents_from_file_names. Each entry is read as utf-8 text.

    :param zip_file: Zip file containing txt files
    :param output_dwca: Where to place the resulting Dwca
    :param eml_content: eml content in string or Eml class
    :param csv_encoding: delimiter for txt file. A default CSVEncoding() is used if not supplied
    :param content_keys: optional dictionary of class type and the key list
                         e.g. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
    :raises ValueError: if the core content cannot be determined from the file names in the zip
    """
    from contextlib import ExitStack

    # Build the default per call rather than sharing one instance created at definition time
    if csv_encoding is None:
        csv_encoding = CSVEncoding()

    # The ExitStack closes every opened zip entry once the dwca has been created
    # (or on error); it is exited before the ZipFile itself is closed.
    with ZipFile(zip_file, 'r') as zf, ExitStack() as opened_entries:
        # Tolerate a None result so a missing core is reported as ValueError, not as an unpacking TypeError
        contents = DwcaHandler.get_contents_from_file_names(zf.namelist())
        core_files, ext_files = contents if contents else (None, {})
        if not core_files:
            raise ValueError("The core content cannot be determined. Please check filename in zip file")

        def to_csv_file_type(entry_name, entry_type):
            # Wrap the binary zip entry as a text stream and register it for closing
            handle = opened_entries.enter_context(io.TextIOWrapper(zf.open(entry_name), encoding="utf-8"))
            return CsvFileType(files=handle, type=entry_type, csv_encoding=csv_encoding,
                               keys=get_keys(type=entry_type, override_content_keys=content_keys))

        core_filename = next(iter(core_files))
        core_content = to_csv_file_type(core_filename, core_files[core_filename])
        ext_content = [to_csv_file_type(ext_file, ext_type) for ext_file, ext_type in ext_files.items()]

        DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
                                eml_content=eml_content)
28127

29128
@staticmethod
30129
def create_dwca(core_csv: CsvFileType,
@@ -75,14 +174,15 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
75174
validate_delta=validate_delta_content)
76175

77176
@staticmethod
78-
def validate_dwca(dwca_file: Union[str, BytesIO], keys_lookup: dict = None, error_file: str = None):
177+
def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, error_file: str = None):
79178
"""Test a dwca for consistency
80179
81180
:param dwca_file: The path to the DwCA
82-
:param keys_lookup: The keys that identify a unique record
181+
:param content_keys: a dictionary of class type and the key
182+
e.g. {MetaElementTypes.OCCURRENCE: "occurrenceID"}
83183
:param error_file: The file to write errors to. If None, errors are logged
84184
"""
85-
return Dwca(dwca_file_loc=dwca_file).validate_dwca(keys_lookup, error_file)
185+
return Dwca(dwca_file_loc=dwca_file).validate_dwca(content_keys, error_file)
86186

87187
@staticmethod
88188
def validate_file(csv_file: CsvFileType, error_file: str = None):

src/dwcahandler/dwca/terms.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ def update_terms():
255255
def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame:
256256
"""
257257
Make sure dc and dwc prefixes stay on top
258-
:param df: dataframe
258+
:param df_to_sort: dataframe to be sorted
259+
:param sorting_column: other column to sort
259260
:return: sorted dataFrame
260261
"""
261262
df_to_sort = df_to_sort.sort_values(by=["prefix", sorting_column], key=lambda x: x.str.lower())

0 commit comments

Comments
 (0)