Skip to content

Commit 954c797

Browse files
committed
#20 - Provide more clarity on the content class
1 parent 253061d commit 954c797

11 files changed

+239
-145
lines changed

README.md

+15-11
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,13 @@ DwcaHandler.list_class_rowtypes()
9696
* When creating a DwCA with a multimedia extension, provide format and type values in the Simple Multimedia extension; otherwise, dwcahandler will attempt to fill in this information by guessing the MIME type from the URL.
9797

9898
```python
99-
from dwcahandler import CsvFileType
99+
from dwcahandler import ContentData
100100
from dwcahandler import DwcaHandler
101101
from dwcahandler import MetaElementTypes
102102
from dwcahandler import Eml
103103

104-
core_csv = CsvFileType(files=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
105-
ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)]
104+
core_csv = ContentData(data=['/tmp/occurrence.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
105+
ext_csvs = [ContentData(data=['/tmp/multimedia.csv'], type=MetaElementTypes.MULTIMEDIA)]
106106

107107
eml = Eml(dataset_name='Test Dataset',
108108
description='Dataset description',
@@ -118,16 +118,16 @@ DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, eml_content=em
118118

119119
```python
120120
from dwcahandler import DwcaHandler
121-
from dwcahandler.dwca import CsvFileType
121+
from dwcahandler.dwca import ContentData
122122
from dwcahandler import MetaElementTypes
123123
from dwcahandler import Eml
124124
import pandas as pd
125125

126126
core_df = pd.read_csv("/tmp/occurrence.csv")
127-
core_frame = CsvFileType(files=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
127+
core_frame = ContentData(data=core_df, type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
128128

129129
ext_df = pd.read_csv("/tmp/multimedia.csv")
130-
ext_frame = [CsvFileType(files=ext_df, type=MetaElementTypes.MULTIMEDIA)]
130+
ext_frame = [ContentData(data=ext_df, type=MetaElementTypes.MULTIMEDIA)]
131131

132132
eml = Eml(dataset_name='Test Dataset',
133133
description='Dataset description',
@@ -138,7 +138,9 @@ eml = Eml(dataset_name='Test Dataset',
138138
DwcaHandler.create_dwca(core_csv=core_frame, ext_csv_list=ext_frame, eml_content=eml, output_dwca='/tmp/dwca.zip')
139139
```
140140
 
141-
* Create Darwin Core Archive from csv files in a zip files.
141+
* Convenient helper function to build a Darwin Core Archive from a list of csv files.
142+
* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied.
143+
* Raises an error if neither event.txt nor occurrence.txt is in the list
142144
* Class row types are determined by file names of the csvs.
143145
* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content
144146
* The delimiter for txt files is a comma by default. For a tab delimiter, supply CsvEncoding
@@ -152,10 +154,12 @@ eml = Eml(dataset_name='Test Dataset',
152154
citation="test citation",
153155
rights="test rights")
154156

155-
DwcaHandler.create_dwca_from_zip_content(zip_file="/tmp/txt_files.zip", eml_content=eml, output_dwca='/tmp/dwca.zip')
157+
DwcaHandler.create_dwca_from_file_list(files=["/tmp/event.csv", "/tmp/occurrence.csv"], eml_content=eml, output_dwca='/tmp/dwca.zip')
156158
```
157159
 
158160
* Convenient helper function to create a Darwin Core Archive from csv files in a zip file.
161+
* Build event core DwCA if event.txt file is supplied, otherwise, occurrence DwCA if occurrence.txt is supplied in the zip file
162+
* Raises an error if neither event.txt nor occurrence.txt is in the zip file
159163
* Class row types are determined by file names of the csvs.
160164
* If no content keys provided, the default keys are eventID for event content and occurrenceID for occurrence content
161165
* The delimiter for txt files is a comma by default. For a tab delimiter, supply CsvEncoding
@@ -201,13 +205,13 @@ DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dw
201205
 
202206
* Delete Rows from core file in Darwin Core Archive
203207
```python
204-
from dwcahandler import CsvFileType
208+
from dwcahandler import ContentData
205209
from dwcahandler import DwcaHandler, MetaElementTypes
206210

207-
delete_csv = CsvFileType(files=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
211+
delete_csv = ContentData(data=['/tmp/old-records.csv'], type=MetaElementTypes.OCCURRENCE, keys=['occurrenceID'])
208212

209213
DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip',
210-
records_to_delete=delete_csv,
214+
records_to_delete=delete_csv,
211215
output_dwca='/tmp/new-dwca.zip')
212216
```
213217
 

src/dwcahandler/dwca/__init__.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# flake8: noqa
22
"""
3-
Tools to convert data frames into Darwin Core Archive (DwCA) files.
3+
Tools to convert data frames or text files into a Darwin Core Archive (DwCA) file.
44
55
See https://ipt.gbif.org/manual/en/ipt/2.6/dwca-guide for a guide to DwCAs.
66
@@ -185,36 +185,36 @@ class Defaults:
185185
from dwcahandler.dwca.dwca_meta import (MetaElementTypes, MetaElementInfo, MetaDwCA,
186186
MetaElementAttributes, get_meta_class_row_type)
187187
@dataclass
188-
class CsvFileType:
189-
"""A description of a CSV file in a DwCA
188+
class ContentData:
189+
"""A class describing the content data used for core and extension.
190+
Use this class to define the core content and extension content to build a DwCA (see README on usage)
190191
"""
191-
files: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one file or a dataframe
192-
type: MetaElementTypes # 'occurrence', 'taxon', 'event', multimedia,...
193-
keys: Optional[list] = None # must be supplied for csv extensions to link extension records to core record
194-
# when creating dwca. for core other than occurrence, this neeeds to be supplied as key.
195-
# column keys lookup in core or extension for delete records
196-
associated_files_loc: Optional[str] = None # in case there are associated media that need to be packaged in dwca
192+
data: Union[list[str], pd.DataFrame, io.TextIOWrapper] # can accept more than one files, dataframe or file pointer
193+
type: MetaElementTypes # Enumerated types from the class row type.
194+
keys: Optional[list] = None # keys that uniquely identify a record in the content
195+
associated_files_loc: Optional[str] = None # provide a folder path containing the embedded images.
196+
# Embedded images file name must be supplied as associatedMedia in the content
197197
csv_encoding: CSVEncoding = field(
198198
default_factory=lambda: CSVEncoding(csv_delimiter=",", csv_eol="\n", csv_text_enclosure='"',
199199
csv_escape_char='"'))
200200

201201
def check_for_empty(self, include_keys = True):
202-
if self.files and len(self.files) > 0 and \
202+
if self.data and len(self.data) > 0 and \
203203
self.type and isinstance(self.type, MetaElementTypes) and \
204204
(not include_keys or include_keys and self.keys and len(self.keys) > 0):
205205
return True
206206
return False
207207

208-
def add_data(self, other_csv_file_type: CsvFileType):
208+
def add_data(self, other_csv_file_type: ContentData):
209209
if self.type and self.type == other_csv_file_type.type:
210-
if isinstance(self.files, pd.DataFrame) and isinstance(other_csv_file_type.files, pd.DataFrame):
211-
self.files = pd.concat([self.files, other_csv_file_type.files], ignore_index=False)
210+
if isinstance(self.data, pd.DataFrame) and isinstance(other_csv_file_type.data, pd.DataFrame):
211+
self.data = pd.concat([self.data, other_csv_file_type.data], ignore_index=False)
212212
return True
213-
elif isinstance(self.files, list) and isinstance(other_csv_file_type.files, list):
214-
self.files.append(other_csv_file_type.files)
213+
elif isinstance(self.data, list) and isinstance(other_csv_file_type.data, list):
214+
self.data.append(other_csv_file_type.data)
215215
return True
216216
elif not self.type:
217-
self.files = other_csv_file_type.files
217+
self.data = other_csv_file_type.data
218218
self.type = other_csv_file_type.type
219219
return False
220220

src/dwcahandler/dwca/base_dwca.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77
from abc import ABCMeta, abstractmethod
88
from typing import Union
99
from io import BytesIO
10-
from dwcahandler.dwca import CoreOrExtType, CsvFileType, MetaElementTypes
10+
from dwcahandler.dwca import CoreOrExtType, ContentData, MetaElementTypes
1111
from dwcahandler.dwca.eml import Eml
1212

1313

1414
class BaseDwca(metaclass=ABCMeta):
1515
"""An abstract DwCA that provides basic operations"""
1616

1717
@abstractmethod
18-
def extract_csv_content(self, csv_info: CsvFileType, core_ext_type: CoreOrExtType,
18+
def extract_csv_content(self, csv_info: ContentData, core_ext_type: CoreOrExtType,
1919
build_coreid_for_ext: bool = False):
2020
"""Get the content from a single file in the DwCA
2121
@@ -48,11 +48,11 @@ def generate_meta(self):
4848

4949
@abstractmethod
5050
def write_dwca(self, output_dwca: Union[str, BytesIO]):
51-
"""Write the content of the DwCA to a directory.
51+
"""Write the content of the DwCA to a file path (supplied as string) or to BytesIO in memory.
5252
53-
Writes all CSV files, as well as a meta-file and EML file for the archive.
53+
Writes all CSV data, as well as a meta-file and EML file for the archive.
5454
55-
:param output_dwca: The path to write to or dwca in memory
55+
:param output_dwca: The file path or BytesIO
5656
"""
5757
pass
5858

@@ -85,7 +85,7 @@ def convert_associated_media_to_extension(self):
8585
pass
8686

8787
@abstractmethod
88-
def delete_records(self, records_to_delete: CsvFileType):
88+
def delete_records(self, records_to_delete: ContentData):
8989
pass
9090

9191
@abstractmethod
@@ -111,10 +111,10 @@ def fill_additional_info(self):
111111
for multimedia_content, _ in contents:
112112
self.add_multimedia_info_to_content(multimedia_content)
113113

114-
def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Union[str, BytesIO]):
115-
"""Delete records in dwca if the key records are defined in CsvFileType
114+
def delete_records_in_dwca(self, records_to_delete: ContentData, output_dwca: Union[str, BytesIO]):
115+
"""Delete records in dwca if the key records are defined in ContentData
116116
117-
:param records_to_delete: A CsvFileType that containing the text file of the record keys,
117+
:param records_to_delete: A ContentData containing the text file of the record keys,
118118
the key names of the records and MetaElementType type class of the dwca
119119
where the records need to be removed
120120
:param output_dwca: output dwca path where the result of the dwca is written to or the output dwca in memory
@@ -125,14 +125,14 @@ def delete_records_in_dwca(self, records_to_delete: CsvFileType, output_dwca: Un
125125
self.generate_meta()
126126
self.write_dwca(output_dwca)
127127

128-
def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
129-
ext_csv_list: list[CsvFileType] = None, validate_content: bool = True,
128+
def create_dwca(self, core_csv: ContentData, output_dwca: Union[str, BytesIO],
129+
ext_csv_list: list[ContentData] = None, validate_content: bool = True,
130130
eml_content: Union[str, Eml] = ''):
131131
"""Create a dwca given the contents of core and extensions and eml content
132132
133-
:param core_csv: CsvFileType containing the files, class types and keys to form the core of the dwca
133+
:param core_csv: ContentData containing the data, class type and keys to form the core of the dwca
134134
:param output_dwca: the resulting path of the dwca or the dwca in memory
135-
:param ext_csv_list: list of CsvFileTypes containing the files, class types and keys to form the
135+
:param ext_csv_list: list of ContentData containing the data, class type and keys to form the
136136
extensions of the dwca if supplied
137137
:param validate_content: whether to validate the contents
138138
:param eml_content: eml content in string or a filled Eml object
@@ -143,7 +143,7 @@ def create_dwca(self, core_csv: CsvFileType, output_dwca: Union[str, BytesIO],
143143
self.extract_csv_content(csv_info=core_csv, core_ext_type=CoreOrExtType.CORE,
144144
build_coreid_for_ext=True if len(ext_csv_list) > 0 else False)
145145

146-
# if multimedia files is supplied, do not attempt to convert associated media to multimedia
146+
# if multimedia data is supplied, do not attempt to convert associated media to multimedia
147147
if not any(ext.type == MetaElementTypes.MULTIMEDIA for ext in ext_csv_list):
148148
image_ext = self.convert_associated_media_to_extension()
149149
if image_ext:
@@ -200,10 +200,10 @@ def validate_dwca(self, content_keys: dict, error_file: str):
200200
set_keys = self.set_keys(content_keys)
201201
return self.validate_content(content_to_validate=set_keys, error_file=error_file)
202202

203-
def validate_file(self, csv: CsvFileType, error_file: str):
203+
def validate_file(self, csv: ContentData, error_file: str):
204204
"""Validate the text file
205205
206-
:param csv: CsvFileType to pass the csv, key and type
206+
:param csv: ContentData to pass the csv, key and type
207207
:param error_file: optional error_file for the errored data
208208
"""
209209
self.extract_csv_content(csv_info=csv, core_ext_type=CoreOrExtType.CORE)

src/dwcahandler/dwca/core_dwca.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pandas.errors import EmptyDataError
2323
from pandas.io import parsers
2424
from dwcahandler.dwca import (BaseDwca, CoreOrExtType, CSVEncoding,
25-
CsvFileType, Defaults, Eml, Terms, get_keys,
25+
ContentData, Defaults, Eml, Terms, get_keys,
2626
MetaDwCA, MetaElementInfo, MetaElementTypes,
2727
MetaElementAttributes, Stat, record_diff_stat)
2828

@@ -488,16 +488,16 @@ def _delete_content(self, content, delete_content):
488488
content = self._filter_content(delete_content, content.df_content)
489489
return content
490490

491-
def delete_records(self, records_to_delete: CsvFileType):
491+
def delete_records(self, records_to_delete: ContentData):
492492
"""Delete records from either a core or extension content frame
493493
494494
:param records_to_delete: A CSV file of records to delete, keyed to the DwCA file
495495
"""
496496
delete_content = pd.DataFrame()
497-
if isinstance(records_to_delete.files, pd.DataFrame):
498-
delete_content = records_to_delete.files.copy(deep=True)
497+
if isinstance(records_to_delete.data, pd.DataFrame):
498+
delete_content = records_to_delete.data.copy(deep=True)
499499
else:
500-
delete_content = self._combine_contents(records_to_delete.files, records_to_delete.csv_encoding,
500+
delete_content = self._combine_contents(records_to_delete.data, records_to_delete.csv_encoding,
501501
use_chunking=False)
502502
valid_delete_file = (all(col in delete_content.columns for col in records_to_delete.keys)
503503
or len(delete_content) > 0)
@@ -735,7 +735,7 @@ def convert_associated_media_to_extension(self):
735735
if len(image_df) > 0:
736736
self._update_meta_fields(content=self.core_content, key_field=self.core_content.keys[0])
737737
log.info("%s associated media extracted", str(len(image_df)))
738-
return CsvFileType(files=image_df, type=MetaElementTypes.MULTIMEDIA,
738+
return ContentData(data=image_df, type=MetaElementTypes.MULTIMEDIA,
739739
keys=self.core_content.keys)
740740

741741
log.info("Nothing to extract from associated media")
@@ -886,20 +886,20 @@ def validate_content(self, content_to_validate: dict = None, error_file: str = N
886886

887887
return True if validation_success else False
888888

889-
def extract_csv_content(self, csv_info: CsvFileType,
889+
def extract_csv_content(self, csv_info: ContentData,
890890
core_ext_type: CoreOrExtType, build_coreid_for_ext: bool = False):
891-
"""Read the files from a CSV description into a content frame and include it in the Dwca.
891+
"""Read the data from a CSV description into a content frame and include it in the Dwca.
892892
893893
:param csv_info: The CSV file(s)
894894
:param core_ext_type: Whether this is a core or extension content frame
895895
:param build_coreid_for_ext: indicator to build id and core id to support dwca with extension
896896
"""
897-
if isinstance(csv_info.files, pd.DataFrame) :
898-
csv_content = csv_info.files
899-
elif isinstance(csv_info.files, io.TextIOWrapper):
900-
csv_content = self._read_csv(csv_info.files)
897+
if isinstance(csv_info.data, pd.DataFrame) :
898+
csv_content = csv_info.data
899+
elif isinstance(csv_info.data, io.TextIOWrapper):
900+
csv_content = self._read_csv(csv_info.data)
901901
else:
902-
csv_content = self._combine_contents(csv_info.files, csv_info.csv_encoding)
902+
csv_content = self._combine_contents(csv_info.data, csv_info.csv_encoding)
903903

904904
# Use default keys if not provided
905905
if core_ext_type == CoreOrExtType.CORE:

0 commit comments

Comments
 (0)