2
2
Module containing the factory class for DwCA operations. It is used to decide which Darwin Core class performs a given operation.
3
3
4
4
"""
5
-
5
+ import io
6
6
import logging
7
7
from typing import Union
8
8
import pandas as pd
9
- from dwcahandler .dwca import CsvFileType , Dwca , Terms , Eml , MetaElementTypes
9
+ from dwcahandler .dwca import CsvFileType , Dwca , Terms , Eml , MetaElementTypes , CSVEncoding , get_keys
10
10
from io import BytesIO
11
+ from pathlib import Path
12
+ from zipfile import ZipFile
11
13
12
14
logging .basicConfig (format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' , level = logging .INFO )
13
15
log = logging .getLogger ("DwcaFactoryManager" )
@@ -24,7 +26,104 @@ def list_class_rowtypes() :
24
26
for name , member in MetaElementTypes .__members__ .items ():
25
27
print (f"{ name } : { member .value } " )
26
28
29
@staticmethod
def get_contents_from_file_names(files: list) -> tuple[dict[str, MetaElementTypes], dict[str, MetaElementTypes]]:
    """Find the core content and extension contents from a list of file paths.

    A file is recognised when its stem (the file name without directory and
    extension), upper-cased, equals the name of a MetaElementTypes member.
    Files that match no member are ignored. The core content is the EVENT
    file(s) if present, otherwise the OCCURRENCE file(s); every other
    recognised file is treated as an extension.

    :param files: list of file paths or file names
    :return: tuple of (core files, extension files), each a dictionary of
             file name to MetaElementTypes. Both dictionaries are empty when
             neither an event nor an occurrence file is found, so the result
             can always be unpacked by the caller
    """
    # Look the member table up once instead of rebuilding a dict per file.
    known_types = MetaElementTypes.__members__

    contents = {}
    for file in files:
        member = known_types.get(Path(file).stem.upper())
        if member is not None:
            contents[file] = member

    # Event takes precedence over occurrence as the core type.
    for candidate in (MetaElementTypes.EVENT, MetaElementTypes.OCCURRENCE):
        core_file = {k: v for k, v in contents.items() if v == candidate}
        if core_file:
            ext_files = {k: v for k, v in contents.items() if v != candidate}
            return core_file, ext_files

    # No core found: return empty mappings rather than None so callers that
    # unpack the pair reach their own "core cannot be determined" handling.
    return {}, {}
61
+
27
62
"""Perform various DwCA operations"""
63
@staticmethod
def create_dwca_from_file_list(files: list, output_dwca: Union[str, BytesIO],
                               eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(),
                               content_keys: dict[MetaElementTypes, list] = None):
    """Create a suitable DwCA from a list of CSV files

    The core and extension files are worked out from the file names by
    DwcaHandler.get_contents_from_file_names.

    :param files: list of paths to the csv/txt files
    :param output_dwca: Where to place the resulting Dwca
    :param eml_content: eml content in string or Eml class
    :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied
    :param content_keys: optional dictionary of MetaElementTypes and key list
                         for eg. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
    :raises ValueError: if no core (event or occurrence) file can be identified
    """
    # The lookup can come back as None when nothing matches; normalise it
    # before unpacking so the ValueError below is raised instead of a TypeError.
    contents = DwcaHandler.get_contents_from_file_names(files)
    core_files, ext_files = contents if contents else ({}, {})
    if not core_files:
        raise ValueError("The core content cannot be determined. Please check filename in zip file")

    # Only the first core file is used, even if several share the core type.
    core_filename, core_type = next(iter(core_files.items()))
    core_content = CsvFileType(files=[core_filename], type=core_type, csv_encoding=csv_encoding,
                               keys=get_keys(type=core_type, override_content_keys=content_keys))
    ext_content = [CsvFileType(files=[ext_file], type=ext_type, csv_encoding=csv_encoding,
                               keys=get_keys(type=ext_type, override_content_keys=content_keys))
                   for ext_file, ext_type in ext_files.items()]

    DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
                            eml_content=eml_content)
93
+
94
@staticmethod
def create_dwca_from_zip_content(zip_file: str, output_dwca: Union[str, BytesIO],
                                 eml_content: Union[str, Eml] = '', csv_encoding: CSVEncoding = CSVEncoding(),
                                 content_keys: dict[MetaElementTypes, list] = None):
    """Create a suitable DwCA from the txt files inside a zip file

    The core and extension files are worked out from the member names of the
    zip file by DwcaHandler.get_contents_from_file_names. Each selected member
    is read as utf-8 text.

    :param zip_file: Zip file containing txt files
    :param output_dwca: Where to place the resulting Dwca
    :param eml_content: eml content in string or Eml class
    :param csv_encoding: delimiter for txt file. Default is comma delimiter txt files if not supplied
    :param content_keys: optional dictionary of class type and the key
                         for eg. {MetaElementTypes.OCCURRENCE: ["occurrenceID"]}
    :raises ValueError: if no core (event or occurrence) member can be identified
    """
    from contextlib import ExitStack

    # The ExitStack closes every member stream opened below once the archive
    # has been written (or on error), before the zip file itself is closed.
    with ZipFile(zip_file, 'r') as zf, ExitStack() as streams:
        # The lookup can come back as None when nothing matches; normalise it
        # before unpacking so the ValueError below is raised instead of a TypeError.
        contents = DwcaHandler.get_contents_from_file_names(zf.namelist())
        core_files, ext_files = contents if contents else ({}, {})
        if not core_files:
            raise ValueError("The core content cannot be determined. Please check filename in zip file")

        def open_member(member: str) -> io.TextIOWrapper:
            """Open a zip member as a utf-8 text stream owned by the ExitStack."""
            return streams.enter_context(io.TextIOWrapper(zf.open(member), encoding="utf-8"))

        # Only the first core member is used, even if several share the core type.
        core_filename, core_type = next(iter(core_files.items()))
        core_content = CsvFileType(files=open_member(core_filename),
                                   type=core_type, csv_encoding=csv_encoding,
                                   keys=get_keys(type=core_type,
                                                 override_content_keys=content_keys))
        ext_content = [CsvFileType(files=open_member(ext_file),
                                   type=ext_type, csv_encoding=csv_encoding,
                                   keys=get_keys(type=ext_type,
                                                 override_content_keys=content_keys))
                       for ext_file, ext_type in ext_files.items()]

        DwcaHandler.create_dwca(core_csv=core_content, ext_csv_list=ext_content, output_dwca=output_dwca,
                                eml_content=eml_content)
28
127
29
128
@staticmethod
30
129
def create_dwca (core_csv : CsvFileType ,
@@ -75,14 +174,15 @@ def merge_dwca(dwca_file: Union[str, BytesIO], delta_dwca_file: Union[str, Bytes
75
174
validate_delta = validate_delta_content )
76
175
77
176
@staticmethod
def validate_dwca(dwca_file: Union[str, BytesIO], content_keys: dict = None, error_file: str = None):
    """Check a DwCA for consistency

    :param dwca_file: The path to the DwCA
    :param content_keys: a dictionary of class type and the key that identifies a unique record
                         for eg. {MetaElementTypes.OCCURRENCE: "occurrenceID"}
    :param error_file: The file to write errors to. If None, errors are logged
    :return: the result of Dwca.validate_dwca for the given archive
    """
    archive = Dwca(dwca_file_loc=dwca_file)
    return archive.validate_dwca(content_keys, error_file)
86
186
87
187
@staticmethod
88
188
def validate_file (csv_file : CsvFileType , error_file : str = None ):
0 commit comments