1
1
import os
2
- from pathlib import Path
3
2
from dataclasses import dataclass , field
4
3
import re
5
4
import pandas as pd
12
11
13
12
this_dir , this_filename = os .path .split (__file__ )
14
13
15
- log .basicConfig (format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' ,
16
- level = log .DEBUG )
14
+ log .basicConfig (format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' , level = log .DEBUG )
17
15
log = log .getLogger ("DwcaTerms" )
18
16
17
+
19
18
def absolute_file_paths (directory ):
20
19
"""Convert files in a directory into absolute paths and return
21
20
as a generator
@@ -59,10 +58,11 @@ class GbifRegisteredExt(ExtInfo, Enum):
59
58
EXTENDED_MEASUREMENT_OR_FACT = ExtInfo (uri = "http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact" ,
60
59
prefix = NsPrefix .OBIS ,
61
60
namespace = "http://rs.iobis.org/obis/terms/" )
62
- #AC_MULTIMEDIA = ExtInfo(uri="http://rs.tdwg.org/ac/terms/Multimedia",
61
+ # AC_MULTIMEDIA = ExtInfo(uri="http://rs.tdwg.org/ac/terms/Multimedia",
63
62
# prefix=NsPrefix.AC,
64
63
# namespace="http://rs.tdwg.org/ac/terms/")
65
64
65
+
66
66
@dataclass
67
67
class Terms :
68
68
"""
@@ -119,7 +119,7 @@ def _update_df(self, ns: NsPrefix, updates: pd.DataFrame, df: pd.DataFrame):
119
119
:param updates: dataframe containing the class rows or terms to update
120
120
:param df: dataframe to update
121
121
"""
122
- def __get_update_info (update_df : pd .DataFrame ):
122
+ def __get_update_info (update_df : pd .DataFrame ):
123
123
update_type : str = "term"
124
124
count = len (update_df )
125
125
if 'class' in update_df .columns .tolist ():
@@ -137,7 +137,6 @@ def __get_update_info (update_df: pd.DataFrame):
137
137
def get_dwc_source_data() -> pd.DataFrame:
    """
    Load the CSV located at ``Terms.DWC_SOURCE_URL`` into a dataframe.

    The file is parsed as comma-delimited UTF-8 text and every column is
    read as ``str``.

    :return: dataframe holding the parsed CSV content
    """
    source_df = pd.read_csv(
        Terms.DWC_SOURCE_URL,
        delimiter=",",
        encoding='utf-8',
        dtype='str',
    )
    return source_df
139
139
140
- #@staticmethod
141
140
def update_dwc_terms (self ):
142
141
"""
143
142
Pull the latest terms from the GBIF DwC CSV URL and update the Darwin Core vocabulary terms in the package
@@ -195,7 +194,6 @@ def get_class_row_types():
195
194
class_list = list (tuple (zip (class_df ["class" ], class_df ["class_uri" ])))
196
195
return class_list
197
196
198
- #@staticmethod
199
197
def update_gbif_ext (self ):
200
198
"""
201
199
Update the class row type and terms specified by GBIF_REGISTERED_EXTENSION and update by prefix
@@ -239,7 +237,7 @@ def _extract_value(text: str):
239
237
240
238
df = pd .DataFrame (term_info , columns = ["term" , "namespace" , 'uri' ])
241
239
std_ns = ["http://rs.tdwg.org/dwc/terms/" , "http://purl.org/dc/terms/" ]
242
- existing_terms = self .terms_df #Terms().terms_df
240
+ existing_terms = self .terms_df
243
241
extra_terms_df = df [(df ["namespace" ].isin (std_ns )) & (~ df ["uri" ].isin (existing_terms ["uri" ]))]
244
242
if len (extra_terms_df ) > 0 :
245
243
log .info ("Additional standard terms found:\n %s" , extra_terms_df )
@@ -266,7 +264,6 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame
266
264
ext_df = df_to_sort [~ std_filter_df ].copy ()
267
265
return pd .concat ([std_df , ext_df ], ignore_index = True )
268
266
269
-
270
267
log .info ("Current class and terms" )
271
268
272
269
exclude_update_prefixes = [NsPrefix .DC .value ]
@@ -281,8 +278,8 @@ def __sort_values(df_to_sort: pd.DataFrame, sorting_column: str) -> pd.DataFrame
281
278
terms .terms_df = terms .terms_df [terms .terms_df .prefix .isin (exclude_update_prefixes )]
282
279
terms .update_dwc_terms ()
283
280
terms .update_gbif_ext ()
284
- terms .class_df = __sort_values (terms .class_df , "class" )
285
- terms .terms_df = __sort_values (terms .terms_df , "term" )
281
+ terms .class_df = __sort_values (terms .class_df , "class" )
282
+ terms .terms_df = __sort_values (terms .terms_df , "term" )
286
283
terms .class_df .to_csv (Terms .CLASS_ROW_TYPE_PATH , index = False )
287
284
terms .terms_df .to_csv (Terms .TERMS_FILE_PATH , index = False )
288
285
0 commit comments