@@ -207,6 +207,10 @@ def convert_values(v):
207
207
csv_file_name = meta_elm .meta_element_type .file_name
208
208
with io .TextIOWrapper (zf .open (csv_file_name ), encoding = "utf-8" ) as csv_file :
209
209
dwc_headers = [f .field_name for f in meta_elm .fields if f .index is not None ]
210
+ duplicates = [i for i in set (dwc_headers ) if dwc_headers .count (i ) > 1 ]
211
+ if len (duplicates ) > 0 :
212
+ raise ValueError (f"Duplicate columns { duplicates } specified in the "
213
+ f"metadata for { csv_file_name } " )
210
214
csv_encoding = {key : convert_values (value ) for key , value in
211
215
asdict (meta_elm .meta_element_type .csv_encoding ).items ()}
212
216
csv_content = self ._read_csv (
@@ -825,17 +829,18 @@ def check_duplicates(self, content_keys_df, keys, error_file=None):
825
829
"""
826
830
827
831
def report_error(content, keys, message, condition, error_file=None):
    """Write the rows of ``content`` selected by ``condition`` to ``error_file``.

    Only the ``keys`` columns of the selected rows are exported, as CSV
    without the index column. This helper does no logging itself.

    :param content: DataFrame holding the key columns
    :param keys: column label(s) to export
    :param message: description of the problem; not used here, kept so
                    existing call sites stay valid
    :param condition: boolean mask (an object exposing ``.values``) that
                      selects the offending rows
    :param error_file: path or buffer handed to ``to_csv``; when falsy
                       nothing is written
    """
    if not error_file:
        # to_csv(None) would render the CSV into a string and discard it.
        return
    content.loc[condition.values, keys].to_csv(error_file, index=False)
833
833
834
834
checks_status : bool = True
835
835
if len (keys ) > 0 :
836
836
empty_values_condition = content_keys_df .isnull ()
837
837
if empty_values_condition.values.any():
    # Collapse the per-cell null mask to one flag per row, so the count,
    # the listed index labels and the exported rows all describe the same
    # set of rows (any row with at least one empty key).
    rows_with_empty_keys = empty_values_condition.any(axis=1)
    log.error("Empty values found in %s. Total rows affected: %s", keys,
              rows_with_empty_keys.sum())
    log.error("Empty values found in dataframe row: %s",
              content_keys_df.index[rows_with_empty_keys].tolist())
    if error_file:
        # error_file must be forwarded; without it report_error has no
        # destination to write the offending rows to.
        report_error(content_keys_df, keys, "Empty Values",
                     rows_with_empty_keys, error_file)
    checks_status = False
840
845
841
846
# check case-insensitive duplicates
@@ -846,8 +851,11 @@ def to_lower(df):
846
851
df_keys = to_lower (content_keys_df )
847
852
duplicate_condition = df_keys .duplicated (keep = 'first' )
848
853
if duplicate_condition .values .any ():
849
- report_error (content_keys_df , keys , "Duplicate Values" ,
850
- duplicate_condition , error_file )
854
+ log .error (f"Duplicate %s found. Total rows affected: %s" , keys , duplicate_condition .sum ())
855
+ log .error ("Duplicate values: %s" , pd .unique (content_keys_df [duplicate_condition ].stack ()))
856
+ if error_file :
857
+ report_error (content_keys_df , keys , "Duplicate Values" ,
858
+ duplicate_condition , error_file )
851
859
checks_status = False
852
860
853
861
return checks_status
@@ -888,7 +896,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
888
896
889
897
- No duplicate record keys
890
898
- Valid columns
891
- - No duplicate columns
892
899
893
900
:param error_file: A file to record errors
894
901
:return: True if the DwCA is valid, False otherwise
@@ -907,10 +914,6 @@ def validate_content(self, content_type_to_validate: list[str] = None, error_fil
907
914
if not self ._validate_columns (content ):
908
915
return False
909
916
910
- dup_cols = self ._find_duplicate_columns (content )
911
- if len (dup_cols ) > 0 :
912
- return False
913
-
914
917
return True
915
918
916
919
def extract_csv_content (self , csv_info : CsvFileType ,
@@ -1062,9 +1065,16 @@ def _read_csv(self,
1062
1065
ret_val .dropna (how = "all" , inplace = True )
1063
1066
log .debug ("Extracted %d rows from csv %s" , len (ret_val ), csv_file )
1064
1067
1068
+ # Strip column header spaces
1069
+ ret_val .rename (str .strip , axis = 'columns' , inplace = True )
1070
+
1065
1071
return ret_val
1066
1072
1067
1073
except EmptyDataError :
1068
- log .error (f"The expected columns: %s are not present in the { csv_file } . "
1069
- f"The file may be empty" , ',' .join (columns ))
1074
if columns:
    # Expected columns are known: name them in the message. Arguments are
    # passed lazily so a '%' in the file name cannot break formatting.
    log.error("The expected columns: %s are not present in the %s. "
              "The file may be empty", ','.join(columns), csv_file)
else:
    # No column list to report (None or empty), so joining it is skipped.
    log.error("The file may be empty %s", csv_file)
1079
+
1070
1080
return pd .DataFrame ()
0 commit comments