@@ -54,7 +54,7 @@ def validate_schema(self, feature_group, df, df_features):
54
54
)
55
55
# Execute data type specific validation
56
56
errors , column_lengths , is_pk_null , is_string_length_exceeded = (
57
- self ._validate_df_specifics (feature_group , df , bool ( feature_group . id ) )
57
+ self ._validate_df_specifics (feature_group , df )
58
58
)
59
59
60
60
# Handle errors
@@ -68,7 +68,7 @@ def validate_schema(self, feature_group, df, df_features):
68
68
69
69
return df_features
70
70
71
- def _validate_df_specifics (self , feature_group , df , is_fg_created ):
71
+ def _validate_df_specifics (self , feature_group , df ):
72
72
"""To be implemented by subclasses"""
73
73
raise NotImplementedError ("Subclasses must implement this method" )
74
74
@@ -77,7 +77,8 @@ def get_feature_from_list(feature_name, features):
77
77
for i_feature in features :
78
78
if i_feature .name == feature_name :
79
79
return i_feature
80
- raise ValueError (f"Feature { feature_name } not found in feature list" )
80
+
81
+ return None
81
82
82
83
@staticmethod
83
84
def extract_numbers (input_string ):
@@ -87,13 +88,14 @@ def extract_numbers(input_string):
87
88
return re .findall (pattern , input_string )
88
89
89
90
def get_online_varchar_length (self , feature ):
90
- # returns the column length of varchar columns
91
- if not feature .type == "string" :
92
- raise ValueError ("Feature not a string type" )
93
- if not feature .online_type :
94
- raise ValueError ("Feature is not online enabled" )
95
-
96
- return int (self .extract_numbers (feature .online_type )[0 ])
91
+ # check if online_type is not null and starts with varchar
92
+ if (
93
+ feature
94
+ and feature .online_type
95
+ and feature .online_type .startswith ("varchar" )
96
+ ):
97
+ return int (self .extract_numbers (feature .online_type )[0 ])
98
+ return None
97
99
98
100
@staticmethod
99
101
def increase_string_columns (column_lengths : dict , dataframe_features ):
@@ -109,7 +111,7 @@ def increase_string_columns(column_lengths: dict, dataframe_features):
109
111
110
112
class PandasValidator (DataFrameValidator ):
111
113
# Pandas df specific validator
112
- def _validate_df_specifics (self , feature_group , df , is_fg_created ):
114
+ def _validate_df_specifics (self , feature_group , df ):
113
115
errors = {}
114
116
column_lengths = {}
115
117
is_pk_null = False
@@ -118,7 +120,7 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
118
120
# Check for null values in primary key columns
119
121
for pk in feature_group .primary_key :
120
122
if df [pk ].isnull ().any ():
121
- errors [pk ] = f"Primary key column { pk } contains null values"
123
+ errors [pk ] = f"Primary key column { pk } contains null values. "
122
124
is_pk_null = True
123
125
124
126
# Check string lengths
@@ -128,13 +130,13 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
128
130
self .get_online_varchar_length (
129
131
self .get_feature_from_list (col , feature_group .features )
130
132
)
131
- if is_fg_created
133
+ if feature_group . features
132
134
else 100
133
135
)
134
136
135
- if currentmax > col_max_len :
137
+ if col_max_len is not None and currentmax > col_max_len :
136
138
errors [col ] = (
137
- f"Column { col } has string values longer than { col_max_len } characters"
139
+ f"String length exceeded. Column { col } has string values longer than maximum column limit of { col_max_len } characters. "
138
140
)
139
141
column_lengths [col ] = currentmax
140
142
is_string_length_exceeded = True
@@ -144,7 +146,7 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
144
146
145
147
class PolarsValidator (DataFrameValidator ):
146
148
# Polars df specific validator
147
- def _validate_df_specifics (self , feature_group , df , is_fg_created ):
149
+ def _validate_df_specifics (self , feature_group , df ):
148
150
import polars as pl
149
151
150
152
errors = {}
@@ -155,7 +157,7 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
155
157
# Check for null values in primary key columns
156
158
for pk in feature_group .primary_key :
157
159
if df [pk ].is_null ().any ():
158
- errors [pk ] = f"Primary key column { pk } contains null values"
160
+ errors [pk ] = f"Primary key column { pk } contains null values. "
159
161
is_pk_null = True
160
162
161
163
# Check string lengths
@@ -165,13 +167,13 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
165
167
self .get_online_varchar_length (
166
168
self .get_feature_from_list (col , feature_group .features )
167
169
)
168
- if is_fg_created
170
+ if feature_group . features
169
171
else 100
170
172
)
171
173
172
- if currentmax > col_max_len :
174
+ if col_max_len is not None and currentmax > col_max_len :
173
175
errors [col ] = (
174
- f"Column { col } has string values longer than { col_max_len } characters"
176
+ f"String length exceeded. Column { col } has string values longer than maximum column limit of { col_max_len } characters. "
175
177
)
176
178
column_lengths [col ] = currentmax
177
179
is_string_length_exceeded = True
@@ -181,7 +183,7 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
181
183
182
184
class PySparkValidator (DataFrameValidator ):
183
185
# PySpark-specific validator
184
- def _validate_df_specifics (self , feature_group , df , is_fg_created ):
186
+ def _validate_df_specifics (self , feature_group , df ):
185
187
# Import PySpark SQL functions and types
186
188
import pyspark .sql .functions as sf
187
189
from pyspark .sql .types import StringType
@@ -194,7 +196,7 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
194
196
# Check for null values in primary key columns
195
197
for pk in feature_group .primary_key :
196
198
if df .filter (df [pk ].isNull ()).count () > 0 :
197
- errors [pk ] = f"Primary key column { pk } contains null values"
199
+ errors [pk ] = f"Primary key column { pk } contains null values. "
198
200
is_pk_null = True
199
201
200
202
# Check string lengths for string columns
@@ -209,13 +211,13 @@ def _validate_df_specifics(self, feature_group, df, is_fg_created):
209
211
self .get_online_varchar_length (
210
212
self .get_feature_from_list (col , feature_group .features )
211
213
)
212
- if is_fg_created
214
+ if feature_group . features
213
215
else 100
214
216
)
215
217
216
- if currentmax > col_max_len :
218
+ if col_max_len is not None and currentmax > col_max_len :
217
219
errors [col ] = (
218
- f"Column { col } has string values longer than { col_max_len } characters"
220
+ f"String length exceeded. Column { col } has string values longer than maximum column limit of { col_max_len } characters. "
219
221
)
220
222
column_lengths [col ] = currentmax
221
223
is_string_length_exceeded = True
0 commit comments