Skip to content

Commit 16a688a

Browse files
authored
[HWORKS-934] Use more precise statistics (#1198)
* [HWORKS-934] Use more precise statistics
* Add file=sys.stderr to print
1 parent: cd79ac6 · commit: 16a688a

File tree

2 files changed

+43
-19
lines changed

2 files changed

+43
-19
lines changed

python/hsfs/engine/python.py

+28 -7
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import numbers
3232
import math
3333
import os
34+
import sys
3435
import pytz
3536
from datetime import datetime, timezone
3637

@@ -399,6 +400,12 @@ def profile(
399400
exact_uniqueness=True,
400401
):
401402
# TODO: add statistics for correlations, histograms and exact_uniqueness
403+
404+
# parse timestamp columns to string columns
405+
for col, dtype in df.dtypes.items():
406+
if isinstance(dtype, type(np.dtype(np.datetime64))):
407+
df[col] = df[col].astype(str)
408+
402409
if not relevant_columns:
403410
stats = df.describe().to_dict()
404411
relevant_columns = df.columns
@@ -414,20 +421,34 @@ def profile(
414421

415422
final_stats = []
416423
for col in stats.keys():
417-
stats_dict = stats[col]
418-
stat = self._convert_pandas_statistics(stats_dict)
424+
stat = self._convert_pandas_statistics(stats[col])
425+
stat["isDataTypeInferred"] = "false"
426+
stat["column"] = col.split(".")[-1]
427+
stat["completeness"] = 1
428+
429+
# set data type
419430
if isinstance(df.dtypes[col], type(np.dtype(np.float64))):
420431
stat["dataType"] = "Fractional"
432+
elif isinstance(df.dtypes[col], type(np.dtype(np.int64))):
433+
stat["dataType"] = "Integral"
434+
elif isinstance(df.dtypes[col], type(np.dtype(np.bool_))):
435+
stat["dataType"] = "Boolean"
421436
elif isinstance(df.dtypes[col], type(np.dtype(object))):
422437
stat["dataType"] = "String"
423438
else:
424-
stat["dataType"] = "Integral"
425-
stat["isDataTypeInferred"] = "false"
426-
stat["column"] = col.split(".")[-1]
427-
stat["completeness"] = 1
439+
print(
440+
"Data type could not be inferred for column '"
441+
+ stat["column"]
442+
+ "'. Defaulting to 'String'",
443+
file=sys.stderr,
444+
)
445+
stat["dataType"] = "String"
446+
428447
final_stats.append(stat)
429448

430-
return json.dumps({"columns": final_stats})
449+
return json.dumps(
450+
{"columns": final_stats},
451+
)
431452

432453
def _convert_pandas_statistics(self, stat):
433454
# For now transformation only need 25th, 50th, 75th percentiles

python/tests/engine/test_python.py

+15 -12
Original file line numberDiff line numberDiff line change
@@ -801,12 +801,13 @@ def test_profile(self, mocker):
801801

802802
# Assert
803803
assert (
804-
result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
805-
'"isDataTypeInferred": "false", "column": "col1", "completeness": 1}, '
806-
'{"test_key": "test_value", "dataType": "Fractional", "isDataTypeInferred": '
807-
'"false", "column": "col2", "completeness": 1}, '
808-
'{"test_key": "test_value", "dataType": "String", "isDataTypeInferred": '
809-
'"false", "column": "col3", "completeness": 1}]}'
804+
result
805+
== '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
806+
'"column": "col1", "completeness": 1, "dataType": "Integral"}, '
807+
'{"test_key": "test_value", "isDataTypeInferred": "false", '
808+
'"column": "col2", "completeness": 1, "dataType": "Fractional"}, '
809+
'{"test_key": "test_value", "isDataTypeInferred": "false", '
810+
'"column": "col3", "completeness": 1, "dataType": "String"}]}'
810811
)
811812
assert mock_python_engine_convert_pandas_statistics.call_count == 3
812813

@@ -836,8 +837,9 @@ def test_profile_relevant_columns(self, mocker):
836837

837838
# Assert
838839
assert (
839-
result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
840-
'"isDataTypeInferred": "false", "column": "col1", "completeness": 1}]}'
840+
result
841+
== '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
842+
'"column": "col1", "completeness": 1, "dataType": "Integral"}]}'
841843
)
842844
assert mock_python_engine_convert_pandas_statistics.call_count == 1
843845

@@ -868,10 +870,11 @@ def test_profile_relevant_columns_diff_dtypes(self, mocker):
868870

869871
# Assert
870872
assert (
871-
result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
872-
'"isDataTypeInferred": "false", "column": "col1", "completeness": 1}, '
873-
'{"test_key": "test_value", "dataType": "String", "isDataTypeInferred": '
874-
'"false", "column": "col3", "completeness": 1}]}'
873+
result
874+
== '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
875+
'"column": "col1", "completeness": 1, "dataType": "Integral"}, '
876+
'{"test_key": "test_value", "isDataTypeInferred": "false", '
877+
'"column": "col3", "completeness": 1, "dataType": "String"}]}'
875878
)
876879
assert mock_python_engine_convert_pandas_statistics.call_count == 2
877880

0 commit comments

Comments
 (0)