[HWORKS-934] Use more precise statistics (#1198)

javierdlrm · web-flow · commit 16a688a29988 · 2024-01-23T11:16:35.000+01:00
* [HWORKS-934] Use more precise statistics

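* Add file=sys.stderr to the print call that warns when a column's data type cannot be inferred, so the warning goes to stderr instead of stdout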
diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py
@@ -31,6 +31,7 @@
 import numbers
 import math
 import os
+import sys
 import pytz
 from datetime import datetime, timezone
 
@@ -399,6 +400,12 @@ def profile(
         exact_uniqueness=True,
     ):
         # TODO: add statistics for correlations, histograms and exact_uniqueness
+
+        # parse timestamp columns to string columns
+        for col, dtype in df.dtypes.items():
+            if isinstance(dtype, type(np.dtype(np.datetime64))):
+                df[col] = df[col].astype(str)
+
         if not relevant_columns:
             stats = df.describe().to_dict()
             relevant_columns = df.columns
@@ -414,20 +421,34 @@ def profile(
 
         final_stats = []
         for col in stats.keys():
-            stats_dict = stats[col]
-            stat = self._convert_pandas_statistics(stats_dict)
+            stat = self._convert_pandas_statistics(stats[col])
+            stat["isDataTypeInferred"] = "false"
+            stat["column"] = col.split(".")[-1]
+            stat["completeness"] = 1
+
+            # set data type
             if isinstance(df.dtypes[col], type(np.dtype(np.float64))):
                 stat["dataType"] = "Fractional"
+            elif isinstance(df.dtypes[col], type(np.dtype(np.int64))):
+                stat["dataType"] = "Integral"
+            elif isinstance(df.dtypes[col], type(np.dtype(np.bool_))):
+                stat["dataType"] = "Boolean"
             elif isinstance(df.dtypes[col], type(np.dtype(object))):
                 stat["dataType"] = "String"
             else:
-                stat["dataType"] = "Integral"
-            stat["isDataTypeInferred"] = "false"
-            stat["column"] = col.split(".")[-1]
-            stat["completeness"] = 1
+                print(
+                    "Data type could not be inferred for column '"
+                    + stat["column"]
+                    + "'. Defaulting to 'String'",
+                    file=sys.stderr,
+                )
+                stat["dataType"] = "String"
+
             final_stats.append(stat)
 
-        return json.dumps({"columns": final_stats})
+        return json.dumps(
+            {"columns": final_stats},
+        )
 
     def _convert_pandas_statistics(self, stat):
         # For now transformation only need 25th, 50th, 75th percentiles
diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py
@@ -801,12 +801,13 @@ def test_profile(self, mocker):
 
         # Assert
         assert (
-            result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
-            '"isDataTypeInferred": "false", "column": "col1", "completeness": 1}, '
-            '{"test_key": "test_value", "dataType": "Fractional", "isDataTypeInferred": '
-            '"false", "column": "col2", "completeness": 1}, '
-            '{"test_key": "test_value", "dataType": "String", "isDataTypeInferred": '
-            '"false", "column": "col3", "completeness": 1}]}'
+            result
+            == '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col1", "completeness": 1, "dataType": "Integral"}, '
+            '{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col2", "completeness": 1, "dataType": "Fractional"}, '
+            '{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col3", "completeness": 1, "dataType": "String"}]}'
         )
         assert mock_python_engine_convert_pandas_statistics.call_count == 3
 
@@ -836,8 +837,9 @@ def test_profile_relevant_columns(self, mocker):
 
         # Assert
         assert (
-            result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
-            '"isDataTypeInferred": "false", "column": "col1", "completeness": 1}]}'
+            result
+            == '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col1", "completeness": 1, "dataType": "Integral"}]}'
         )
         assert mock_python_engine_convert_pandas_statistics.call_count == 1
 
@@ -868,10 +870,11 @@ def test_profile_relevant_columns_diff_dtypes(self, mocker):
 
         # Assert
         assert (
-            result == '{"columns": [{"test_key": "test_value", "dataType": "Integral", '
-            '"isDataTypeInferred": "false", "column": "col1", "completeness": 1}, '
-            '{"test_key": "test_value", "dataType": "String", "isDataTypeInferred": '
-            '"false", "column": "col3", "completeness": 1}]}'
+            result
+            == '{"columns": [{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col1", "completeness": 1, "dataType": "Integral"}, '
+            '{"test_key": "test_value", "isDataTypeInferred": "false", '
+            '"column": "col3", "completeness": 1, "dataType": "String"}]}'
         )
         assert mock_python_engine_convert_pandas_statistics.call_count == 2