[FSTORE-1253] Adding support for all data types during statistics computation (#1230)

manu-sj · javierdlrm · commit 5d69a09989ef · 2024-02-20T17:54:53.000+01:00
diff --git a/python/hsfs/core/feature_monitoring_result_engine.py b/python/hsfs/core/feature_monitoring_result_engine.py
@@ -382,9 +382,10 @@ def _run_and_save_statistics_comparison_reference_stats(
             )
 
         # sort by feature name
-        sorted_det_stats, sorted_ref_stats = sorted(
-            detection_statistics, key=lambda fds: fds.feature_name
-        ), sorted(reference_statistics, key=lambda fds: fds.feature_name)
+        sorted_det_stats, sorted_ref_stats = (
+            sorted(detection_statistics, key=lambda fds: fds.feature_name),
+            sorted(reference_statistics, key=lambda fds: fds.feature_name),
+        )
 
         fm_results = []
         for det_fds, ref_fds in zip(sorted_det_stats, sorted_ref_stats):
diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py
@@ -400,11 +400,16 @@ def profile(
         exact_uniqueness=True,
     ):
         # TODO: add statistics for correlations, histograms and exact_uniqueness
+        arrow_schema = pa.Schema.from_pandas(df, preserve_index=False)
 
         # parse timestamp columns to string columns
-        for col, dtype in df.dtypes.items():
-            if isinstance(dtype, type(np.dtype(np.datetime64))):
-                df[col] = df[col].astype(str)
+        for field in arrow_schema:
+            if not (
+                pa.types.is_list(field.type)
+                or pa.types.is_large_list(field.type)
+                or pa.types.is_struct(field.type)
+            ) and PYARROW_HOPSWORKS_DTYPE_MAPPING[field.type] in ["timestamp", "date"]:
+                df[field.name] = df[field.name].astype(str)
 
         if not relevant_columns:
             stats = df.describe().to_dict()
@@ -427,14 +432,21 @@ def profile(
             stat["completeness"] = 1
 
             # set data type
-            if isinstance(df.dtypes[col], type(np.dtype(np.float64))):
+            arrow_type = arrow_schema.field(col).type
+            if (
+                pa.types.is_list(arrow_type)
+                or pa.types.is_large_list(arrow_type)
+                or pa.types.is_struct(arrow_type)
+                or PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type]
+                in ["timestamp", "date", "binary", "string"]
+            ):
+                stat["dataType"] = "String"
+            elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["float", "double"]:
                 stat["dataType"] = "Fractional"
-            elif isinstance(df.dtypes[col], type(np.dtype(np.int64))):
+            elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["int", "bigint"]:
                 stat["dataType"] = "Integral"
-            elif isinstance(df.dtypes[col], type(np.dtype(np.bool_))):
+            elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] == "boolean":
                 stat["dataType"] = "Boolean"
-            elif isinstance(df.dtypes[col], type(np.dtype(object))):
-                stat["dataType"] = "String"
             else:
                 print(
                     "Data type could not be inferred for column '"