Skip to content

Commit 5d69a09

Browse files
manu-sj authored and javierdlrm committed
[FSTORE-1253] Adding support for all datatype during statistic computation (#1230)
1 parent 6907101 commit 5d69a09

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

python/hsfs/core/feature_monitoring_result_engine.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -382,9 +382,10 @@ def _run_and_save_statistics_comparison_reference_stats(
382382
)
383383

384384
# sort by feature name
385-
sorted_det_stats, sorted_ref_stats = sorted(
386-
detection_statistics, key=lambda fds: fds.feature_name
387-
), sorted(reference_statistics, key=lambda fds: fds.feature_name)
385+
sorted_det_stats, sorted_ref_stats = (
386+
sorted(detection_statistics, key=lambda fds: fds.feature_name),
387+
sorted(reference_statistics, key=lambda fds: fds.feature_name),
388+
)
388389

389390
fm_results = []
390391
for det_fds, ref_fds in zip(sorted_det_stats, sorted_ref_stats):

python/hsfs/engine/python.py

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -400,11 +400,16 @@ def profile(
400400
exact_uniqueness=True,
401401
):
402402
# TODO: add statistics for correlations, histograms and exact_uniqueness
403+
arrow_schema = pa.Schema.from_pandas(df, preserve_index=False)
403404

404405
# parse timestamp columns to string columns
405-
for col, dtype in df.dtypes.items():
406-
if isinstance(dtype, type(np.dtype(np.datetime64))):
407-
df[col] = df[col].astype(str)
406+
for field in arrow_schema:
407+
if not (
408+
pa.types.is_list(field.type)
409+
or pa.types.is_large_list(field.type)
410+
or pa.types.is_struct(field.type)
411+
) and PYARROW_HOPSWORKS_DTYPE_MAPPING[field.type] in ["timestamp", "date"]:
412+
df[field.name] = df[field.name].astype(str)
408413

409414
if not relevant_columns:
410415
stats = df.describe().to_dict()
@@ -427,14 +432,21 @@ def profile(
427432
stat["completeness"] = 1
428433

429434
# set data type
430-
if isinstance(df.dtypes[col], type(np.dtype(np.float64))):
435+
arrow_type = arrow_schema.field(col).type
436+
if (
437+
pa.types.is_list(arrow_type)
438+
or pa.types.is_large_list(arrow_type)
439+
or pa.types.is_struct(arrow_type)
440+
or PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type]
441+
in ["timestamp", "date", "binary", "string"]
442+
):
443+
stat["dataType"] = "String"
444+
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["float", "double"]:
431445
stat["dataType"] = "Fractional"
432-
elif isinstance(df.dtypes[col], type(np.dtype(np.int64))):
446+
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["int", "bigint"]:
433447
stat["dataType"] = "Integral"
434-
elif isinstance(df.dtypes[col], type(np.dtype(np.bool_))):
448+
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] == "boolean":
435449
stat["dataType"] = "Boolean"
436-
elif isinstance(df.dtypes[col], type(np.dtype(object))):
437-
stat["dataType"] = "String"
438450
else:
439451
print(
440452
"Data type could not be inferred for column '"

0 commit comments

Comments
 (0)