Skip to content

Commit

Permalink
[FSTORE-1253] Adding support for all datatype during statistic computation (#1230)
Browse files Browse the repository at this point in the history
  • Loading branch information
manu-sj authored and javierdlrm committed Feb 20, 2024
1 parent 6907101 commit 5d69a09
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
7 changes: 4 additions & 3 deletions python/hsfs/core/feature_monitoring_result_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,9 +382,10 @@ def _run_and_save_statistics_comparison_reference_stats(
)

# sort by feature name
sorted_det_stats, sorted_ref_stats = sorted(
detection_statistics, key=lambda fds: fds.feature_name
), sorted(reference_statistics, key=lambda fds: fds.feature_name)
sorted_det_stats, sorted_ref_stats = (
sorted(detection_statistics, key=lambda fds: fds.feature_name),
sorted(reference_statistics, key=lambda fds: fds.feature_name),
)

fm_results = []
for det_fds, ref_fds in zip(sorted_det_stats, sorted_ref_stats):
Expand Down
28 changes: 20 additions & 8 deletions python/hsfs/engine/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,16 @@ def profile(
exact_uniqueness=True,
):
# TODO: add statistics for correlations, histograms and exact_uniqueness
arrow_schema = pa.Schema.from_pandas(df, preserve_index=False)

# parse timestamp columns to string columns
for col, dtype in df.dtypes.items():
if isinstance(dtype, type(np.dtype(np.datetime64))):
df[col] = df[col].astype(str)
for field in arrow_schema:
if not (
pa.types.is_list(field.type)
or pa.types.is_large_list(field.type)
or pa.types.is_struct(field.type)
) and PYARROW_HOPSWORKS_DTYPE_MAPPING[field.type] in ["timestamp", "date"]:
df[field.name] = df[field.name].astype(str)

if not relevant_columns:
stats = df.describe().to_dict()
Expand All @@ -427,14 +432,21 @@ def profile(
stat["completeness"] = 1

# set data type
if isinstance(df.dtypes[col], type(np.dtype(np.float64))):
arrow_type = arrow_schema.field(col).type
if (
pa.types.is_list(arrow_type)
or pa.types.is_large_list(arrow_type)
or pa.types.is_struct(arrow_type)
or PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type]
in ["timestamp", "date", "binary", "string"]
):
stat["dataType"] = "String"
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["float", "double"]:
stat["dataType"] = "Fractional"
elif isinstance(df.dtypes[col], type(np.dtype(np.int64))):
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] in ["int", "bigint"]:
stat["dataType"] = "Integral"
elif isinstance(df.dtypes[col], type(np.dtype(np.bool_))):
elif PYARROW_HOPSWORKS_DTYPE_MAPPING[arrow_type] == "boolean":
stat["dataType"] = "Boolean"
elif isinstance(df.dtypes[col], type(np.dtype(object))):
stat["dataType"] = "String"
else:
print(
"Data type could not be inferred for column '"
Expand Down

0 comments on commit 5d69a09

Please sign in to comment.