Skip to content

Commit c86d9ef

Browse files
Handle ints represented as doubles in describe_dict with extra stats (#1206)
* handle ints as doubles in describe extra stats
* update release notes
* remove redundant check
* lint fix
* update comment
* handle all nan numeric columns with extra stats
* update test to also use integer_nullable
* rename column
* lint fix
1 parent 7c6f6b0 commit c86d9ef

File tree

3 files changed

+43
-14
lines changed

3 files changed

+43
-14
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Future Release
66
==============
77
* Enhancements
88
* Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (:pr:`1202`)
9+
* Update ``describe_dict`` to compute ``top_values`` for double columns that contain only integer values (:pr:`1206`)
910
* Fixes
1011
* Changes
1112
* Return histogram bins as a list of floats instead of a ``pandas.Interval`` object (:pr:`1207`)

woodwork/statistics_utils.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,11 @@
22

33
import numpy as np
44
import pandas as pd
5+
from pandas.core.dtypes.common import is_integer_dtype
56
from sklearn.metrics.cluster import normalized_mutual_info_score
67

78
from woodwork.accessor_utils import _is_dask_dataframe, _is_koalas_dataframe
8-
from woodwork.logical_types import (
9-
Datetime,
10-
Double,
11-
Integer,
12-
IntegerNullable,
13-
LatLong,
14-
Timedelta,
15-
)
9+
from woodwork.logical_types import Datetime, Double, LatLong, Timedelta
1610
from woodwork.utils import _update_progress, get_valid_mi_types, import_or_none
1711

1812
dd = import_or_none("dask.dataframe")
@@ -153,12 +147,16 @@ def _get_describe_dict(
153147
# Calculate extra detailed stats, if requested
154148
if extra_stats:
155149
if column.is_numeric:
156-
values["histogram"] = _get_histogram_values(series, bins=bins)
157-
if isinstance(column.logical_type, (Integer, IntegerNullable)):
150+
if pd.isnull(values["max"]) or pd.isnull(values["min"]):
151+
values["histogram"] = []
152+
values["top_values"] = []
153+
else:
154+
values["histogram"] = _get_histogram_values(series, bins=bins)
158155
_range = range(int(values["min"]), int(values["max"]) + 1)
159156
# Calculate top numeric values if range of values present
160-
# is less than or equal number of histogram bins
161-
if len(_range) <= bins:
157+
# is less than or equal number of histogram bins and series
158+
# contains only integer values
159+
if len(_range) <= bins and (series % 1 == 0).all():
162160
values["top_values"] = _get_numeric_value_counts_in_range(
163161
series, _range
164162
)
@@ -578,7 +576,11 @@ def _get_numeric_value_counts_in_range(series, _range):
578576
"""
579577
frequencies = series.value_counts(dropna=True)
580578
value_counts = [
581-
{"value": i, "count": frequencies[i] if i in frequencies else 0} for i in _range
579+
{
580+
"value": i if is_integer_dtype(series) else float(i),
581+
"count": frequencies[i] if i in frequencies else 0,
582+
}
583+
for i in _range
582584
]
583585
return sorted(value_counts, key=lambda i: (-i["count"], i["value"]))
584586

woodwork/tests/accessor/test_statistics.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,22 @@ def test_describe_with_include(sample_df):
763763
multi_params_df["full_name"].equals(sample_df.ww.describe()["full_name"])
764764

765765

766+
def test_describe_numeric_all_nans():
767+
df = pd.DataFrame({"nulls": [np.nan] * 5})
768+
logical_types = ["double", "integer_nullable"]
769+
770+
for logical_type in logical_types:
771+
df.ww.init(logical_types={"nulls": logical_type})
772+
stats = df.ww.describe_dict(extra_stats=True)
773+
assert pd.isnull(stats["nulls"]["max"])
774+
assert pd.isnull(stats["nulls"]["min"])
775+
assert pd.isnull(stats["nulls"]["mean"])
776+
assert pd.isnull(stats["nulls"]["std"])
777+
assert stats["nulls"]["nan_count"] == 5
778+
assert stats["nulls"]["histogram"] == []
779+
assert stats["nulls"]["top_values"] == []
780+
781+
766782
def test_pandas_nullable_integer_quantile_fix():
767783
"""Should fail when https://github.com/pandas-dev/pandas/issues/42626 gets fixed"""
768784
if pd.__version__ not in ["1.3.0", "1.3.1"]: # pragma: no cover
@@ -833,6 +849,12 @@ def test_describe_dict_extra_stats(describe_df):
833849
describe_df["nullable_integer_col"] = describe_df["numeric_col"]
834850
describe_df["integer_col"] = describe_df["numeric_col"].fillna(0)
835851
describe_df["small_range_col"] = describe_df["numeric_col"].fillna(0) // 10
852+
describe_df["small_range_col_ints_as_double"] = (
853+
describe_df["numeric_col"].fillna(0) // 10.0
854+
)
855+
describe_df["small_range_col_double_not_valid"] = (
856+
describe_df["numeric_col"].fillna(0) / 10
857+
)
836858

837859
ltypes = {
838860
"category_col": "Categorical",
@@ -841,6 +863,8 @@ def test_describe_dict_extra_stats(describe_df):
841863
"nullable_integer_col": "IntegerNullable",
842864
"integer_col": "Integer",
843865
"small_range_col": "Integer",
866+
"small_range_col_ints_as_double": "Double",
867+
"small_range_col_double_not_valid": "Double",
844868
}
845869
describe_df.ww.init(index="index_col", logical_types=ltypes)
846870
desc_dict = describe_df.ww.describe_dict(extra_stats=True)
@@ -861,10 +885,12 @@ def test_describe_dict_extra_stats(describe_df):
861885
"nullable_integer_col",
862886
"integer_col",
863887
"small_range_col",
888+
"small_range_col_ints_as_double",
889+
"small_range_col_double_not_valid",
864890
]:
865891
assert isinstance(desc_dict[col]["histogram"], list)
866892
assert desc_dict[col].get("recent_values") is None
867-
if col == "small_range_col":
893+
if col in {"small_range_col", "small_range_col_ints_as_double"}:
868894
# If values are in a narrow range, top values should be present
869895
assert isinstance(desc_dict[col]["top_values"], list)
870896
else:

0 commit comments

Comments
 (0)