Handle ints represented as doubles in describe_dict with extra stats (#1206)

thehomebrewnerd · web-flow · commit c86d9ef4838e · 2021-11-30T14:33:21.000-06:00
* handle ints as doubles in describe extra stats

* update release notes

* remove redundant check

* lint fix

* update comment

* handle all nan numeric columns with extra stats

* update test to also use integer_nullable

* rename column

* lint fix
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Future Release
 ==============
     * Enhancements
         * Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (:pr:`1202`) 
+        * Update ``describe_dict`` to compute ``top_values`` for double columns that contain only integer values (:pr:`1206`)
     * Fixes
     * Changes
         * Return histogram bins as a list of floats instead of a ``pandas.Interval`` object (:pr:`1207`)
diff --git a/woodwork/statistics_utils.py b/woodwork/statistics_utils.py
@@ -2,17 +2,11 @@
 
 import numpy as np
 import pandas as pd
+from pandas.core.dtypes.common import is_integer_dtype
 from sklearn.metrics.cluster import normalized_mutual_info_score
 
 from woodwork.accessor_utils import _is_dask_dataframe, _is_koalas_dataframe
-from woodwork.logical_types import (
-    Datetime,
-    Double,
-    Integer,
-    IntegerNullable,
-    LatLong,
-    Timedelta,
-)
+from woodwork.logical_types import Datetime, Double, LatLong, Timedelta
 from woodwork.utils import _update_progress, get_valid_mi_types, import_or_none
 
 dd = import_or_none("dask.dataframe")
@@ -153,12 +147,16 @@ def _get_describe_dict(
         # Calculate extra detailed stats, if requested
         if extra_stats:
             if column.is_numeric:
-                values["histogram"] = _get_histogram_values(series, bins=bins)
-                if isinstance(column.logical_type, (Integer, IntegerNullable)):
+                if pd.isnull(values["max"]) or pd.isnull(values["min"]):
+                    values["histogram"] = []
+                    values["top_values"] = []
+                else:
+                    values["histogram"] = _get_histogram_values(series, bins=bins)
                     _range = range(int(values["min"]), int(values["max"]) + 1)
                     # Calculate top numeric values if range of values present
-                    # is less than or equal number of histogram bins
-                    if len(_range) <= bins:
+                    # is less than or equal to the number of histogram bins and
+                    # the series contains only integer values
+                    if len(_range) <= bins and (series % 1 == 0).all():
                         values["top_values"] = _get_numeric_value_counts_in_range(
                             series, _range
                         )
@@ -578,7 +576,11 @@ def _get_numeric_value_counts_in_range(series, _range):
     """
     frequencies = series.value_counts(dropna=True)
     value_counts = [
-        {"value": i, "count": frequencies[i] if i in frequencies else 0} for i in _range
+        {
+            "value": i if is_integer_dtype(series) else float(i),
+            "count": frequencies[i] if i in frequencies else 0,
+        }
+        for i in _range
     ]
     return sorted(value_counts, key=lambda i: (-i["count"], i["value"]))
 
diff --git a/woodwork/tests/accessor/test_statistics.py b/woodwork/tests/accessor/test_statistics.py
@@ -763,6 +763,22 @@ def test_describe_with_include(sample_df):
     multi_params_df["full_name"].equals(sample_df.ww.describe()["full_name"])
 
 
+def test_describe_numeric_all_nans():
+    df = pd.DataFrame({"nulls": [np.nan] * 5})
+    logical_types = ["double", "integer_nullable"]
+
+    for logical_type in logical_types:
+        df.ww.init(logical_types={"nulls": logical_type})
+        stats = df.ww.describe_dict(extra_stats=True)
+        assert pd.isnull(stats["nulls"]["max"])
+        assert pd.isnull(stats["nulls"]["min"])
+        assert pd.isnull(stats["nulls"]["mean"])
+        assert pd.isnull(stats["nulls"]["std"])
+        assert stats["nulls"]["nan_count"] == 5
+        assert stats["nulls"]["histogram"] == []
+        assert stats["nulls"]["top_values"] == []
+
+
 def test_pandas_nullable_integer_quantile_fix():
     """Should fail when https://github.com/pandas-dev/pandas/issues/42626 gets fixed"""
     if pd.__version__ not in ["1.3.0", "1.3.1"]:  # pragma: no cover
@@ -833,6 +849,12 @@ def test_describe_dict_extra_stats(describe_df):
     describe_df["nullable_integer_col"] = describe_df["numeric_col"]
     describe_df["integer_col"] = describe_df["numeric_col"].fillna(0)
     describe_df["small_range_col"] = describe_df["numeric_col"].fillna(0) // 10
+    describe_df["small_range_col_ints_as_double"] = (
+        describe_df["numeric_col"].fillna(0) // 10.0
+    )
+    describe_df["small_range_col_double_not_valid"] = (
+        describe_df["numeric_col"].fillna(0) / 10
+    )
 
     ltypes = {
         "category_col": "Categorical",
@@ -841,6 +863,8 @@ def test_describe_dict_extra_stats(describe_df):
         "nullable_integer_col": "IntegerNullable",
         "integer_col": "Integer",
         "small_range_col": "Integer",
+        "small_range_col_ints_as_double": "Double",
+        "small_range_col_double_not_valid": "Double",
     }
     describe_df.ww.init(index="index_col", logical_types=ltypes)
     desc_dict = describe_df.ww.describe_dict(extra_stats=True)
@@ -861,10 +885,12 @@ def test_describe_dict_extra_stats(describe_df):
         "nullable_integer_col",
         "integer_col",
         "small_range_col",
+        "small_range_col_ints_as_double",
+        "small_range_col_double_not_valid",
     ]:
         assert isinstance(desc_dict[col]["histogram"], list)
         assert desc_dict[col].get("recent_values") is None
-        if col == "small_range_col":
+        if col in {"small_range_col", "small_range_col_ints_as_double"}:
             # If values are in a narrow range, top values should be present
             assert isinstance(desc_dict[col]["top_values"], list)
         else: