
Commit fd7a5d4

jeff-hernandez and Nate Parsons authored
Replace empty string with NaN values (#1161)
* add read file parameter
* update release notes
* update test
* add None to test
* add test with read_file
* update function logic
* update docstrings
* use more types in test
* lint fix
* remove print statement
* rename function and update test
* lint fix
* bump min pandas requirement
* bump min pandas requirement
* update tests and refactor function

Co-authored-by: Nate Parsons <[email protected]>
1 parent a6a623a commit fd7a5d4

File tree

7 files changed: +104, -5 lines
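As a quick illustration of what the commit enables (a minimal sketch, not code from this commit; the file and column contents are made up, and the content type mirrors the one used in the new tests):

import pandas as pd
import woodwork as ww

# Hypothetical parquet file whose missing entries were written out as strings.
pd.DataFrame({"age": ["<NA>", "27", "", "nan"]}).to_parquet("data.parquet")

df = ww.read_file(
    filepath="data.parquet",
    content_type="application/parquet",
    replace_nan=True,  # new parameter added in this commit
)

# The string placeholders come back as real missing values rather than text.
print(df["age"].isnull().sum())  # 3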

docs/source/release_notes.rst

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ Release Notes
 Future Release
 ==============
     * Enhancements
+        * Added ``read_file`` parameter for replacing empty string values with NaN values (:pr:`1161`)
     * Fixes
         * Set a maximum version for pyspark until we understand why :pr:`1169` failed (:pr:`1179`)
         * Require newer dask version (:pr:`1180`)
@@ -17,7 +18,7 @@ Future Release
         * Updated notebook standardizer to standardize python versions (:pr:`1166`)

     Thanks to the following people for contributing to this release:
-    :user:`davesque`, :user:`gsheni`, :user:`bchen1116`, :user:`rwedge`, :user:`tamargrey`
+    :user:`davesque`, :user:`gsheni`, :user:`bchen1116`, :user:`rwedge`, :user:`tamargrey`, :user:`thehomebrewnerd`

v0.8.2 Oct 12, 2021
===================

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-pandas>=1.2.5
+pandas>=1.3.0
 scikit-learn>=0.22
Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 dask[dataframe]==2021.10.0
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 pyspark==3.0.0
 koalas==1.8.0
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22

woodwork/tests/utils/test_read_file.py

Lines changed: 66 additions & 0 deletions
@@ -6,6 +6,7 @@

 import woodwork as ww
 from woodwork.serialize import save_orc_file
+from woodwork.utils import _replace_nan_strings


 def test_read_file_errors_no_content_type(sample_df_pandas, tmpdir):
@@ -195,3 +196,68 @@ def test_read_file(
         schema_df = schema_df.head(kwargs["nrows"])

     pd.testing.assert_frame_equal(df, schema_df)
+
+
+def test_replace_nan_strings():
+    data = {
+        "double": ["<NA>", "6.2", "4.2", "3.11"],
+        "integer": ["<NA>", "6", "4", "3"],
+        "null": ["<NA>", "", "nan", None],
+        "null_string": pd.Series(["<NA>", "", "nan", ""], dtype="string"),
+        "Int64": pd.Series([1, 2, 3, 4], dtype="Int64"),
+        "Float64": pd.Series([1.1, 2.2, 3.3, 4.4], dtype="Float64"),
+        "boolean": pd.Series([True, True, False, False], dtype="boolean"),
+        "int64": pd.Series([1, 2, 3, 4], dtype="int64"),
+        "double2": pd.Series([1, 2, 3, 4.5], dtype="float64"),
+        "bool": pd.Series([True, True, False, False], dtype="bool"),
+        "category": pd.Series([1, 2, 2, 1], dtype="category"),
+    }
+
+    expected_null_count = {
+        "double": 1,
+        "integer": 1,
+        "null": 4,
+        "null_string": 4,
+        "Int64": 0,
+        "Float64": 0,
+        "boolean": 0,
+        "int64": 0,
+        "double2": 0,
+        "bool": 0,
+        "category": 0,
+    }
+
+    df = pd.DataFrame(data=data)
+    replaced_df = _replace_nan_strings(df)
+    for col in replaced_df:
+        assert replaced_df[col].isnull().sum() == expected_null_count[col]
+
+
+def test_replace_nan_strings_with_read_file(tmpdir):
+    filepath = os.path.join(tmpdir, "data.parquet")
+    content_type = "application/parquet"
+
+    data = {
+        "double": ["<NA>", "6.2", "4.2", "3.11"],
+        "integer": ["<NA>", "6", "4", "3"],
+        "null": ["<NA>", "", None, "nan"],
+    }
+
+    df = pd.DataFrame(data=data)
+    df.to_parquet(filepath)
+
+    # Without replacement
+    actual = ww.read_file(
+        content_type=content_type,
+        filepath=filepath,
+        replace_nan=False,
+    )
+    assert actual.isnull().sum().sum() == 1
+
+    # With replacement
+    actual = ww.read_file(
+        content_type=content_type,
+        filepath=filepath,
+        replace_nan=True,
+    )
+    assert actual.isnull().sum().sum() == 6
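For reference, the totals asserted above break down like this: only the literal None survives the parquet round trip as a real null (hence 1 without replacement), and enabling replacement also converts the three "<NA>" values, the empty string, and the "nan" string (hence 6). A quick sketch that applies the helper directly, skipping the parquet round trip:

import pandas as pd

from woodwork.utils import _replace_nan_strings

df = pd.DataFrame({
    "double": ["<NA>", "6.2", "4.2", "3.11"],
    "integer": ["<NA>", "6", "4", "3"],
    "null": ["<NA>", "", None, "nan"],
})

print(df.isnull().sum().sum())                        # 1: only the literal None
print(_replace_nan_strings(df).isnull().sum().sum())  # 6: plus "<NA>" x3, "" and "nan"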

woodwork/utils.py

Lines changed: 32 additions & 0 deletions
@@ -99,6 +99,7 @@ def read_file(
     logical_types=None,
     use_standard_tags=True,
     column_origins=None,
+    replace_nan=False,
     validate=True,
     **kwargs,
 ):
@@ -130,6 +131,8 @@ def read_file(
             on the inferred or specified logical type for the column. Defaults to True.
         column_origins (str or dict[str -> str], optional): Origin of each column. If a string is supplied, it is
             used as the origin for all columns. A dictionary can be used to set origins for individual columns.
+        replace_nan (bool, optional): Whether to replace empty string values and string representations of
+            NaN values ("nan", "<NA>") with np.nan or pd.NA values based on column dtype. Defaults to False.
         validate (bool, optional): Whether parameter and data validation should occur. Defaults to True. Warning:
             Should be set to False only when parameters and data are known to be valid.
             Any errors resulting from skipping validation with invalid inputs may not be easily understood.
@@ -170,6 +173,10 @@ def read_file(
         kwargs["engine"] = "pyarrow"

     dataframe = type_to_read_func_map[content_type](filepath, **kwargs)
+
+    if replace_nan:
+        dataframe = _replace_nan_strings(dataframe)
+
     dataframe.ww.init(
         name=name,
         index=index,
@@ -511,3 +518,28 @@ def _infer_datetime_format(dates, n=100):
     except (TypeError, ValueError, IndexError, KeyError, NotImplementedError):
         mode_fmt = None
     return mode_fmt
+
+
+def _replace_nan_strings(df):
+    """Replaces empty string values and string representations of
+    NaN values ("nan", "<NA>") with np.nan or pd.NA depending on
+    column dtype."""
+    df = df.fillna(value=np.nan)
+
+    for col, dtype in df.dtypes.items():
+        replace_val = np.nan
+        if str(dtype) == "boolean":
+            # All replace calls below fail with boolean dtype
+            # but boolean cols cannot contain strings to begin with.
+            continue
+        elif str(dtype) == "string":
+            # Must use pd.NA as replacement value for string dtype
+            replace_val = pd.NA
+
+        replaced_series = df[col].replace(r"^\s*$", replace_val, regex=True)
+        replaced_series = replaced_series.replace(
+            {"nan": replace_val, "<NA>": replace_val}
+        )
+        df[col] = replaced_series
+
+    return df
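A small standalone sketch of the dtype handling in _replace_nan_strings (column names here are made up; the expected counts follow from the implementation and tests above):

import pandas as pd

from woodwork.utils import _replace_nan_strings

df = pd.DataFrame({
    "objects": ["<NA>", "nan", " ", "ok"],  # object dtype, replaced with np.nan
    "strings": pd.Series(["<NA>", "", "nan", "ok"], dtype="string"),  # replaced with pd.NA
})

replaced = _replace_nan_strings(df)

# The whitespace-only value matches the ^\s*$ pattern and is also treated as empty.
print(replaced["objects"].isnull().sum())  # 3
print(replaced["strings"].isnull().sum())  # 3
print(replaced["strings"].dtype)           # string (missing values are pd.NA)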
