
Commit fd7a5d4

jeff-hernandez and Nate Parsons authored
Replace empty string with NaN values (#1161)
* add read file parameter
* update release notes
* update test
* add None to test
* add test with read_file
* update function logic
* update docstrings
* use more types in test
* lint fix
* remove print statement
* rename function and update test
* lint fix
* bump min pandas requirement
* bump min pandas requirement
* update tests and refactor function

Co-authored-by: Nate Parsons <[email protected]>
1 parent a6a623a commit fd7a5d4

File tree

7 files changed: +104, -5 lines
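As a quick illustration of what the commit enables (a minimal sketch, not code from this commit; the file and column contents are made up, and the content type mirrors the one used in the new tests):

import pandas as pd
import woodwork as ww

# Hypothetical parquet file whose missing entries were written out as strings.
pd.DataFrame({"age": ["<NA>", "27", "", "nan"]}).to_parquet("data.parquet")

df = ww.read_file(
    filepath="data.parquet",
    content_type="application/parquet",
    replace_nan=True,  # new parameter added in this commit
)

# The string placeholders come back as real missing values rather than text.
print(df["age"].isnull().sum())  # 3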

docs/source/release_notes.rst

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ Release Notes
 Future Release
 ==============
     * Enhancements
+        * Added ``read_file`` parameter for replacing empty string values with NaN values (:pr:`1161`)
     * Fixes
         * Set a maximum version for pyspark until we understand why :pr:`1169` failed (:pr:`1179`)
         * Require newer dask version (:pr:`1180`)
@@ -17,7 +18,7 @@ Future Release
         * Updated notebook standardizer to standardize python versions (:pr:`1166`)

     Thanks to the following people for contributing to this release:
-    :user:`davesque`, :user:`gsheni`, :user:`bchen1116`, :user:`rwedge`, :user:`tamargrey`
+    :user:`davesque`, :user:`gsheni`, :user:`bchen1116`, :user:`rwedge`, :user:`tamargrey`, :user:`thehomebrewnerd`

v0.8.2 Oct 12, 2021
===================

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-pandas>=1.2.5
+pandas>=1.3.0
 scikit-learn>=0.22
Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 dask[dataframe]==2021.10.0
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 pyspark==3.0.0
 koalas==1.8.0
-pandas==1.2.5
+pandas==1.3.0
 scikit-learn==0.22

woodwork/tests/utils/test_read_file.py

Lines changed: 66 additions & 0 deletions
@@ -6,6 +6,7 @@

 import woodwork as ww
 from woodwork.serialize import save_orc_file
+from woodwork.utils import _replace_nan_strings


 def test_read_file_errors_no_content_type(sample_df_pandas, tmpdir):
@@ -195,3 +196,68 @@ def test_read_file(
         schema_df = schema_df.head(kwargs["nrows"])

     pd.testing.assert_frame_equal(df, schema_df)
+
+
+def test_replace_nan_strings():
+    data = {
+        "double": ["<NA>", "6.2", "4.2", "3.11"],
+        "integer": ["<NA>", "6", "4", "3"],
+        "null": ["<NA>", "", "nan", None],
+        "null_string": pd.Series(["<NA>", "", "nan", ""], dtype="string"),
+        "Int64": pd.Series([1, 2, 3, 4], dtype="Int64"),
+        "Float64": pd.Series([1.1, 2.2, 3.3, 4.4], dtype="Float64"),
+        "boolean": pd.Series([True, True, False, False], dtype="boolean"),
+        "int64": pd.Series([1, 2, 3, 4], dtype="int64"),
+        "double2": pd.Series([1, 2, 3, 4.5], dtype="float64"),
+        "bool": pd.Series([True, True, False, False], dtype="bool"),
+        "category": pd.Series([1, 2, 2, 1], dtype="category"),
+    }
+
+    expected_null_count = {
+        "double": 1,
+        "integer": 1,
+        "null": 4,
+        "null_string": 4,
+        "Int64": 0,
+        "Float64": 0,
+        "boolean": 0,
+        "int64": 0,
+        "double2": 0,
+        "bool": 0,
+        "category": 0,
+    }
+
+    df = pd.DataFrame(data=data)
+    replaced_df = _replace_nan_strings(df)
+    for col in replaced_df:
+        assert replaced_df[col].isnull().sum() == expected_null_count[col]
+
+
+def test_replace_nan_strings_with_read_file(tmpdir):
+    filepath = os.path.join(tmpdir, "data.parquet")
+    content_type = "application/parquet"
+
+    data = {
+        "double": ["<NA>", "6.2", "4.2", "3.11"],
+        "integer": ["<NA>", "6", "4", "3"],
+        "null": ["<NA>", "", None, "nan"],
+    }
+
+    df = pd.DataFrame(data=data)
+    df.to_parquet(filepath)
+
+    # Without replacement
+    actual = ww.read_file(
+        content_type=content_type,
+        filepath=filepath,
+        replace_nan=False,
+    )
+    assert actual.isnull().sum().sum() == 1
+
+    # With replacement
+    actual = ww.read_file(
+        content_type=content_type,
+        filepath=filepath,
+        replace_nan=True,
+    )
+    assert actual.isnull().sum().sum() == 6
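For reference, the totals asserted above break down like this: only the literal None survives the parquet round trip as a real null (hence 1 without replacement), and enabling replacement also converts the three "<NA>" values, the empty string, and the "nan" string (hence 6). A quick sketch that applies the helper directly, skipping the parquet round trip:

import pandas as pd

from woodwork.utils import _replace_nan_strings

df = pd.DataFrame({
    "double": ["<NA>", "6.2", "4.2", "3.11"],
    "integer": ["<NA>", "6", "4", "3"],
    "null": ["<NA>", "", None, "nan"],
})

print(df.isnull().sum().sum())                        # 1: only the literal None
print(_replace_nan_strings(df).isnull().sum().sum())  # 6: plus "<NA>" x3, "" and "nan"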

woodwork/utils.py

Lines changed: 32 additions & 0 deletions
@@ -99,6 +99,7 @@ def read_file(
     logical_types=None,
     use_standard_tags=True,
     column_origins=None,
+    replace_nan=False,
     validate=True,
     **kwargs,
 ):
@@ -130,6 +131,8 @@ def read_file(
             on the inferred or specified logical type for the column. Defaults to True.
         column_origins (str or dict[str -> str], optional): Origin of each column. If a string is supplied, it is
             used as the origin for all columns. A dictionary can be used to set origins for individual columns.
+        replace_nan (bool, optional): Whether to replace empty string values and string representations of
+            NaN values ("nan", "<NA>") with np.nan or pd.NA values based on column dtype. Defaults to False.
         validate (bool, optional): Whether parameter and data validation should occur. Defaults to True. Warning:
             Should be set to False only when parameters and data are known to be valid.
             Any errors resulting from skipping validation with invalid inputs may not be easily understood.
@@ -170,6 +173,10 @@ def read_file(
         kwargs["engine"] = "pyarrow"

     dataframe = type_to_read_func_map[content_type](filepath, **kwargs)
+
+    if replace_nan:
+        dataframe = _replace_nan_strings(dataframe)
+
     dataframe.ww.init(
         name=name,
         index=index,
@@ -511,3 +518,28 @@ def _infer_datetime_format(dates, n=100):
     except (TypeError, ValueError, IndexError, KeyError, NotImplementedError):
         mode_fmt = None
     return mode_fmt
+
+
+def _replace_nan_strings(df):
+    """Replaces empty string values and string representations of
+    NaN values ("nan", "<NA>") with np.nan or pd.NA depending on
+    column dtype."""
+    df = df.fillna(value=np.nan)
+
+    for col, dtype in df.dtypes.items():
+        replace_val = np.nan
+        if str(dtype) == "boolean":
+            # All replace calls below fail with boolean dtype
+            # but boolean cols cannot contain strings to begin with.
+            continue
+        elif str(dtype) == "string":
+            # Must use pd.NA as replacement value for string dtype
+            replace_val = pd.NA
+
+        replaced_series = df[col].replace(r"^\s*$", replace_val, regex=True)
+        replaced_series = replaced_series.replace(
+            {"nan": replace_val, "<NA>": replace_val}
+        )
+        df[col] = replaced_series
+
+    return df
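A small standalone sketch of the dtype handling in _replace_nan_strings (column names here are made up; the expected counts follow from the implementation and tests above):

import pandas as pd

from woodwork.utils import _replace_nan_strings

df = pd.DataFrame({
    "objects": ["<NA>", "nan", " ", "ok"],  # object dtype, replaced with np.nan
    "strings": pd.Series(["<NA>", "", "nan", "ok"], dtype="string"),  # replaced with pd.NA
})

replaced = _replace_nan_strings(df)

# The whitespace-only value matches the ^\s*$ pattern and is also treated as empty.
print(replaced["objects"].isnull().sum())  # 3
print(replaced["strings"].isnull().sum())  # 3
print(replaced["strings"].dtype)           # string (missing values are pd.NA)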
