Skip to content

Commit 7c6f6b0

Browse files
authored
Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (#1202)
* Add code structure and first test * implement freq inference without columns param handling * test columns param and error states * Add docstring and use temporal instead of datetime * add new method to init error test * Add release note * add to api ref * Explain infer_freq limitations and add example to test fixture * change note wording * Acknowledge dask and koalas limitations
1 parent 86c1047 commit 7c6f6b0

File tree

7 files changed

+167
-2
lines changed

7 files changed

+167
-2
lines changed

docs/source/api_reference.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ WoodworkTableAccessor
1616
WoodworkTableAccessor.drop
1717
WoodworkTableAccessor.iloc
1818
WoodworkTableAccessor.index
19+
WoodworkTableAccessor.infer_temporal_frequencies
1920
WoodworkTableAccessor.init
2021
WoodworkTableAccessor.init_with_full_schema
2122
WoodworkTableAccessor.init_with_partial_schema

docs/source/release_notes.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@ Release Notes
55
Future Release
66
==============
77
* Enhancements
8+
* Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (:pr:`1202`)
89
* Fixes
910
* Changes
1011
* Return histogram bins as a list of floats instead of a ``pandas.Interval`` object (:pr:`1207`)
1112
* Documentation Changes
1213
* Testing Changes
1314

1415
Thanks to the following people for contributing to this release:
15-
:user:`thehomebrewnerd`
16+
:user:`tamargrey`, :user:`thehomebrewnerd`
1617

1718
Breaking Changes
1819
++++++++++++++++

woodwork/statistics_utils.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@
55
from sklearn.metrics.cluster import normalized_mutual_info_score
66

77
from woodwork.accessor_utils import _is_dask_dataframe, _is_koalas_dataframe
8-
from woodwork.logical_types import Datetime, Double, Integer, IntegerNullable, LatLong
8+
from woodwork.logical_types import (
9+
Datetime,
10+
Double,
11+
Integer,
12+
IntegerNullable,
13+
LatLong,
14+
Timedelta,
15+
)
916
from woodwork.utils import _update_progress, get_valid_mi_types, import_or_none
1017

1118
dd = import_or_none("dask.dataframe")
@@ -640,3 +647,39 @@ def _get_histogram_values(series, bins=10):
640647
)
641648

642649
return results
650+
651+
652+
def _infer_temporal_frequencies(dataframe, temporal_columns=None):
    """Infers the observation frequency (daily, biweekly, yearly, etc) of each temporal column
    in the DataFrame. Temporal columns are ones with the logical type Datetime or Timedelta.

    Args:
        dataframe (DataFrame): The DataFrame for which frequencies should be inferred.
        temporal_columns (list[str], optional): Columns for which frequencies should be inferred. Must be columns
            that are present in the DataFrame and are temporal in nature. Defaults to None. If not
            specified, all temporal columns will have their frequencies inferred.

    Returns:
        (dict): A dictionary where each key is a temporal column from the DataFrame, and the
            value is its observation frequency represented as a pandas offset alias string (D, M, Y, etc.)
            or None if no uniform frequency was present in the data.

    Raises:
        ValueError: If a requested column is not present in the DataFrame.
        TypeError: If a requested column is not of a temporal (Datetime/Timedelta) logical type.
    """
    logical_types = dataframe.ww.logical_types

    if temporal_columns is None:
        # Default to every column whose Woodwork logical type is temporal.
        temporal_columns = [
            col
            for col, ltype in logical_types.items()
            if isinstance(ltype, (Datetime, Timedelta))
        ]
    else:
        # Validate the user-supplied columns before inferring anything so a
        # single bad column fails fast rather than after partial work.
        for col in temporal_columns:
            if col not in dataframe:
                raise ValueError(f"Column {col} not found in dataframe.")
            ltype = logical_types[col]
            if not isinstance(ltype, (Datetime, Timedelta)):
                raise TypeError(
                    f"Cannot determine frequency for column {col} with logical type {ltype}"
                )

    # pd.infer_freq returns None when no single uniform frequency fits the
    # data (e.g. NaNs present or one skipped observation); that None is
    # surfaced to the caller as-is.
    return {col: pd.infer_freq(dataframe[col]) for col in temporal_columns}

woodwork/table_accessor.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_get_mutual_information_dict,
2828
_get_valid_mi_columns,
2929
_get_value_counts,
30+
_infer_temporal_frequencies,
3031
)
3132
from woodwork.table_schema import TableSchema
3233
from woodwork.type_sys.utils import _is_numeric_series, col_is_datetime
@@ -1081,6 +1082,48 @@ def value_counts(self, ascending=False, top_n=10, dropna=False):
10811082
"""
10821083
return _get_value_counts(self._dataframe, ascending, top_n, dropna)
10831084

1085+
@_check_table_schema
def infer_temporal_frequencies(self, temporal_columns=None):
    """Infer the observation frequency (daily, biweekly, yearly, etc.) of each temporal
    column in the DataFrame. A temporal column is one whose logical type is Datetime or
    Timedelta. Not supported for Dask and Koalas DataFrames.

    Args:
        temporal_columns (list[str], optional): Columns for which frequencies should be
            inferred. Must be present in the DataFrame and temporal in nature. Defaults
            to None, in which case every temporal column is inferred.

    Returns:
        (dict): Maps each temporal column name to its observation frequency as a pandas
            offset alias string (D, M, Y, etc.), or None when the data has no uniform
            frequency.

    Note:
        The pandas util ``pd.infer_freq``, which is used in this method, has the following behaviors:
        - If even one row in a column does not follow the frequency seen in the remaining rows,
        no frequency will be inferred. Example of otherwise daily data that skips one day:
        ``['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-07']``.
        - If any NaNs are present in the data, no frequency will be inferred.
        - Pandas will use the largest offset alias available to it, so ``W`` will be inferred for weekly data instead of ``7D``.
        The list of available offset aliases, which include aliases such as ``B`` for business day or ``N`` for nanosecond,
        can be found at https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        - Offset aliases can be combined to create something like ``2d1H``, which could also be expressed as ``49H``.
        Pandas' frequency inference will return the lower common alias, ``49H``, in situations when it'd otherwise
        need to combine aliases.
        - Offset strings can contain more information than just the offset alias. For example, a date range
        ``pd.date_range(start="2020-01-01", freq="w", periods=10)`` will be inferred to have frequency ``W-SUN``.
        That string is an offset alias with an anchoring suffix that indicates that the data is not only
        observed at a weekly frequency, but that all the dates are on Sundays. More anchored offsets
        can be seen here: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#anchored-offsets
        - Some frequencies that can be defined for a ``pd.date_range`` cannot then be re-inferred by pandas' ``pd.infer_freq``.
        One example of this can be seen when using the business day offset alias ``B``
        ``pd.date_range(start="2020-01-01", freq="4b", periods=10)``, which is a valid ``freq``
        parameter when building the date range, but is not then inferrable.
    """
    # Delegate to the statistics util, which performs column validation
    # and the actual pd.infer_freq calls.
    inferred = _infer_temporal_frequencies(
        self._dataframe, temporal_columns=temporal_columns
    )
    return inferred
1126+
10841127

10851128
def _validate_accessor_params(
10861129
dataframe, index, time_index, logical_types, schema, use_standard_tags

woodwork/tests/accessor/test_statistics.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,3 +1318,53 @@ def test_box_plot_optional_return_values(outliers_df):
13181318
"low_indices",
13191319
"high_indices",
13201320
} == set(no_outliers_box_plot_info_with_optional.keys())
1321+
1322+
1323+
def test_infer_temporal_frequencies(datetime_freqs_df_pandas):
    # TODO: Add support for Dask and Koalas DataFrames
    df = datetime_freqs_df_pandas
    df.ww.init()

    inferred = df.ww.infer_temporal_frequencies()

    # Every column except the non-temporal "ints" column gets an entry.
    assert len(inferred) == len(df.columns) - 1
    assert "ints" not in inferred

    # Columns without a single uniform frequency map to None.
    no_freq_columns = {col for col, freq in inferred.items() if freq is None}
    assert no_freq_columns == {
        "same_date",
        "1d_skipped_one_freq",
        "3M_one_nan",
        "3B_no_freq",
    }
1340+
1341+
1342+
def test_infer_temporal_frequencies_with_columns(datetime_freqs_df_pandas):
    df = datetime_freqs_df_pandas
    df.ww.init(time_index="2D_freq")

    # Restricting inference to the time index yields exactly one entry.
    inferred = df.ww.infer_temporal_frequencies(temporal_columns=[df.ww.time_index])
    assert len(inferred) == 1
    assert inferred["2D_freq"] == "2D"

    # An empty column list produces an empty result.
    assert len(df.ww.infer_temporal_frequencies(temporal_columns=[])) == 0
1355+
1356+
1357+
def test_infer_temporal_frequencies_errors(datetime_freqs_df_pandas):
    datetime_freqs_df_pandas.ww.init()

    # A column missing from the DataFrame raises a ValueError.
    with pytest.raises(ValueError, match="Column not_present not found in dataframe."):
        datetime_freqs_df_pandas.ww.infer_temporal_frequencies(
            temporal_columns=["2D_freq", "not_present"]
        )

    # A non-temporal column raises a TypeError.
    with pytest.raises(
        TypeError,
        match="Cannot determine frequency for column ints with logical type Integer",
    ):
        datetime_freqs_df_pandas.ww.infer_temporal_frequencies(
            temporal_columns=["1d_skipped_one_freq", "ints"]
        )

woodwork/tests/accessor/test_table_accessor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ def test_accessor_init_errors_methods(sample_df):
328328
"to_disk": ["dir"],
329329
"to_dictionary": None,
330330
"value_counts": None,
331+
"infer_temporal_frequencies": None,
331332
}
332333
error = re.escape(
333334
"Woodwork not initialized for this DataFrame. Initialize by calling DataFrame.ww.init"

woodwork/tests/conftest.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,32 @@ def sample_df_koalas(sample_df_pandas):
119119
return ks.from_pandas(sample_df_pandas)
120120

121121

122+
@pytest.fixture()
def datetime_freqs_df_pandas():
    """Build a 10-row DataFrame of temporal columns with varying (and missing) frequencies."""
    two_day_range = pd.date_range(start="2020-01-01", end="2020-01-20", freq="2D")
    three_month_range = pd.date_range(start="2015-01-01", freq="3M", periods=10)

    return pd.DataFrame(
        {
            "2D_freq": two_day_range,
            "3M_freq": three_month_range,
            # Business-day frequencies are valid for date_range but cannot be
            # re-inferred by pd.infer_freq.
            "3B_no_freq": pd.date_range(start="2015-01-01", freq="3B", periods=10),
            # One dropped day breaks an otherwise daily frequency.
            "1d_skipped_one_freq": pd.date_range(
                start="2020-01-01", end="2020-01-11", freq="1D"
            ).drop("2020-01-04"),
            # A trailing None prevents any frequency from being inferred.
            "3M_one_nan": list(three_month_range.drop("2015-07-31")) + [None],
            # Subtracting a timestamp turns the datetimes into timedeltas.
            "2d_timedelta": two_day_range - pd.Timestamp("2020-01-01"),
            "ints": range(10),
            "same_date": ["2015-01-01"] * 10,
        }
    )
146+
147+
122148
@pytest.fixture(
123149
params=[
124150
"sample_unsorted_df_pandas",

0 commit comments

Comments
 (0)