Skip to content

Commit 7c6f6b0

Browse files
authored
Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (#1202)
* Add code structure and first test * implement freq inference without columns param handling * test columns param and error states * Add docstring and use temporal instead of datetime * add new method to init error test * Add release note * add to api ref * Explain infer_freq limitations and add example to test fixture * change note wording * Acknowledge dask and koalas limitations
1 parent 86c1047 commit 7c6f6b0

File tree

7 files changed

+167
-2
lines changed

7 files changed

+167
-2
lines changed

docs/source/api_reference.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ WoodworkTableAccessor
1616
WoodworkTableAccessor.drop
1717
WoodworkTableAccessor.iloc
1818
WoodworkTableAccessor.index
19+
WoodworkTableAccessor.infer_temporal_frequencies
1920
WoodworkTableAccessor.init
2021
WoodworkTableAccessor.init_with_full_schema
2122
WoodworkTableAccessor.init_with_partial_schema

docs/source/release_notes.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@ Release Notes
55
Future Release
66
==============
77
* Enhancements
8+
* Allow frequency inference on temporal (Datetime, Timedelta) columns of Woodwork DataFrame (:pr:`1202`)
89
* Fixes
910
* Changes
1011
* Return histogram bins as a list of floats instead of a ``pandas.Interval`` object (:pr:`1207`)
1112
* Documentation Changes
1213
* Testing Changes
1314

1415
Thanks to the following people for contributing to this release:
15-
:user:`thehomebrewnerd`
16+
:user:`tamargrey`, :user:`thehomebrewnerd`
1617

1718
Breaking Changes
1819
++++++++++++++++

woodwork/statistics_utils.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@
55
from sklearn.metrics.cluster import normalized_mutual_info_score
66

77
from woodwork.accessor_utils import _is_dask_dataframe, _is_koalas_dataframe
8-
from woodwork.logical_types import Datetime, Double, Integer, IntegerNullable, LatLong
8+
from woodwork.logical_types import (
9+
Datetime,
10+
Double,
11+
Integer,
12+
IntegerNullable,
13+
LatLong,
14+
Timedelta,
15+
)
916
from woodwork.utils import _update_progress, get_valid_mi_types, import_or_none
1017

1118
dd = import_or_none("dask.dataframe")
@@ -640,3 +647,39 @@ def _get_histogram_values(series, bins=10):
640647
)
641648

642649
return results
650+
651+
652+
def _infer_temporal_frequencies(dataframe, temporal_columns=None):
    """Infers the observation frequency (daily, biweekly, yearly, etc) of each temporal column
    in the DataFrame. Temporal columns are ones with the logical type Datetime or Timedelta.

    Args:
        dataframe (DataFrame): The DataFrame for which frequencies should be inferred.
        temporal_columns (list[str], optional): Columns for which frequencies should be inferred. Must be columns
            that are present in the DataFrame and are temporal in nature. Defaults to None. If not
            specified, all temporal columns will have their frequencies inferred.

    Returns:
        (dict): A dictionary where each key is a temporal column from the DataFrame, and the
            value is its observation frequency represented as a pandas offset alias string (D, M, Y, etc.)
            or None if no uniform frequency was present in the data.

    Raises:
        ValueError: If a requested column is not present in the DataFrame.
        TypeError: If a requested column is not of a temporal (Datetime/Timedelta) logical type.
    """
    logical_types = dataframe.ww.logical_types

    if temporal_columns is None:
        # Default to every column whose Woodwork logical type is temporal.
        temporal_columns = [
            col
            for col, ltype in logical_types.items()
            if isinstance(ltype, (Datetime, Timedelta))
        ]
    else:
        # Validate the user-supplied columns before inferring anything so a
        # single bad column fails fast rather than after partial work.
        for col in temporal_columns:
            if col not in dataframe:
                raise ValueError(f"Column {col} not found in dataframe.")
            ltype = logical_types[col]
            if not isinstance(ltype, (Datetime, Timedelta)):
                raise TypeError(
                    f"Cannot determine frequency for column {col} with logical type {ltype}"
                )

    # pd.infer_freq returns None when no single uniform frequency fits the
    # data (e.g. NaNs present or one skipped observation); that None is
    # surfaced to the caller as-is.
    return {col: pd.infer_freq(dataframe[col]) for col in temporal_columns}

woodwork/table_accessor.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_get_mutual_information_dict,
2828
_get_valid_mi_columns,
2929
_get_value_counts,
30+
_infer_temporal_frequencies,
3031
)
3132
from woodwork.table_schema import TableSchema
3233
from woodwork.type_sys.utils import _is_numeric_series, col_is_datetime
@@ -1081,6 +1082,48 @@ def value_counts(self, ascending=False, top_n=10, dropna=False):
10811082
"""
10821083
return _get_value_counts(self._dataframe, ascending, top_n, dropna)
10831084

1085+
@_check_table_schema
def infer_temporal_frequencies(self, temporal_columns=None):
    """Infer the observation frequency (daily, biweekly, yearly, etc.) of each temporal
    column in the DataFrame. A temporal column is one whose logical type is Datetime or
    Timedelta. Not supported for Dask and Koalas DataFrames.

    Args:
        temporal_columns (list[str], optional): Columns for which frequencies should be
            inferred. Must be present in the DataFrame and temporal in nature. Defaults
            to None, in which case every temporal column is inferred.

    Returns:
        (dict): Maps each temporal column name to its observation frequency as a pandas
            offset alias string (D, M, Y, etc.), or None when the data has no uniform
            frequency.

    Note:
        The pandas util ``pd.infer_freq``, which is used in this method, has the following behaviors:
        - If even one row in a column does not follow the frequency seen in the remaining rows,
        no frequency will be inferred. Example of otherwise daily data that skips one day:
        ``['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-07']``.
        - If any NaNs are present in the data, no frequency will be inferred.
        - Pandas will use the largest offset alias available to it, so ``W`` will be inferred for weekly data instead of ``7D``.
        The list of available offset aliases, which include aliases such as ``B`` for business day or ``N`` for nanosecond,
        can be found at https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        - Offset aliases can be combined to create something like ``2d1H``, which could also be expressed as ``49H``.
        Pandas' frequency inference will return the lower common alias, ``49H``, in situations when it'd otherwise
        need to combine aliases.
        - Offset strings can contain more information than just the offset alias. For example, a date range
        ``pd.date_range(start="2020-01-01", freq="w", periods=10)`` will be inferred to have frequency ``W-SUN``.
        That string is an offset alias with an anchoring suffix that indicates that the data is not only
        observed at a weekly frequency, but that all the dates are on Sundays. More anchored offsets
        can be seen here: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#anchored-offsets
        - Some frequencies that can be defined for a ``pd.date_range`` cannot then be re-inferred by pandas' ``pd.infer_freq``.
        One example of this can be seen when using the business day offset alias ``B``
        ``pd.date_range(start="2020-01-01", freq="4b", periods=10)``, which is a valid ``freq``
        parameter when building the date range, but is not then inferrable.
    """
    # Delegate to the statistics util, which performs column validation
    # and the actual pd.infer_freq calls.
    inferred = _infer_temporal_frequencies(
        self._dataframe, temporal_columns=temporal_columns
    )
    return inferred
1126+
10841127

10851128
def _validate_accessor_params(
10861129
dataframe, index, time_index, logical_types, schema, use_standard_tags

woodwork/tests/accessor/test_statistics.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,3 +1318,53 @@ def test_box_plot_optional_return_values(outliers_df):
13181318
"low_indices",
13191319
"high_indices",
13201320
} == set(no_outliers_box_plot_info_with_optional.keys())
1321+
1322+
1323+
def test_infer_temporal_frequencies(datetime_freqs_df_pandas):
    # TODO: Add support for Dask and Koalas DataFrames
    df = datetime_freqs_df_pandas
    df.ww.init()

    inferred = df.ww.infer_temporal_frequencies()

    # Every column except the non-temporal "ints" column gets an entry.
    assert len(inferred) == len(df.columns) - 1
    assert "ints" not in inferred

    # Columns without a single uniform frequency map to None.
    no_freq_columns = {col for col, freq in inferred.items() if freq is None}
    assert no_freq_columns == {
        "same_date",
        "1d_skipped_one_freq",
        "3M_one_nan",
        "3B_no_freq",
    }
1340+
1341+
1342+
def test_infer_temporal_frequencies_with_columns(datetime_freqs_df_pandas):
    df = datetime_freqs_df_pandas
    df.ww.init(time_index="2D_freq")

    # Restricting inference to the time index yields exactly one entry.
    inferred = df.ww.infer_temporal_frequencies(temporal_columns=[df.ww.time_index])
    assert len(inferred) == 1
    assert inferred["2D_freq"] == "2D"

    # An empty column list produces an empty result.
    assert len(df.ww.infer_temporal_frequencies(temporal_columns=[])) == 0
1355+
1356+
1357+
def test_infer_temporal_frequencies_errors(datetime_freqs_df_pandas):
    datetime_freqs_df_pandas.ww.init()

    # A column missing from the DataFrame raises a ValueError.
    with pytest.raises(ValueError, match="Column not_present not found in dataframe."):
        datetime_freqs_df_pandas.ww.infer_temporal_frequencies(
            temporal_columns=["2D_freq", "not_present"]
        )

    # A non-temporal column raises a TypeError.
    with pytest.raises(
        TypeError,
        match="Cannot determine frequency for column ints with logical type Integer",
    ):
        datetime_freqs_df_pandas.ww.infer_temporal_frequencies(
            temporal_columns=["1d_skipped_one_freq", "ints"]
        )

woodwork/tests/accessor/test_table_accessor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ def test_accessor_init_errors_methods(sample_df):
328328
"to_disk": ["dir"],
329329
"to_dictionary": None,
330330
"value_counts": None,
331+
"infer_temporal_frequencies": None,
331332
}
332333
error = re.escape(
333334
"Woodwork not initialized for this DataFrame. Initialize by calling DataFrame.ww.init"

woodwork/tests/conftest.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,32 @@ def sample_df_koalas(sample_df_pandas):
119119
return ks.from_pandas(sample_df_pandas)
120120

121121

122+
@pytest.fixture()
def datetime_freqs_df_pandas():
    """Build a 10-row DataFrame of temporal columns with varying (and missing) frequencies."""
    two_day_range = pd.date_range(start="2020-01-01", end="2020-01-20", freq="2D")
    three_month_range = pd.date_range(start="2015-01-01", freq="3M", periods=10)

    return pd.DataFrame(
        {
            "2D_freq": two_day_range,
            "3M_freq": three_month_range,
            # Business-day frequencies are valid for date_range but cannot be
            # re-inferred by pd.infer_freq.
            "3B_no_freq": pd.date_range(start="2015-01-01", freq="3B", periods=10),
            # One dropped day breaks an otherwise daily frequency.
            "1d_skipped_one_freq": pd.date_range(
                start="2020-01-01", end="2020-01-11", freq="1D"
            ).drop("2020-01-04"),
            # A trailing None prevents any frequency from being inferred.
            "3M_one_nan": list(three_month_range.drop("2015-07-31")) + [None],
            # Subtracting a timestamp turns the datetimes into timedeltas.
            "2d_timedelta": two_day_range - pd.Timestamp("2020-01-01"),
            "ints": range(10),
            "same_date": ["2015-01-01"] * 10,
        }
    )
146+
147+
122148
@pytest.fixture(
123149
params=[
124150
"sample_unsorted_df_pandas",

0 commit comments

Comments
 (0)