|
27 | 27 | _get_mutual_information_dict, |
28 | 28 | _get_valid_mi_columns, |
29 | 29 | _get_value_counts, |
| 30 | + _infer_temporal_frequencies, |
30 | 31 | ) |
31 | 32 | from woodwork.table_schema import TableSchema |
32 | 33 | from woodwork.type_sys.utils import _is_numeric_series, col_is_datetime |
@@ -1081,6 +1082,48 @@ def value_counts(self, ascending=False, top_n=10, dropna=False): |
1081 | 1082 | """ |
1082 | 1083 | return _get_value_counts(self._dataframe, ascending, top_n, dropna) |
1083 | 1084 |
|
@_check_table_schema
def infer_temporal_frequencies(self, temporal_columns=None):
    """Infer the observation frequency of each temporal column in the DataFrame.

    Observation frequencies are things like daily, biweekly, yearly, etc.
    Temporal columns are the ones whose logical type is Datetime or Timedelta.
    Not supported for Dask and Koalas DataFrames.

    Args:
        temporal_columns (list[str], optional): Columns for which frequencies
            should be inferred. Must be columns that are present in the
            DataFrame and are temporal in nature. Defaults to None. If not
            specified, all temporal columns will have their frequencies
            inferred.

    Returns:
        (dict): A dictionary where each key is a temporal column from the
        DataFrame, and the value is its observation frequency represented as
        a pandas offset alias string (D, M, Y, etc.) or None if no uniform
        frequency was present in the data.

    Note:
        The pandas util ``pd.infer_freq``, which is used in this method, has
        the following behaviors:

        - If even one row in a column does not follow the frequency seen in
          the remaining rows, no frequency will be inferred. Example of
          otherwise daily data that skips one day:
          ``['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-07']``.
        - If any NaNs are present in the data, no frequency will be inferred.
        - Pandas will use the largest offset alias available to it, so ``W``
          will be inferred for weekly data instead of ``7D``. The list of
          available offset aliases, which include aliases such as ``B`` for
          business day or ``N`` for nanosecond, can be found at
          https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        - Offset aliases can be combined to create something like ``2d1H``,
          which could also be expressed as ``49H``. Pandas' frequency
          inference will return the lower common alias, ``49H``, in
          situations when it'd otherwise need to combine aliases.
        - Offset strings can contain more information than just the offset
          alias. For example, a date range
          ``pd.date_range(start="2020-01-01", freq="w", periods=10)`` will be
          inferred to have frequency ``W-SUN``. That string is an offset
          alias with an anchoring suffix that indicates that the data is not
          only observed at a weekly frequency, but that all the dates are on
          Sundays. More anchored offsets can be seen here:
          https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#anchored-offsets
        - Some frequencies that can be defined for a ``pd.date_range`` cannot
          then be re-inferred by pandas' ``pd.infer_freq``. One example of
          this can be seen when using the business day offset alias ``B``:
          ``pd.date_range(start="2020-01-01", freq="4b", periods=10)`` uses a
          valid ``freq`` parameter when building the date range, but that
          frequency is not then inferrable.
    """
    # All of the work is delegated to the shared helper; this accessor method
    # only supplies the underlying DataFrame and forwards the column filter.
    inferred_frequencies = _infer_temporal_frequencies(
        self._dataframe,
        temporal_columns=temporal_columns,
    )
    return inferred_frequencies
| 1126 | + |
1084 | 1127 |
|
1085 | 1128 | def _validate_accessor_params( |
1086 | 1129 | dataframe, index, time_index, logical_types, schema, use_standard_tags |
|
0 commit comments