Skip to content

Commit 377001a

Browse files
SNOW-2677419: Add support for resample functions in faster pandas (#3990)
1 parent d38d986 commit 377001a

File tree

3 files changed

+106
-0
lines changed

3 files changed

+106
-0
lines changed

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,19 @@
166166
- `groupby.resample`
167167
- `to_snowflake`
168168
- `to_snowpark`
169+
- `resample.min`
170+
- `resample.max`
171+
- `resample.count`
172+
- `resample.sum`
173+
- `resample.mean`
174+
- `resample.median`
175+
- `resample.std`
176+
- `resample.var`
177+
- `resample.size`
178+
- `resample.first`
179+
- `resample.last`
180+
- `resample.quantile`
181+
- `resample.nunique`
169182
- Make faster pandas disabled by default (opt-in instead of opt-out).
170183
- Improve performance of `drop_duplicates` by avoiding joins when `keep!=False` in faster pandas.
171184

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15752,6 +15752,42 @@ def resample(
1575215752
resample_method_args: tuple[Any],
1575315753
resample_method_kwargs: dict[str, Any],
1575415754
is_series: bool,
15755+
) -> Union["SnowflakeQueryCompiler", collections.defaultdict[Hashable, list]]:
15756+
"""
15757+
Wrapper around _resample_internal to be supported in faster pandas.
15758+
"""
15759+
relaxed_query_compiler = None
15760+
if self._relaxed_query_compiler is not None:
15761+
result = self._relaxed_query_compiler._resample_internal(
15762+
resample_kwargs=resample_kwargs,
15763+
resample_method=resample_method,
15764+
resample_method_args=resample_method_args,
15765+
resample_method_kwargs=resample_method_kwargs,
15766+
is_series=is_series,
15767+
)
15768+
if isinstance(result, SnowflakeQueryCompiler):
15769+
relaxed_query_compiler = result
15770+
else:
15771+
return result
15772+
result = self._resample_internal(
15773+
resample_kwargs=resample_kwargs,
15774+
resample_method=resample_method,
15775+
resample_method_args=resample_method_args,
15776+
resample_method_kwargs=resample_method_kwargs,
15777+
is_series=is_series,
15778+
)
15779+
if isinstance(result, SnowflakeQueryCompiler):
15780+
return self._maybe_set_relaxed_qc(result, relaxed_query_compiler)
15781+
else:
15782+
return result
15783+
15784+
def _resample_internal(
15785+
self,
15786+
resample_kwargs: dict[str, Any],
15787+
resample_method: AggFuncType,
15788+
resample_method_args: tuple[Any],
15789+
resample_method_kwargs: dict[str, Any],
15790+
is_series: bool,
1575515791
) -> Union["SnowflakeQueryCompiler", collections.defaultdict[Hashable, list]]:
1575615792
"""
1575715793
Return new SnowflakeQueryCompiler whose ordered frame holds the result of a resample operation.

tests/integ/modin/test_faster_pandas.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,6 +1029,63 @@ def test_rename(session):
10291029
assert_frame_equal(snow_result, native_result)
10301030

10311031

1032+
@pytest.mark.parametrize(
1033+
"func",
1034+
[
1035+
"max",
1036+
"min",
1037+
"mean",
1038+
"median",
1039+
"sum",
1040+
"std",
1041+
"var",
1042+
"count",
1043+
"size",
1044+
"first",
1045+
"last",
1046+
"quantile",
1047+
"nunique",
1048+
],
1049+
)
1050+
@sql_count_checker(query_count=5, join_count=1)
1051+
def test_resample(session, func):
1052+
with session_parameter_override(
1053+
session, "dummy_row_pos_optimization_enabled", True
1054+
):
1055+
# create tables
1056+
table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
1057+
session.create_dataframe(
1058+
native_pd.DataFrame(
1059+
{"A": np.random.randn(15)},
1060+
index=native_pd.date_range("2020-01-01", periods=15, freq="1h"),
1061+
).reset_index(drop=False)
1062+
).write.save_as_table(table_name, table_type="temp")
1063+
1064+
# create snow dataframes
1065+
df = pd.read_snowflake(table_name, index_col="index")
1066+
snow_result = getattr(df.resample(rule="2h", closed="left"), func)()
1067+
1068+
# verify that the input dataframe has a populated relaxed query compiler
1069+
assert df._query_compiler._relaxed_query_compiler is not None
1070+
assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
1071+
# verify that the output dataframe also has a populated relaxed query compiler
1072+
assert snow_result._query_compiler._relaxed_query_compiler is not None
1073+
assert (
1074+
snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode
1075+
is True
1076+
)
1077+
1078+
# create pandas dataframes
1079+
native_df = df.to_pandas()
1080+
native_result = getattr(native_df.resample(rule="2h", closed="left"), func)()
1081+
1082+
# compare results
1083+
if func == "size":
1084+
assert_series_equal(snow_result, native_result, check_freq=False)
1085+
else:
1086+
assert_frame_equal(snow_result, native_result, check_freq=False)
1087+
1088+
10321089
@pytest.mark.parametrize(
10331090
"func",
10341091
[

0 commit comments

Comments
 (0)