From a28c4698bc5543641d3e2aeff58daa821ca08d2a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:35:37 -0800 Subject: [PATCH 1/3] Avoid shallow copies in groupby methods --- python/cudf/cudf/core/groupby/groupby.py | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index be3cc410174..0f8ed917d5d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -48,7 +48,7 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable from cudf._typing import ( AggType, @@ -2445,7 +2445,7 @@ def _cov_or_corr(self, func, method_name): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be used in the correlation or covariance # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names + column_names = self.grouping.values_column_names num_cols = len(column_names) column_pair_structs = {} @@ -2679,10 +2679,8 @@ def diff(self, periods=1, axis=0): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values - self.shift(periods=periods) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: @@ -2786,9 +2784,8 @@ def fillna( raise ValueError("Method can only be of 'ffill', 'bfill'.") return getattr(self, method, limit)() - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -3542,6 +3539,13 @@ def keys(self): self._key_columns[0], name=self.names[0] ) + @property + def values_column_names(self) -> list[Hashable]: + # If the key columns are in `obj`, filter them out + return [ + x for x in self._obj._column_names if x not in self._named_columns + ] + @property def values(self) -> cudf.core.frame.Frame: """Return value columns as a frame. @@ -3552,11 +3556,9 @@ def values(self) -> cudf.core.frame.Frame: This is mainly used in transform-like operations. """ - # If the key columns are in `obj`, filter them out - value_column_names = [ - x for x in self._obj._column_names if x not in self._named_columns - ] - value_columns = self._obj._data.select_by_label(value_column_names) + value_columns = self._obj._data.select_by_label( + self.values_column_names + ) return self._obj.__class__._from_data(value_columns) def _handle_callable(self, by): From 5306981a4d11b84ee8972732ed32075b6028f369 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:25:29 -0800 Subject: [PATCH 2/3] Make _values_column_names private --- python/cudf/cudf/core/groupby/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0f8ed917d5d..abecdefe6e5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2445,7 +2445,7 @@ def _cov_or_corr(self, func, method_name): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be used in the correlation or covariance # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values_column_names + column_names = self.grouping._values_column_names num_cols = len(column_names) column_pair_structs = {} @@ -3540,7 +3540,7 @@ def keys(self): ) @property - def values_column_names(self) -> list[Hashable]: + def _values_column_names(self) -> list[Hashable]: # If the key columns are in `obj`, filter them out return [ x for x in self._obj._column_names if x not in self._named_columns @@ -3557,7 +3557,7 @@ def values(self) -> cudf.core.frame.Frame: This is mainly used in transform-like operations. """ value_columns = self._obj._data.select_by_label( - self.values_column_names + self._values_column_names ) return self._obj.__class__._from_data(value_columns) From 093e5579b2798507aae2cde9beb8a6eaa25aa97f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:55:18 -0800 Subject: [PATCH 3/3] copyright --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3c5e8dd1ca9..6ae524d6346 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import copy