From a28c4698bc5543641d3e2aeff58daa821ca08d2a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 20 Dec 2024 12:35:37 -0800
Subject: [PATCH 1/3] Avoid shallow copies in groupby methods

---
 python/cudf/cudf/core/groupby/groupby.py | 30 +++++++++++++-----------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index be3cc410174..0f8ed917d5d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -48,7 +48,7 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Iterable
+    from collections.abc import Generator, Hashable, Iterable
 
     from cudf._typing import (
         AggType,
@@ -2445,7 +2445,7 @@ def _cov_or_corr(self, func, method_name):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be used in the correlation or covariance
         # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        column_names = self.grouping.values._column_names
+        column_names = self.grouping.values_column_names
         num_cols = len(column_names)
 
         column_pair_structs = {}
@@ -2679,10 +2679,8 @@ def diff(self, periods=1, axis=0):
 
         if not axis == 0:
             raise NotImplementedError("Only axis=0 is supported.")
-
-        values = self.obj.__class__._from_data(
-            self.grouping.values._data, self.obj.index
-        )
+        values = self.grouping.values
+        values.index = self.obj.index
         return values - self.shift(periods=periods)
 
     def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
@@ -2786,9 +2784,8 @@ def fillna(
                 raise ValueError("Method can only be of 'ffill', 'bfill'.")
             return getattr(self, method, limit)()
 
-        values = self.obj.__class__._from_data(
-            self.grouping.values._data, self.obj.index
-        )
+        values = self.grouping.values
+        values.index = self.obj.index
         return values.fillna(
             value=value, inplace=inplace, axis=axis, limit=limit
         )
@@ -3542,6 +3539,13 @@ def keys(self):
                 self._key_columns[0], name=self.names[0]
             )
 
+    @property
+    def values_column_names(self) -> list[Hashable]:
+        # If the key columns are in `obj`, filter them out
+        return [
+            x for x in self._obj._column_names if x not in self._named_columns
+        ]
+
     @property
     def values(self) -> cudf.core.frame.Frame:
         """Return value columns as a frame.
@@ -3552,11 +3556,9 @@ def values(self) -> cudf.core.frame.Frame:
 
         This is mainly used in transform-like operations.
         """
-        # If the key columns are in `obj`, filter them out
-        value_column_names = [
-            x for x in self._obj._column_names if x not in self._named_columns
-        ]
-        value_columns = self._obj._data.select_by_label(value_column_names)
+        value_columns = self._obj._data.select_by_label(
+            self.values_column_names
+        )
         return self._obj.__class__._from_data(value_columns)
 
     def _handle_callable(self, by):

From 5306981a4d11b84ee8972732ed32075b6028f369 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 20 Dec 2024 14:25:29 -0800
Subject: [PATCH 2/3] Make _values_column_names private

---
 python/cudf/cudf/core/groupby/groupby.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 0f8ed917d5d..abecdefe6e5 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2445,7 +2445,7 @@ def _cov_or_corr(self, func, method_name):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be used in the correlation or covariance
         # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        column_names = self.grouping.values_column_names
+        column_names = self.grouping._values_column_names
         num_cols = len(column_names)
 
         column_pair_structs = {}
@@ -3540,7 +3540,7 @@ def keys(self):
             )
 
     @property
-    def values_column_names(self) -> list[Hashable]:
+    def _values_column_names(self) -> list[Hashable]:
         # If the key columns are in `obj`, filter them out
         return [
             x for x in self._obj._column_names if x not in self._named_columns
@@ -3557,7 +3557,7 @@ def values(self) -> cudf.core.frame.Frame:
         This is mainly used in transform-like operations.
         """
         value_columns = self._obj._data.select_by_label(
-            self.values_column_names
+            self._values_column_names
         )
         return self._obj.__class__._from_data(value_columns)
 

From 093e5579b2798507aae2cde9beb8a6eaa25aa97f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:55:18 -0800
Subject: [PATCH 3/3] copyright

---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3c5e8dd1ca9..6ae524d6346 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import copy