Remove cudf.Scalar from scatter APIs (#17847)

Towards #17843 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: #17847
rapidsai · Jan 30, 2025 · a46307a · a46307a
1 parent f949dee
commit a46307a
Show file tree

Hide file tree

Showing 8 changed files with 49 additions and 33 deletions.
diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py
@@ -36,7 +36,7 @@ def gather(
 
 @acquire_spill_lock()
 def scatter(
-    sources: list[ColumnBase | cudf.Scalar],
+    sources: list[ColumnBase | plc.Scalar],
     scatter_map: NumericalColumn,
     target_columns: list[ColumnBase],
     bounds_check: bool = True,
@@ -67,7 +67,7 @@ def scatter(
     plc_tbl = plc.copying.scatter(
         plc.Table([col.to_pylibcudf(mode="read") for col in sources])  # type: ignore[union-attr]
         if isinstance(sources[0], cudf._lib.column.Column)
-        else [slr.device_value for slr in sources],  # type: ignore[union-attr]
+        else sources,  # type: ignore[union-attr]
         scatter_map.to_pylibcudf(mode="read"),
         plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
     )

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -11,10 +11,13 @@
 import pyarrow as pa
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 import cudf
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
+from cudf.core.scalar import pa_scalar_to_plc_scalar
 from cudf.utils.dtypes import (
     SIZE_TYPE_DTYPE,
     find_common_type,
@@ -657,17 +660,22 @@ def __setitem__(self, key, value):
 
     def _fill(
         self,
-        fill_value: ScalarLike,
+        fill_value: plc.Scalar,
         begin: int,
         end: int,
         inplace: bool = False,
     ) -> Self:
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
-        fill_code = self._encode(fill_value)
+        fill_code = self._encode(plc.interop.to_arrow(fill_value))
         result = self if inplace else self.copy()
-        result.codes._fill(fill_code, begin, end, inplace=True)
+        result.codes._fill(
+            pa_scalar_to_plc_scalar(pa.scalar(fill_code)),
+            begin,
+            end,
+            inplace=True,
+        )
         return result
 
     def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
@@ -1017,7 +1025,7 @@ def isnull(self) -> ColumnBase:
             categories = self.categories.isnan()
             if categories.any():
                 code = self._encode(np.nan)
-                result = result | (self.codes == cudf.Scalar(code))
+                result = result | (self.codes == code)
 
         return result
 
@@ -1033,7 +1041,7 @@ def notnull(self) -> ColumnBase:
             categories = self.categories.isnan()
             if categories.any():
                 code = self._encode(np.nan)
-                result = result & (self.codes != cudf.Scalar(code))
+                result = result & (self.codes != code)
 
         return result
 

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -421,33 +421,29 @@ def memory_usage(self) -> int:
 
     def _fill(
         self,
-        fill_value: ScalarLike,
+        fill_value: plc.Scalar,
         begin: int,
         end: int,
         inplace: bool = False,
     ) -> Self | None:
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
-        # Constructing a cuDF scalar can cut unnecessary DtoH copy if
-        # the scalar is None when calling `is_valid`.
-        slr = cudf.Scalar(fill_value, dtype=self.dtype)
-
         if not inplace or is_string_dtype(self.dtype):
             with acquire_spill_lock():
                 result = type(self).from_pylibcudf(
                     plc.filling.fill(
                         self.to_pylibcudf(mode="read"),
                         begin,
                         end,
-                        slr.device_value,
+                        fill_value,
                     )
                 )
             if is_string_dtype(self.dtype):
                 return self._mimic_inplace(result, inplace=True)
             return result  # type: ignore[return-value]
 
-        if not slr.is_valid() and not self.nullable:
+        if not fill_value.is_valid() and not self.nullable:
             mask = as_buffer(
                 plc.null_mask.create_null_mask(
                     self.size, plc.null_mask.MaskState.ALL_VALID
@@ -460,7 +456,7 @@ def _fill(
                 self.to_pylibcudf(mode="write"),
                 begin,
                 end,
-                slr.device_value,
+                fill_value,
             )
         return self
 
@@ -629,8 +625,8 @@ def __setitem__(self, key: Any, value: Any):
         """
 
         # Normalize value to scalar/column
-        value_normalized: cudf.Scalar | ColumnBase = (
-            cudf.Scalar(value, dtype=self.dtype)
+        value_normalized: plc.Scalar | ColumnBase = (
+            cudf.Scalar(value, dtype=self.dtype).device_value
             if is_scalar(value)
             else as_column(value, dtype=self.dtype)
         )
@@ -658,7 +654,7 @@ def _wrap_binop_normalization(self, other):
     def _scatter_by_slice(
         self,
         key: builtins.slice,
-        value: cudf.core.scalar.Scalar | ColumnBase,
+        value: plc.Scalar | ColumnBase,
     ) -> Self | None:
         """If this function returns None, it's either a no-op (slice is empty),
         or the inplace replacement is already performed (fill-in-place).
@@ -672,12 +668,12 @@ def _scatter_by_slice(
         self._check_scatter_key_length(num_keys, value)
 
         if step == 1 and not isinstance(
-            self, (cudf.core.column.StructColumn, cudf.core.column.ListColumn)
+            self.dtype, (cudf.StructDtype, cudf.ListDtype)
         ):
             # NOTE: List & Struct dtypes aren't supported by both
             # inplace & out-of-place fill. Hence we need to use scatter for
             # these two types.
-            if isinstance(value, cudf.core.scalar.Scalar):
+            if isinstance(value, plc.Scalar):
                 return self._fill(value, start, stop, inplace=True)
             else:
                 with acquire_spill_lock():
@@ -705,7 +701,7 @@ def _scatter_by_slice(
     def _scatter_by_column(
         self,
         key: cudf.core.column.NumericalColumn,
-        value: cudf.core.scalar.Scalar | ColumnBase,
+        value: plc.Scalar | ColumnBase,
         bounds_check: bool = True,
     ) -> Self:
         if key.dtype.kind == "b":
@@ -738,7 +734,7 @@ def _scatter_by_column(
                 plc_table = plc.copying.boolean_mask_scatter(
                     plc.Table([value.to_pylibcudf(mode="read")])
                     if isinstance(value, Column)
-                    else [value.device_value],
+                    else [value],
                     plc.Table([self.to_pylibcudf(mode="read")]),
                     key.to_pylibcudf(mode="read"),
                 )
@@ -753,7 +749,7 @@ def _scatter_by_column(
             )[0]._with_type_metadata(self.dtype)
 
     def _check_scatter_key_length(
-        self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase
+        self, num_keys: int, value: plc.Scalar | ColumnBase
     ) -> None:
         """`num_keys` is the number of keys to scatter. Should equal to the
         number of rows in ``value`` if ``value`` is a column.

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -30,7 +30,8 @@
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase, as_column
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
-from cudf.utils.dtypes import _get_base_dtype
+from cudf.core.scalar import pa_scalar_to_plc_scalar
+from cudf.utils.dtypes import _get_base_dtype, cudf_dtype_to_pa_type
 from cudf.utils.utils import (
     _all_bools_with_nulls,
     _datetime_timedelta_find_and_replace,
@@ -949,7 +950,9 @@ def tz_localize(
         )
         localized = self._scatter_by_column(
             self.isnull() | (ambiguous_col | nonexistent_col),
-            cudf.Scalar(cudf.NaT, dtype=self.dtype),
+            pa_scalar_to_plc_scalar(
+                pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype))
+            ),
         )
 
         transition_times, offsets = get_tz_data(tzname)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -545,7 +545,8 @@ def get(
             # replace the value in those rows (should be NA) with `default`
             if out_of_bounds_mask.any():
                 out = out._scatter_by_column(
-                    out_of_bounds_mask, cudf.Scalar(default)
+                    out_of_bounds_mask,
+                    pa_scalar_to_plc_scalar(pa.scalar(default)),
                 )
         if out.dtype != self._column.dtype.element_type:
             # libcudf doesn't maintain struct labels so we must transfer over

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -149,7 +149,7 @@ def __setitem__(self, key: Any, value: Any):
         """
 
         # Normalize value to scalar/column
-        device_value: cudf.Scalar | ColumnBase = (
+        value_normalized: cudf.Scalar | ColumnBase = (
             cudf.Scalar(
                 value,
                 dtype=self.dtype
@@ -160,12 +160,17 @@ def __setitem__(self, key: Any, value: Any):
             else as_column(value)
         )
 
-        if self.dtype.kind != "b" and device_value.dtype.kind == "b":
+        if self.dtype.kind != "b" and value_normalized.dtype.kind == "b":
             raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
         else:
-            device_value = device_value.astype(self.dtype)
+            value_normalized = value_normalized.astype(self.dtype)
 
         out: ColumnBase | None  # If None, no need to perform mimic inplace.
+        device_value = (
+            value_normalized.device_value
+            if isinstance(value_normalized, cudf.Scalar)
+            else value_normalized
+        )
         if isinstance(key, slice):
             out = self._scatter_by_slice(key, device_value)
         else:

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -20,6 +20,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from typing_extensions import Self
 
 import pylibcudf as plc
@@ -49,6 +50,7 @@
 from cudf.core.missing import NA
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import _Resampler
+from cudf.core.scalar import pa_scalar_to_plc_scalar
 from cudf.core.udf.utils import (
     _compile_or_get,
     _get_input_args_from_frame,
@@ -3258,7 +3260,7 @@ def duplicated(
             True, length=len(self), dtype=bool
         )._scatter_by_column(
             distinct,
-            cudf.Scalar(False),
+            pa_scalar_to_plc_scalar(pa.scalar(False)),
             bounds_check=False,
         )
         return cudf.Series._from_column(result, index=self.index, name=name)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -21,7 +21,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
-from cudf.core._internals import copying, sorting
+from cudf.core._internals import sorting
 from cudf.core.algorithms import factorize
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column_accessor import ColumnAccessor
@@ -1964,8 +1964,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
             scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
             indices = libcudf.column.Column.from_pylibcudf(right_plc)
-        result = copying.scatter([indices], scatter_map, [result])[0]
-        result_series = cudf.Series._from_column(result)
+        result_series = cudf.Series._from_column(
+            result._scatter_by_column(scatter_map, indices)
+        )
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
             result_series = _get_indexer_basic(