Skip to content

Commit

Permalink
Remove cudf.Scalar from scatter APIs (#17847)
Browse files Browse the repository at this point in the history
Towards #17843

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: #17847
  • Loading branch information
mroeschke authored Jan 30, 2025
1 parent f949dee commit a46307a
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 33 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_internals/copying.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def gather(

@acquire_spill_lock()
def scatter(
sources: list[ColumnBase | cudf.Scalar],
sources: list[ColumnBase | plc.Scalar],
scatter_map: NumericalColumn,
target_columns: list[ColumnBase],
bounds_check: bool = True,
Expand Down Expand Up @@ -67,7 +67,7 @@ def scatter(
plc_tbl = plc.copying.scatter(
plc.Table([col.to_pylibcudf(mode="read") for col in sources]) # type: ignore[union-attr]
if isinstance(sources[0], cudf._lib.column.Column)
else [slr.device_value for slr in sources], # type: ignore[union-attr]
else sources, # type: ignore[union-attr]
scatter_map.to_pylibcudf(mode="read"),
plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
)
Expand Down
18 changes: 13 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
import pyarrow as pa
from typing_extensions import Self

import pylibcudf as plc

import cudf
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.utils.dtypes import (
SIZE_TYPE_DTYPE,
find_common_type,
Expand Down Expand Up @@ -657,17 +660,22 @@ def __setitem__(self, key, value):

def _fill(
self,
fill_value: ScalarLike,
fill_value: plc.Scalar,
begin: int,
end: int,
inplace: bool = False,
) -> Self:
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

fill_code = self._encode(fill_value)
fill_code = self._encode(plc.interop.to_arrow(fill_value))
result = self if inplace else self.copy()
result.codes._fill(fill_code, begin, end, inplace=True)
result.codes._fill(
pa_scalar_to_plc_scalar(pa.scalar(fill_code)),
begin,
end,
inplace=True,
)
return result

def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
Expand Down Expand Up @@ -1017,7 +1025,7 @@ def isnull(self) -> ColumnBase:
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result | (self.codes == cudf.Scalar(code))
result = result | (self.codes == code)

return result

Expand All @@ -1033,7 +1041,7 @@ def notnull(self) -> ColumnBase:
categories = self.categories.isnan()
if categories.any():
code = self._encode(np.nan)
result = result & (self.codes != cudf.Scalar(code))
result = result & (self.codes != code)

return result

Expand Down
28 changes: 12 additions & 16 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,33 +421,29 @@ def memory_usage(self) -> int:

def _fill(
self,
fill_value: ScalarLike,
fill_value: plc.Scalar,
begin: int,
end: int,
inplace: bool = False,
) -> Self | None:
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

# Constructing a cuDF scalar can cut unnecessary DtoH copy if
# the scalar is None when calling `is_valid`.
slr = cudf.Scalar(fill_value, dtype=self.dtype)

if not inplace or is_string_dtype(self.dtype):
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.filling.fill(
self.to_pylibcudf(mode="read"),
begin,
end,
slr.device_value,
fill_value,
)
)
if is_string_dtype(self.dtype):
return self._mimic_inplace(result, inplace=True)
return result # type: ignore[return-value]

if not slr.is_valid() and not self.nullable:
if not fill_value.is_valid() and not self.nullable:
mask = as_buffer(
plc.null_mask.create_null_mask(
self.size, plc.null_mask.MaskState.ALL_VALID
Expand All @@ -460,7 +456,7 @@ def _fill(
self.to_pylibcudf(mode="write"),
begin,
end,
slr.device_value,
fill_value,
)
return self

Expand Down Expand Up @@ -629,8 +625,8 @@ def __setitem__(self, key: Any, value: Any):
"""

# Normalize value to scalar/column
value_normalized: cudf.Scalar | ColumnBase = (
cudf.Scalar(value, dtype=self.dtype)
value_normalized: plc.Scalar | ColumnBase = (
cudf.Scalar(value, dtype=self.dtype).device_value
if is_scalar(value)
else as_column(value, dtype=self.dtype)
)
Expand Down Expand Up @@ -658,7 +654,7 @@ def _wrap_binop_normalization(self, other):
def _scatter_by_slice(
self,
key: builtins.slice,
value: cudf.core.scalar.Scalar | ColumnBase,
value: plc.Scalar | ColumnBase,
) -> Self | None:
"""If this function returns None, it's either a no-op (slice is empty),
or the inplace replacement is already performed (fill-in-place).
Expand All @@ -672,12 +668,12 @@ def _scatter_by_slice(
self._check_scatter_key_length(num_keys, value)

if step == 1 and not isinstance(
self, (cudf.core.column.StructColumn, cudf.core.column.ListColumn)
self.dtype, (cudf.StructDtype, cudf.ListDtype)
):
# NOTE: List & Struct dtypes aren't supported by both
# inplace & out-of-place fill. Hence we need to use scatter for
# these two types.
if isinstance(value, cudf.core.scalar.Scalar):
if isinstance(value, plc.Scalar):
return self._fill(value, start, stop, inplace=True)
else:
with acquire_spill_lock():
Expand Down Expand Up @@ -705,7 +701,7 @@ def _scatter_by_slice(
def _scatter_by_column(
self,
key: cudf.core.column.NumericalColumn,
value: cudf.core.scalar.Scalar | ColumnBase,
value: plc.Scalar | ColumnBase,
bounds_check: bool = True,
) -> Self:
if key.dtype.kind == "b":
Expand Down Expand Up @@ -738,7 +734,7 @@ def _scatter_by_column(
plc_table = plc.copying.boolean_mask_scatter(
plc.Table([value.to_pylibcudf(mode="read")])
if isinstance(value, Column)
else [value.device_value],
else [value],
plc.Table([self.to_pylibcudf(mode="read")]),
key.to_pylibcudf(mode="read"),
)
Expand All @@ -753,7 +749,7 @@ def _scatter_by_column(
)[0]._with_type_metadata(self.dtype)

def _check_scatter_key_length(
self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase
self, num_keys: int, value: plc.Scalar | ColumnBase
) -> None:
"""`num_keys` is the number of keys to scatter. Should equal to the
number of rows in ``value`` if ``value`` is a column.
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column.column import ColumnBase, as_column
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.utils.dtypes import _get_base_dtype, cudf_dtype_to_pa_type
from cudf.utils.utils import (
_all_bools_with_nulls,
_datetime_timedelta_find_and_replace,
Expand Down Expand Up @@ -949,7 +950,9 @@ def tz_localize(
)
localized = self._scatter_by_column(
self.isnull() | (ambiguous_col | nonexistent_col),
cudf.Scalar(cudf.NaT, dtype=self.dtype),
pa_scalar_to_plc_scalar(
pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype))
),
)

transition_times, offsets = get_tz_data(tzname)
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,8 @@ def get(
# replace the value in those rows (should be NA) with `default`
if out_of_bounds_mask.any():
out = out._scatter_by_column(
out_of_bounds_mask, cudf.Scalar(default)
out_of_bounds_mask,
pa_scalar_to_plc_scalar(pa.scalar(default)),
)
if out.dtype != self._column.dtype.element_type:
# libcudf doesn't maintain struct labels so we must transfer over
Expand Down
11 changes: 8 additions & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def __setitem__(self, key: Any, value: Any):
"""

# Normalize value to scalar/column
device_value: cudf.Scalar | ColumnBase = (
value_normalized: cudf.Scalar | ColumnBase = (
cudf.Scalar(
value,
dtype=self.dtype
Expand All @@ -160,12 +160,17 @@ def __setitem__(self, key: Any, value: Any):
else as_column(value)
)

if self.dtype.kind != "b" and device_value.dtype.kind == "b":
if self.dtype.kind != "b" and value_normalized.dtype.kind == "b":
raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
else:
device_value = device_value.astype(self.dtype)
value_normalized = value_normalized.astype(self.dtype)

out: ColumnBase | None # If None, no need to perform mimic inplace.
device_value = (
value_normalized.device_value
if isinstance(value_normalized, cudf.Scalar)
else value_normalized
)
if isinstance(key, slice):
out = self._scatter_by_slice(key, device_value)
else:
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import cupy as cp
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import pylibcudf as plc
Expand Down Expand Up @@ -49,6 +50,7 @@
from cudf.core.missing import NA
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import _Resampler
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.core.udf.utils import (
_compile_or_get,
_get_input_args_from_frame,
Expand Down Expand Up @@ -3258,7 +3260,7 @@ def duplicated(
True, length=len(self), dtype=bool
)._scatter_by_column(
distinct,
cudf.Scalar(False),
pa_scalar_to_plc_scalar(pa.scalar(False)),
bounds_check=False,
)
return cudf.Series._from_column(result, index=self.index, name=name)
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar
from cudf.core import column
from cudf.core._base_index import _return_get_indexer_result
from cudf.core._internals import copying, sorting
from cudf.core._internals import sorting
from cudf.core.algorithms import factorize
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1964,8 +1964,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
)
scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
indices = libcudf.column.Column.from_pylibcudf(right_plc)
result = copying.scatter([indices], scatter_map, [result])[0]
result_series = cudf.Series._from_column(result)
result_series = cudf.Series._from_column(
result._scatter_by_column(scatter_map, indices)
)

if method in {"ffill", "bfill", "pad", "backfill"}:
result_series = _get_indexer_basic(
Expand Down

0 comments on commit a46307a

Please sign in to comment.