Skip to content

Commit

Permalink
Make column_empty mask buffer creation consistent with libcudf (#16715)
Browse files Browse the repository at this point in the history

Based on offline discussions, this PR makes `column_empty` consistent with libcudf, where:

* A size 0 "empty" column should not have a mask buffer
* A size > 0 "empty" (i.e., all-null) column should have a mask buffer

Additionally, this PR removes `column_empty_like`, which can be subsumed by `column_empty` (I didn't find any active usage of this method across RAPIDS: https://github.com/search?q=org%3Arapidsai%20column_empty_like&type=code)

`column_empty` will still have an unused `masked` argument; since this method is used across RAPIDS, I'll need to adjust those call sites before removing that keyword here (https://github.com/search?q=org%3Arapidsai%20column_empty&type=code)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #16715
  • Loading branch information
mroeschke authored Dec 3, 2024
1 parent 7cc9a9f commit 541e7e8
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 63 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
as_column,
build_column,
column_empty,
column_empty_like,
concat_columns,
deserialize_columns,
serialize_columns,
Expand Down
66 changes: 27 additions & 39 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ def take(
"""
# Handle zero size
if indices.size == 0:
return cast(Self, column_empty_like(self, newsize=0))
return cast(Self, column_empty(row_count=0, dtype=self.dtype))

# TODO: For performance, the check and conversion of gather map should
# be done by the caller. This check will be removed in future release.
Expand Down Expand Up @@ -1222,7 +1222,6 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]:
"data": (self.data_ptr, False),
"version": 1,
}

if self.nullable and self.has_nulls():
# Create a simple Python object that exposes the
# `__cuda_array_interface__` attribute here since we need to modify
Expand Down Expand Up @@ -1516,37 +1515,6 @@ def _return_sentinel_column():
return codes.fillna(na_sentinel.value)


def column_empty_like(
    column: ColumnBase,
    dtype: Dtype | None = None,
    masked: bool = False,
    newsize: int | None = None,
) -> ColumnBase:
    """
    Allocate a new column modelled on *column*.

    Parameters
    ----------
    column : ColumnBase
        Column whose dtype (and, by default, length) is mirrored.
    dtype : Dtype, optional
        Dtype of the new column. Falls back to ``column.dtype`` when None.
    masked : bool, default False
        Forwarded to ``column_empty`` (and to the recursive call that
        allocates the codes of a categorical column).
    newsize : int, optional
        Row count of the new column. Falls back to ``len(column)`` when
        None.

    Returns
    -------
    ColumnBase
        The newly allocated column.
    """
    target_dtype = column.dtype if dtype is None else dtype
    n_rows = newsize if newsize is not None else len(column)

    keeps_categorical_dtype = (
        hasattr(column, "dtype")
        and isinstance(column.dtype, cudf.CategoricalDtype)
        and target_dtype == column.dtype
    )
    if not keeps_categorical_dtype:
        return column_empty(n_rows, target_dtype, masked)

    # Categorical columns carry their payload in a child ``codes`` column,
    # so allocate the codes first and wrap them with the categorical dtype.
    categorical = cast("cudf.core.column.CategoricalColumn", column)
    empty_codes = column_empty_like(
        categorical.codes, masked=masked, newsize=newsize
    )
    return build_column(
        data=None,
        dtype=target_dtype,
        mask=empty_codes.base_mask,
        children=(empty_codes,),
        size=empty_codes.size,
    )


def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
"""Check if an object dtype Series or array contains NaN."""
return any(
Expand All @@ -1556,9 +1524,31 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:


def column_empty(
row_count: int, dtype: Dtype = "object", masked: bool = False
row_count: int,
dtype: Dtype = "object",
masked: bool = False,
for_numba: bool = False,
) -> ColumnBase:
"""Allocate a new column like the given row_count and dtype."""
"""
Allocate a new column with the given row_count and dtype.
* Passing row_count == 0 creates a size 0 column without a mask buffer.
* Passing row_count > 0 creates an all null column with a mask buffer.
Parameters
----------
row_count : int
Number of elements in the column.
dtype : Dtype
Type of the column.
masked : bool
Unused.
for_numba : bool, default False
If True, don't allocate a mask as it's not supported by numba.
"""
dtype = cudf.dtype(dtype)
children: tuple[ColumnBase, ...] = ()

Expand Down Expand Up @@ -1600,7 +1590,7 @@ def column_empty(
else:
data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))

if masked:
if row_count > 0 and not for_numba:
mask = as_buffer(
plc.null_mask.create_null_mask(
row_count, plc.null_mask.MaskState.ALL_NULL
Expand Down Expand Up @@ -2353,9 +2343,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
if not is_dtype_equal(obj.dtype, head.dtype):
# if all null, cast to appropriate dtype
if obj.null_count == len(obj):
objs[i] = column_empty_like(
head, dtype=head.dtype, masked=True, newsize=len(obj)
)
objs[i] = column_empty(row_count=len(obj), dtype=head.dtype)
else:
raise ValueError("All columns must be the same type")

Expand Down
14 changes: 6 additions & 8 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1424,8 +1424,8 @@ def __setitem__(self, arg, value):
new_columns = (
value
if key == arg
else column.column_empty_like(
col, masked=True, newsize=length
else column.column_empty(
row_count=length, dtype=col.dtype
)
for key, col in self._column_labels_and_values
)
Expand Down Expand Up @@ -3385,10 +3385,8 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
if num_cols != 0:
ca = self._data._from_columns_like_self(
(
column.column_empty_like(
col_data, masked=True, newsize=length
)
for col_data in self._columns
column.column_empty(row_count=length, dtype=dtype)
for _, dtype in self._dtypes
),
verify=False,
)
Expand Down Expand Up @@ -6191,8 +6189,8 @@ def quantile(
quant_index=False,
)._column
if len(res) == 0:
res = column.column_empty_like(
qs, dtype=ser.dtype, masked=True, newsize=len(qs)
res = column.column_empty(
row_count=len(qs), dtype=ser.dtype
)
result[k] = res
result = DataFrame._from_data(result)
Expand Down
18 changes: 9 additions & 9 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from cudf.api.extensions import no_default
from cudf.api.types import is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core.column import ColumnBase, as_column, column_empty_like
from cudf.core.column import ColumnBase, as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils.dtypes import min_unsigned_type

Expand Down Expand Up @@ -421,8 +421,8 @@ def concat(
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
result_data[name] = cudf.core.column.column_empty_like(
col, newsize=0
result_data[name] = column_empty(
row_count=0, dtype=col.dtype
)
else:
result_data[name] = col
Expand Down Expand Up @@ -458,8 +458,8 @@ def concat(
else:
col_label = (k, name)
if empty_inner:
result_data[col_label] = (
cudf.core.column.column_empty_like(col, newsize=0)
result_data[col_label] = column_empty(
row_count=0, dtype=col.dtype
)
else:
result_data[col_label] = col
Expand Down Expand Up @@ -995,9 +995,7 @@ def as_tuple(x):
]
new_size = nrows * len(names)
scatter_map = (columns_idx * np.int32(nrows)) + index_idx
target_col = cudf.core.column.column_empty_like(
col, masked=True, newsize=new_size
)
target_col = column_empty(row_count=new_size, dtype=col.dtype)
target_col[scatter_map] = col
target = cudf.Index._from_column(target_col)
result.update(
Expand Down Expand Up @@ -1300,7 +1298,9 @@ def _one_hot_encode_column(
"""
if isinstance(column.dtype, cudf.CategoricalDtype):
if column.size == column.null_count:
column = column_empty_like(categories, newsize=column.size)
column = column_empty(
row_count=column.size, dtype=categories.dtype
)
else:
column = column._get_decategorized_column() # type: ignore[attr-defined]

Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/udf/groupby_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,9 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
offsets = cp.asarray(offsets)
ngroups = len(offsets) - 1

output = cudf.core.column.column_empty(ngroups, dtype=return_type)

output = cudf.core.column.column_empty(
ngroups, dtype=return_type, for_numba=True
)
launch_args = [
offsets,
output,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ def test_listcol_setitem_retain_dtype():
{"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]}
)
df1 = df.head(0)
# Performing a setitem on `b` triggers a `column.column_empty_like` call
# Performing a setitem on `b` triggers a `column.column_empty` call
# which tries to create an empty ListColumn.
df1["b"] = df1["c"]
# Performing a copy to trigger a copy dtype which is obtained by accessing
Expand Down
25 changes: 25 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4158,6 +4158,31 @@ def test_parquet_reader_with_mismatched_schemas_error():
)


def test_parquet_roundtrip_zero_rows_no_column_mask():
    """Round-trip a zero-row frame of ``column_empty`` columns via parquet."""
    # One zero-length column per dtype family exercised by the round trip.
    dtype_by_label = {
        "int": "int64",
        "float": "float64",
        "datetime": "datetime64[ns]",
        "timedelta": "timedelta64[ns]",
        "bool": "bool",
        "decimal": cudf.Decimal64Dtype(1),
        "struct": cudf.StructDtype({"a": "int64"}),
        "list": cudf.ListDtype("float64"),
    }
    expected = cudf.DataFrame._from_data(
        {
            label: cudf.core.column.column_empty(0, dtype)
            for label, dtype in dtype_by_label.items()
        }
    )
    with BytesIO() as bio:
        expected.to_parquet(bio)
        result = cudf.read_parquet(bio)
    assert_eq(result, expected)


def test_parquet_reader_mismatched_nullability():
# Ensure that we can faithfully read the tables with mismatched nullabilities
df1 = cudf.DataFrame(
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/tests/test_string_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ def run_udf_test(data, func, dtype):
)
else:
dtype = np.dtype(dtype)
output = cudf.core.column.column_empty(len(data), dtype=dtype)
output = cudf.core.column.column_empty(
len(data), dtype=dtype, for_numba=True
)

cudf_column = cudf.core.column.as_column(data)
str_views = column_to_string_view_array(cudf_column)
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/utils/queryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ def query_execute(df, expr, callenv):
Contains keys 'local_dict', 'locals' and 'globals' which are all dict.
They represent the arg, local and global dictionaries of the caller.
"""

# compile
compiled = query_compile(expr)
columns = compiled["colnames"]
Expand Down Expand Up @@ -247,7 +246,7 @@ def query_execute(df, expr, callenv):

# allocate output buffer
nrows = len(df)
out = column_empty(nrows, dtype=np.bool_)
out = column_empty(nrows, dtype=np.bool_, for_numba=True)
# run kernel
args = [out, *colarrays, *envargs]
with _CUDFNumbaConfig():
Expand Down

0 comments on commit 541e7e8

Please sign in to comment.