Skip to content

Commit c650bf7

Browse files
authored
Move cudf._lib.stream_compaction to cudf.core._internals (#17456)
Contributes to #17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #17456
1 parent 5802d34 commit c650bf7

File tree

10 files changed

+191
-243
lines changed

10 files changed

+191
-243
lines changed

python/cudf/cudf/_lib/CMakeLists.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# the License.
1313
# =============================================================================
1414

15-
set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx
16-
string_casting.pyx strings_udf.pyx types.pyx utils.pyx
15+
set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx string_casting.pyx strings_udf.pyx
16+
types.pyx utils.pyx
1717
)
1818
set(linked_libraries cudf::cudf)
1919

python/cudf/cudf/_lib/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from . import (
55
groupby,
66
interop,
7-
stream_compaction,
87
string_casting,
98
strings_udf,
109
)

python/cudf/cudf/_lib/stream_compaction.pyx

-181
This file was deleted.

python/cudf/cudf/core/_base_index.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010
from typing_extensions import Self
1111

1212
import cudf
13-
from cudf._lib.stream_compaction import (
14-
apply_boolean_mask,
15-
drop_duplicates,
16-
drop_nulls,
17-
)
1813
from cudf._lib.types import size_type_dtype
1914
from cudf.api.extensions import no_default
2015
from cudf.api.types import is_integer, is_list_like, is_scalar
2116
from cudf.core._internals import copying
17+
from cudf.core._internals.stream_compaction import (
18+
apply_boolean_mask,
19+
drop_duplicates,
20+
drop_nulls,
21+
)
2222
from cudf.core.abc import Serializable
2323
from cudf.core.column import ColumnBase, column
2424
from cudf.core.copy_types import GatherMap
@@ -414,7 +414,7 @@ def hasnans(self):
414414
raise NotImplementedError
415415

416416
@property
417-
def nlevels(self):
417+
def nlevels(self) -> int:
418418
"""
419419
Number of levels.
420420
"""
@@ -1944,7 +1944,6 @@ def drop_duplicates(
19441944
return self._from_columns_like_self(
19451945
drop_duplicates(
19461946
list(self._columns),
1947-
keys=range(len(self._columns)),
19481947
keep=keep,
19491948
nulls_are_equal=nulls_are_equal,
19501949
),
@@ -2033,7 +2032,6 @@ def dropna(self, how="any"):
20332032
drop_nulls(
20342033
data_columns,
20352034
how=how,
2036-
keys=range(len(data_columns)),
20372035
),
20382036
self._column_names,
20392037
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
from __future__ import annotations
3+
4+
from typing import TYPE_CHECKING, Literal
5+
6+
import pylibcudf as plc
7+
8+
from cudf._lib.column import Column
9+
from cudf.core.buffer import acquire_spill_lock
10+
11+
if TYPE_CHECKING:
12+
from cudf.core.column import ColumnBase
13+
14+
15+
@acquire_spill_lock()
def drop_nulls(
    columns: list[ColumnBase],
    how: Literal["any", "all"] = "any",
    keys: list[int] | None = None,
    thresh: int | None = None,
) -> list[ColumnBase]:
    """
    Remove rows from ``columns`` based on nulls found in the key columns.

    Parameters
    ----------
    columns : list of columns
        The columns to filter.
    how : {"any", "all"}, default "any"
        Only consulted when ``thresh`` is None. "any" sets the keep
        threshold to the number of key columns; "all" sets it to 1.
    keys : list of int, optional
        Positions (into ``columns``) of the columns inspected for nulls.
        When None, every column is inspected.
    thresh : int, optional
        Minimum number of non-nulls required to keep a row. Takes
        precedence over ``how`` when given.

    Returns
    -------
    list of columns
        The input columns with the null rows dropped.

    Raises
    ------
    ValueError
        If ``how`` is neither "any" nor "all".
    """
    if how not in {"any", "all"}:
        raise ValueError("how must be 'any' or 'all'")

    # Default to checking every column when no explicit keys are given.
    key_indices = list(range(len(columns))) if keys is None else keys

    # An explicit thresh always wins, even when how == "all".
    if thresh is None:
        min_non_null = 1 if how == "all" else len(key_indices)
    else:
        min_non_null = thresh

    source_table = plc.Table(
        [column.to_pylibcudf(mode="read") for column in columns]
    )
    filtered = plc.stream_compaction.drop_nulls(
        source_table, key_indices, min_non_null
    )
    return [
        Column.from_pylibcudf(plc_column) for plc_column in filtered.columns()
    ]
57+
58+
59+
@acquire_spill_lock()
def apply_boolean_mask(
    columns: list[ColumnBase], boolean_mask: ColumnBase
) -> list[ColumnBase]:
    """
    Filter ``columns`` with a boolean mask, dropping rows where it is False.

    Parameters
    ----------
    columns : list of columns
        The columns whose rows are filtered by ``boolean_mask``.
    boolean_mask : column
        A boolean column of the same size as the columns being filtered.

    Returns
    -------
    list of columns
        The columns obtained from applying the mask.
    """
    source_table = plc.Table(
        [column.to_pylibcudf(mode="read") for column in columns]
    )
    plc_mask = boolean_mask.to_pylibcudf(mode="read")
    filtered = plc.stream_compaction.apply_boolean_mask(source_table, plc_mask)
    return [
        Column.from_pylibcudf(plc_column) for plc_column in filtered.columns()
    ]
80+
81+
82+
@acquire_spill_lock()
def drop_duplicates(
    columns: list[ColumnBase],
    keys: list[int] | None = None,
    keep: Literal["first", "last", False] = "first",
    nulls_are_equal: bool = True,
) -> list[ColumnBase]:
    """
    Remove rows of ``columns`` that are duplicated in the key columns.

    Parameters
    ----------
    columns : list of columns
        The columns to deduplicate.
    keys : list of int, optional
        Positions (into ``columns``) of the columns compared for
        duplicates. When None, every column is compared.
    keep : {"first", "last", False}, default "first"
        Keep the first or the last of each group of duplicate rows, or
        none of them when False.
    nulls_are_equal : bool, default True
        If True, nulls are treated as equal; otherwise they are not.

    Returns
    -------
    list of columns
        The input columns with duplicate rows dropped.

    Raises
    ------
    ValueError
        If ``keep`` is not "first", "last" or False.
    """
    keep_enum = plc.stream_compaction.DuplicateKeepOption
    option_by_keep = {
        "first": keep_enum.KEEP_FIRST,
        "last": keep_enum.KEEP_LAST,
        False: keep_enum.KEEP_NONE,
    }
    keep_option = option_by_keep.get(keep)
    if keep_option is None:
        raise ValueError('keep must be either "first", "last" or False')

    source_table = plc.Table(
        [column.to_pylibcudf(mode="read") for column in columns]
    )

    # Default to comparing every column when no explicit keys are given.
    key_indices = list(range(len(columns))) if keys is None else keys

    if nulls_are_equal:
        null_equality = plc.types.NullEquality.EQUAL
    else:
        null_equality = plc.types.NullEquality.UNEQUAL

    # NaN handling is not configurable here: ALL_EQUAL is always passed.
    deduplicated = plc.stream_compaction.stable_distinct(
        source_table,
        key_indices,
        keep_option,
        null_equality,
        plc.types.NanEquality.ALL_EQUAL,
    )
    return [
        Column.from_pylibcudf(plc_column)
        for plc_column in deduplicated.columns()
    ]

0 commit comments

Comments
 (0)