Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
cbab72f
ENH: Implement split_part bindings, API, and tests for pylibcudf and …
Umang-projects Jan 18, 2026
0a824ad
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 18, 2026
42f7110
Merge branch 'main' into feature/add-split-part
Umang-projects Jan 19, 2026
a46bf48
Update python/pylibcudf/pylibcudf/tests/test_split_part.py
Umang-projects Jan 19, 2026
6ad0bae
Update python/cudf/cudf/tests/test_split_part.py
Umang-projects Jan 19, 2026
b218452
Update python/pylibcudf/pylibcudf/strings/split/split.pyx
Umang-projects Jan 19, 2026
bb44d15
FIX: Add missing split_part declaration in pylibcudf pxd and fixing s…
Umang-projects Jan 19, 2026
ab92619
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 19, 2026
46f1be9
Merge branch 'main' into feature/add-split-part
Umang-projects Jan 20, 2026
e9f95dc
STYLE: Fixed String.py
Umang-projects Jan 20, 2026
b9f20e0
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 20, 2026
c58353b
Merge branch 'main' into feature/add-split-part
Umang-projects Jan 20, 2026
f7e7e8e
Update python/cudf/cudf/core/column/string.py
Umang-projects Jan 20, 2026
705ea28
FIX: Address review comments (add type stubs, fix regression, cleanup…
Umang-projects Jan 20, 2026
82c69a2
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 20, 2026
17bd7b7
Update python/cudf/cudf/core/accessors/string.py
Umang-projects Jan 20, 2026
0ee4f94
FIX: files
Umang-projects Jan 21, 2026
d259e8a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 21, 2026
2716fd7
Update python/cudf/cudf/core/accessors/string.py
Umang-projects Jan 21, 2026
c03bf2c
Update python/pylibcudf/pylibcudf/strings/split/split.pyx
Umang-projects Jan 21, 2026
01a7852
FIX: Address review comments (Move tests, fix imports, add whitespace…
Umang-projects Jan 21, 2026
694a165
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jan 21, 2026
f3cb63f
Update python/pylibcudf/pylibcudf/strings/split/split.pyx
Umang-projects Jan 21, 2026
cc49475
STYLE: Update copyright years to 2026
Umang-projects Jan 21, 2026
b8cf502
Merge branch 'main' into feature/add-split-part
Umang-projects Jan 21, 2026
363917a
Update python/pylibcudf/pylibcudf/strings/split/split.pyx
Umang-projects Jan 21, 2026
7a75d80
Update python/pylibcudf/pylibcudf/strings/split/split.pyx
Umang-projects Jan 21, 2026
e5328e4
FIX: Convert delimiter to Scalar
Umang-projects Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions python/cudf/cudf/core/accessors/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2785,6 +2785,40 @@ def rsplit(

return self._return_or_inplace(result_table, expand=expand)

def split_part(
    self, delimiter: str | None = None, index: int = 0
) -> Series | Index:
    """
    Split each string on a delimiter and return the token at ``index``.

    Parameters
    ----------
    delimiter : str, default None
        The string to split on. If not specified (or empty), split on
        whitespace.
    index : int, default 0
        Zero-based position of the token to retrieve.

    Returns
    -------
    Series or Index
        The selected token for each row. Rows that have no token at
        ``index`` are null.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(["a_b_c", "d_e", "f"])
    >>> s.str.split_part(delimiter="_", index=1)
    0       b
    1       e
    2    None
    dtype: object
    """
    # Local import keeps this method self-contained regardless of which
    # module-level aliases are available.
    import pylibcudf as plc

    # An empty delimiter is how a whitespace split is requested downstream.
    if delimiter is None:
        delimiter = ""
    # The column-level ``split_part`` is typed to take a ``plc.Scalar``
    # delimiter, so the Python string must be converted before the call.
    return self._return_or_inplace(
        self._column.split_part(plc.Scalar.from_py(delimiter), index)
    )

def partition(self, sep: str = " ", expand: bool = True) -> Series | Index:
"""
Split the string at the first occurrence of sep.
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,22 @@ def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]:
def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]:
return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit)

def split_part(self, delimiter: plc.Scalar, index: int) -> Self:
"""
Return, for each string, the token at ``index`` after splitting.

Parameters
----------
delimiter : plc.Scalar
    Delimiter scalar forwarded unchanged to
    ``plc.strings.split.split.split_part``.
index : int
    Position of the token to select, forwarded unchanged.

Returns
-------
Self
    A column of the same class as ``self``, built from the pylibcudf
    result and carrying ``self.dtype`` as its type metadata.
"""
# Read-only, internal-scope access to this column's data while the
# pylibcudf call runs.
with self.access(mode="read", scope="internal"):
plc_column = plc.strings.split.split.split_part(
self.plc_column,
delimiter,
index,
)
# Wrap the pylibcudf result in this column class and reapply the original
# dtype. NOTE(review): ``cast`` is presumably ``typing.cast`` (a no-op at
# runtime) - confirm against the file's imports.
return cast(
Self,
(
type(self)
.from_pylibcudf(plc_column)
._with_type_metadata(self.dtype)
),
)

def _partition(
self,
delimiter: plc.Scalar,
Expand Down
30 changes: 29 additions & 1 deletion python/cudf/cudf/tests/series/accessors/test_str.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

import json
Expand Down Expand Up @@ -1732,6 +1732,34 @@ def test_string_rsplit_re(n, expand):
assert_eq(expect, got)


@pytest.mark.parametrize(
    ("data", "delimiter", "index", "expected"),
    [
        (["a_b_c", "d_e", "f"], "_", 1, ["b", "e", None]),
        (["a_b_c", "d_e", "f"], "_", 0, ["a", "d", "f"]),
    ],
)
def test_split_part(data, delimiter, index, expected):
    """``str.split_part`` picks the token at ``index`` for an explicit
    delimiter; rows without that token come back as null."""
    actual = cudf.Series(data).str.split_part(
        delimiter=delimiter, index=index
    )
    assert_eq(actual, cudf.Series(expected))


# Both spellings of "split on whitespace" are covered: the documented default
# (``None``) and the explicit empty string it is translated to.
@pytest.mark.parametrize("delimiter", [None, ""])
@pytest.mark.parametrize(
    "data, index, expected",
    [
        (["a b c", "d e", "f\tg", " h "], 0, ["a", "d", "f", "h"]),
        (["a b c", "d e", "f\tg", " h "], 1, ["b", "e", "g", None]),
    ],
)
def test_split_part_whitespace(data, delimiter, index, expected):
    """``str.split_part`` splits on whitespace (spaces and tabs, ignoring
    leading/trailing whitespace) when no delimiter is given."""
    s = cudf.Series(data)
    got = s.str.split_part(delimiter=delimiter, index=index)
    expect = cudf.Series(expected)
    assert_eq(got, expect)


@pytest.mark.parametrize(
"data",
[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
Expand Down Expand Up @@ -44,6 +44,13 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \
cuda_stream_view stream,
device_memory_resource* mr) except +libcudf_exception_handler

# libcudf ``split_part`` as declared in "cudf/strings/split/split.hpp".
# Takes a view of the strings column, the delimiter as a ``string_scalar``,
# the token ``index``, and the CUDA stream / device memory resource to use;
# returns an owning pointer to the resulting column. C++ exceptions are
# translated through ``libcudf_exception_handler``.
cdef unique_ptr[column] split_part(
column_view strings,
string_scalar delimiter,
size_type index,
cuda_stream_view stream,
device_memory_resource* mr) except +libcudf_exception_handler


cdef extern from "cudf/strings/split/split_re.hpp" namespace \
"cudf::strings" nogil:
Expand Down
7 changes: 6 additions & 1 deletion python/pylibcudf/pylibcudf/strings/split/split.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

from pylibcudf.column cimport Column
Expand Down Expand Up @@ -49,3 +49,8 @@ cpdef Column rsplit_record_re(
Column input, RegexProgram prog, size_type maxsplit, Stream stream=*,
DeviceMemoryResource mr=*,
)

# Declaration of the ``split_part`` wrapper so it can be cimported by other
# Cython modules. ``stream`` and ``mr`` are optional (``=*``); their default
# values are supplied by the implementation in the matching ``.pyx``.
cpdef Column split_part(
Column input, Scalar delimiter, size_type index, Stream stream=*,
DeviceMemoryResource mr=*,
)
9 changes: 8 additions & 1 deletion python/pylibcudf/pylibcudf/strings/split/split.pyi
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

from rmm.pylibrmm.memory_resource import DeviceMemoryResource
Expand Down Expand Up @@ -65,3 +65,10 @@ def rsplit_record_re(
stream: Stream | None = None,
mr: DeviceMemoryResource | None = None,
) -> Column: ...
# Type stub for ``split_part``: takes the input column, the delimiter as a
# ``Scalar`` and the integer token index, plus an optional stream and memory
# resource, and returns a ``Column``.
def split_part(
input: Column,
delimiter: Scalar,
index: int,
stream: Stream | None = None,
mr: DeviceMemoryResource | None = None,
) -> Column: ...
25 changes: 24 additions & 1 deletion python/pylibcudf/pylibcudf/strings/split/split.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
Expand Down Expand Up @@ -405,3 +405,26 @@ cpdef Column rsplit_record_re(
)

return Column.from_libcudf(move(c_result), stream, mr)


cpdef Column split_part(
    Column input, Scalar delimiter, size_type index, Stream stream=None,
    DeviceMemoryResource mr=None,
):
    """Split each string of ``input`` on ``delimiter`` and return the token
    at position ``index``.

    Thin wrapper over the libcudf ``split_part`` binding
    (``cpp_split.split_part``).

    Parameters
    ----------
    input : Column
        Strings column to split.
    delimiter : Scalar
        Delimiter; its underlying scalar is reinterpreted as a
        ``string_scalar`` before being handed to libcudf.
    index : size_type
        Position of the token to select.
    stream : Stream, optional
        Stream to run on; resolved through ``_get_stream``.
    mr : DeviceMemoryResource, optional
        Memory resource for the result; resolved through
        ``_get_memory_resource``.

    Returns
    -------
    Column
        New column built from the libcudf result.
    """
    cdef const string_scalar* c_delim
    cdef unique_ptr[column] c_out

    stream = _get_stream(stream)
    mr = _get_memory_resource(mr)
    # View the generic scalar as the string scalar libcudf expects.
    c_delim = <const string_scalar*>(delimiter.c_obj.get())

    # The libcudf call needs no Python objects, so release the GIL around it.
    with nogil:
        c_out = cpp_split.split_part(
            input.view(),
            dereference(c_delim),
            index,
            stream.view(),
            mr.get_mr()
        )

    return Column.from_libcudf(move(c_out), stream, mr)
33 changes: 32 additions & 1 deletion python/pylibcudf/tests/test_string_split_split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

import pyarrow as pa
Expand Down Expand Up @@ -130,3 +130,34 @@ def test_rsplit_record_re(data_col, re_delimiter):
)
expect = pc.split_pattern_regex(pa_array, re_delimiter)
assert_column_eq(expect, got)


def test_split_part(data_col, delimiter):
    """Check ``split_part`` at indices 0 and 1 using the shared fixtures.

    Per the fixtures' intent, ``data_col`` holds ["a_b_c", "d-e-f", None] and
    ``delimiter`` is "_".
    """
    _, plc_strings = data_col
    _, plc_sep = delimiter

    cases = (
        # Index 0: the first token; "d-e-f" has no "_" so it is returned whole.
        (0, ["a", "d-e-f", None]),
        # Index 1: "d-e-f" has no second token, so that row is null.
        (1, ["b", None, None]),
    )
    for token_index, tokens in cases:
        result = plc.strings.split.split.split_part(
            plc_strings, plc_sep, token_index
        )
        assert_column_eq(pa.array(tokens), result)


def test_split_part_whitespace():
    """An empty delimiter makes ``split_part`` split on whitespace."""
    # Standalone data because the shared fixtures use "_" as the delimiter.
    # "e\tf" must contain a real tab character: an escaped backslash would
    # leave that row with a single token and index 1 would be null, not "f".
    data = pa.array(["a b", "c d", "e\tf", None])
    plc_column = plc.Column.from_arrow(data)

    # Empty delimiter for whitespace split
    plc_delimiter = plc.Scalar.from_arrow(pa.scalar(""))

    # Index 1: the second whitespace-separated token; the null row stays null.
    got = plc.strings.split.split.split_part(plc_column, plc_delimiter, 1)
    expect = pa.array(["b", "d", "f", None])
    assert_column_eq(expect, got)
Loading