Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf-polars string slicing #16082

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
12a6b2c
building
brandon-b-miller Jun 11, 2024
2c33cf7
plumbing to cuDF cython
brandon-b-miller Jun 11, 2024
65e0011
add tests
brandon-b-miller Jun 11, 2024
a6e9ed2
excize improper pyarrow function
brandon-b-miller Jun 12, 2024
e52079c
column tests
brandon-b-miller Jun 12, 2024
d455908
add tests
brandon-b-miller Jun 12, 2024
8e08880
docs
brandon-b-miller Jun 12, 2024
5d223b8
cleanup
brandon-b-miller Jun 12, 2024
c53d07a
Merge branch 'branch-24.08' into pylibcudf-strings-slice
galipremsagar Jun 12, 2024
8605bd2
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 14, 2024
866fefe
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 17, 2024
fb1a0a1
address some reviews
brandon-b-miller Jun 17, 2024
238a583
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 18, 2024
9d70bb2
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 24, 2024
78fe267
slice edge case
brandon-b-miller Jun 25, 2024
4e07bf5
plumb polars string slice
brandon-b-miller Jun 25, 2024
4ada5c9
merge latest and resolve conflicts
brandon-b-miller Jun 26, 2024
b690193
add column logic
brandon-b-miller Jun 28, 2024
02072b0
funnel scalar cases
brandon-b-miller Jun 28, 2024
45e2fc4
updates
brandon-b-miller Jul 1, 2024
326c321
add back future column tests
brandon-b-miller Jul 1, 2024
ed9b119
cleanup
brandon-b-miller Jul 1, 2024
920a2e1
s :)
brandon-b-miller Jul 1, 2024
dcfa6c6
update comments
brandon-b-miller Jul 1, 2024
adcf5f8
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 1, 2024
a7c17cf
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 2, 2024
fdaa556
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 2, 2024
2628dde
address reviews
brandon-b-miller Jul 2, 2024
cc2cc7c
update logic
brandon-b-miller Jul 3, 2024
27dfef8
Apply suggestions from code review
brandon-b-miller Jul 3, 2024
9637636
style
brandon-b-miller Jul 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ strings

contains
replace
slice
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=====
slice
=====

.. automodule:: cudf._lib.pylibcudf.strings.slice
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar

cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx replace.pyx
regex_program.pyx replace.pyx slice.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ from . cimport (
regex_flags,
regex_program,
replace,
slice,
)
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
regex_flags,
regex_program,
replace,
slice,
)
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.scalar cimport Scalar

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column slice_strings(
Column input,
ColumnOrScalar start=*,
ColumnOrScalar stop=*,
Scalar step=*
)
94 changes: 94 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_fixed_width_scalar as cpp_make_fixed_width_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column slice_strings(
Column input,
ColumnOrScalar start=None,
ColumnOrScalar stop=None,
Scalar step=None
):
"""Perform a slice operation on a strings column.

``start`` and ``stop`` may be a
:py:class:`~cudf._lib.pylibcudf.column.Column` or a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
:py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.

For details, see :cpp:func:`cudf::strings::slice_strings`.

Parameters
----------
input : Column
Strings column for this operation
start : Union[Column, Scalar]
The start character position or positions.
stop : Union[Column, Scalar]
The end character position or positions
step : Scalar
Distance between input characters retrieved

Returns
-------
pylibcudf.Column
The result of the slice operation
"""
cdef unique_ptr[column] c_result
cdef numeric_scalar[size_type]* cpp_start
cdef numeric_scalar[size_type]* cpp_stop
cdef numeric_scalar[size_type]* cpp_step

if ColumnOrScalar is Column:
if step is not None:
raise ValueError("Column-wise slice does not support step")

with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
start.view(),
stop.view()
)

elif ColumnOrScalar is Scalar:
if start is None:
start = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if stop is None:
stop = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(0)
)
if step is None:
step = Scalar.from_libcudf(
cpp_make_fixed_width_scalar(1)
)

cpp_start = <numeric_scalar[size_type]*>start.c_obj.get()
cpp_stop = <numeric_scalar[size_type]*>stop.c_obj.get()
cpp_step = <numeric_scalar[size_type]*>step.c_obj.get()

with nogil:
c_result = cpp_slice.slice_strings(
input.view(),
dereference(cpp_start),
dereference(cpp_stop),
dereference(cpp_step)
)
else:
raise ValueError("start, stop, and step must be either Column or Scalar")

return Column.from_libcudf(move(c_result))
88 changes: 26 additions & 62 deletions python/cudf/cudf/_lib/strings/substring.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,16 @@

import numpy as np

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
slice_strings as cpp_slice_strings,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type

from cudf._lib.scalar import as_device_scalar

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from cudf._lib.scalar cimport DeviceScalar

import cudf._lib.pylibcudf as plc


@acquire_spill_lock()
def slice_strings(Column source_strings,
Expand All @@ -32,30 +24,18 @@ def slice_strings(Column source_strings,
performed in steps by skipping `step` number of
characters in a string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32)
cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)


@acquire_spill_lock()
Expand All @@ -67,19 +47,13 @@ def slice_from(Column source_strings,
at given starts and stops positions. `starts` and `stops`
here are positions per element in the string-column.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view starts_view = starts.view()
cdef column_view stops_view = stops.view()

with nogil:
c_result = move(cpp_slice_strings(
source_view,
starts_view,
stops_view
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
starts.to_pylibcudf(mode="read"),
stops.to_pylibcudf(mode="read")
)
)


@acquire_spill_lock()
Expand All @@ -90,8 +64,7 @@ def get(Column source_strings,
character from each input string. The index of
characters required can be controlled by passing `index`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

if index < 0:
next_index = index - 1
step = -1
Expand All @@ -102,20 +75,11 @@ def get(Column source_strings,
cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32)
cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)

cdef numeric_scalar[size_type]* start_numeric_scalar = \
<numeric_scalar[size_type]*>(
start_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* end_numeric_scalar = \
<numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
cdef numeric_scalar[size_type]* step_numeric_scalar = \
<numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())

with nogil:
c_result = move(cpp_slice_strings(
source_view,
start_numeric_scalar[0],
end_numeric_scalar[0],
step_numeric_scalar[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.slice.slice_strings(
source_strings.to_pylibcudf(mode="read"),
start_scalar.c_value,
end_scalar.c_value,
step_scalar.c_value
)
)
99 changes: 99 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_slice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_col():
return pa.array(["AbC", "123abc", "", " ", None])


@pytest.fixture(scope="module")
def plc_col(pa_col):
return plc.interop.from_arrow(pa_col)


@pytest.fixture(
scope="module",
params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)],
)
def pa_start_stop_step(request):
return tuple(pa.scalar(x, type=pa.int32()) for x in request.param)


@pytest.fixture(scope="module")
def plc_start_stop_step(pa_start_stop_step):
return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step)


@pytest.fixture(scope="module")
def pa_starts_col():
return pa.array([0, 1, 3, -1, 100])


@pytest.fixture(scope="module")
def plc_starts_col(pa_starts_col):
return plc.interop.from_arrow(pa_starts_col)


@pytest.fixture(scope="module")
def pa_stops_col():
return pa.array([1, 3, 4, -1, 100])


@pytest.fixture(scope="module")
def plc_stops_col(pa_stops_col):
return plc.interop.from_arrow(pa_stops_col)


def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step):
pa_start, pa_stop, pa_step = pa_start_stop_step
plc_start, plc_stop, plc_step = plc_start_stop_step

def slice_string(st, start, stop, step):
return st[start:stop:step] if st is not None else None

expected = pa.array(
[
slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py())
for x in pa_col.to_pylist()
],
type=pa.string(),
)

got = plc.strings.slice.slice_strings(
plc_col, start=plc_start, stop=plc_stop, step=plc_step
)

assert_column_eq(expected, got)


def test_slice_column(
pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col
):
def slice_string(st, start, stop):
if stop < 0:
stop = len(st)
return st[start:stop] if st is not None else None

expected = pa.array(
[
slice_string(x, start, stop)
for x, start, stop in zip(
pa_col.to_pylist(),
pa_starts_col.to_pylist(),
pa_stops_col.to_pylist(),
)
],
type=pa.string(),
)

got = plc.strings.slice.slice_strings(
plc_col, plc_starts_col, plc_stops_col
)

assert_column_eq(expected, got)
Loading
Loading