From 12a6b2c1d01c227df21c507352c5f0d0830f04f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 09:14:22 -0700 Subject: [PATCH 01/22] building --- .../libcudf/scalar/scalar_factories.pxd | 1 + .../_lib/pylibcudf/strings/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 1 + .../cudf/_lib/pylibcudf/strings/__init__.py | 1 + .../cudf/_lib/pylibcudf/strings/slice.pxd | 15 +++++ .../cudf/_lib/pylibcudf/strings/slice.pyx | 60 +++++++++++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd index 5c4e5bf346f..c8220df8938 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + + cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index cb7f71b1912..b499a127541 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx + regex_program.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 959aa94737d..d1f632d6d8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -9,4 +9,5 @@ from . cimport ( regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index b7384913286..ef102aff2af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -9,4 +9,5 @@ regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd new file mode 100644 index 00000000000..ef93e97435a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=*, + ColumnOrScalar stop=*, + ColumnOrScalar step=* +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx new file mode 100644 index 00000000000..cb047cb494f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_fixed_width_scalar as cpp_make_fixed_width_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=None, + ColumnOrScalar stop=None, + ColumnOrScalar step=None +): + cdef unique_ptr[column] c_result + cdef numeric_scalar[size_type]* cpp_start + cdef numeric_scalar[size_type]* cpp_stop + cdef numeric_scalar[size_type]* cpp_step + + if ColumnOrScalar is Column: + pass + elif ColumnOrScalar is Scalar: + if start is None: + delimiters = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if stop is None: + delimiters = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if step is None: + delimiters = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(1) + ) + + cpp_start = start.c_obj.get() + cpp_stop = stop.c_obj.get() + cpp_step = step.c_obj.get() + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + dereference(cpp_start), + dereference(cpp_stop), + dereference(cpp_step) + ) + else: + raise ValueError("start, stop, and step must be either Column or Scalar") + + return Column.from_libcudf(move(c_result)) From 2c33cf7fb57fe9350a308f58978be97a608c9ecd Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 11:09:22 -0700 Subject: [PATCH 02/22] plumbing to cuDF cython --- .../cudf/_lib/pylibcudf/strings/slice.pyx | 11 +++- python/cudf/cudf/_lib/strings/substring.pyx | 50 +++++++------------ 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx index cb047cb494f..bc02100bd07 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -28,7 +28,16 @@ cpdef Column slice_strings( cdef numeric_scalar[size_type]* cpp_step if ColumnOrScalar is Column: - pass + if step is not None: + raise ValueError("Column-wise slice does not support step") + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + start.view(), + stop.view() + ) + elif ColumnOrScalar is Scalar: if start is None: delimiters = Scalar.from_libcudf( diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx index 170c1016b89..ccdd4d8623d 100644 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ b/python/cudf/cudf/_lib/strings/substring.pyx @@ -20,6 +20,8 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_strings(Column source_strings, @@ -32,30 +34,18 @@ def slice_strings(Column source_strings, performed in steps by skipping `step` number of characters in a string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) @acquire_spill_lock() @@ -67,19 +57,13 @@ def slice_from(Column source_strings, at given starts and stops positions. `starts` and `stops` here are positions per element in the string-column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view starts_view = starts.view() - cdef column_view stops_view = stops.view() - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - starts_view, - stops_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + starts.to_pylibcudf(mode="read"), + stops.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() From 65e0011ee10a21b11907a36734926d53afc17231 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 15:07:47 -0700 Subject: [PATCH 03/22] add tests --- .../cudf/pylibcudf_tests/test_string_slice.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_slice.py diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py new file mode 100644 index 00000000000..78c7f3a6b63 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def string_col(): + return pa.array(["AbC"]) + + +@pytest.fixture( + scope="module", + params=[ + (1, 3, 1), + ], +) +def pa_start_stop_step(request): + return ( + pa.scalar(request.param[0], type=pa.int32()), + pa.scalar(request.param[1], type=pa.int32()), + pa.scalar(request.param[2], type=pa.int32()), + ) + + +@pytest.fixture(scope="module") +def plc_start_stop_step(pa_start_stop_step): + return ( + plc.interop.from_arrow(pa_start_stop_step[0]), + plc.interop.from_arrow(pa_start_stop_step[1]), + plc.interop.from_arrow(pa_start_stop_step[2]), + ) + + +def test_slice(string_col, pa_start_stop_step, plc_start_stop_step): + plc_col = plc.interop.from_arrow(string_col) + + pa_start, pa_stop, pa_step = pa_start_stop_step + plc_start, plc_stop, plc_step = plc_start_stop_step + + expected = pa.compute.utf8_slice_codeunits( + string_col, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py() + ) + got = plc.strings.slice.slice_strings( + plc_col, start=plc_start, stop=plc_stop, step=plc_step + ) + + assert_column_eq(expected, got) From a6e9ed2d5401ed0db714ea9e243ddaabfd0feb70 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 17:49:32 -0700 Subject: [PATCH 04/22] excize improper pyarrow function --- .../cudf/pylibcudf_tests/test_string_slice.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py index 78c7f3a6b63..8d72700706f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -16,23 +16,16 @@ def string_col(): scope="module", params=[ (1, 3, 1), + (0, 3, -1), ], ) def pa_start_stop_step(request): - return ( - pa.scalar(request.param[0], type=pa.int32()), - pa.scalar(request.param[1], type=pa.int32()), - pa.scalar(request.param[2], type=pa.int32()), - ) + return tuple(pa.scalar(x, type=pa.int32()) for x in request.param) @pytest.fixture(scope="module") def plc_start_stop_step(pa_start_stop_step): - return ( - plc.interop.from_arrow(pa_start_stop_step[0]), - plc.interop.from_arrow(pa_start_stop_step[1]), - plc.interop.from_arrow(pa_start_stop_step[2]), - ) + return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step) def test_slice(string_col, pa_start_stop_step, plc_start_stop_step): @@ -41,9 +34,17 @@ def test_slice(string_col, pa_start_stop_step, plc_start_stop_step): pa_start, pa_stop, pa_step = pa_start_stop_step plc_start, plc_stop, plc_step = plc_start_stop_step - expected = pa.compute.utf8_slice_codeunits( - string_col, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py() + def slice_string(st, start, stop, step): + return st[start:stop:step] if st is not None else None + + expected = pa.array( + [ + slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py()) + for x in string_col.to_pylist() + ], + type=pa.string(), ) + got = plc.strings.slice.slice_strings( plc_col, start=plc_start, stop=plc_stop, step=plc_step ) From e52079c8d397c42d738edabac270911ff203919b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 18:17:41 -0700 Subject: [PATCH 05/22] column tests --- .../cudf/pylibcudf_tests/test_string_slice.py | 58 +++++++++++++++++-- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py index 8d72700706f..d94ab8a7f99 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -8,8 +8,13 @@ @pytest.fixture(scope="module") -def string_col(): - return pa.array(["AbC"]) +def pa_col(): + return pa.array(["AbC", "123abc", "", " ", None]) + + +@pytest.fixture(scope="module") +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) @pytest.fixture( @@ -28,9 +33,27 @@ def plc_start_stop_step(pa_start_stop_step): return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step) -def test_slice(string_col, pa_start_stop_step, plc_start_stop_step): - plc_col = plc.interop.from_arrow(string_col) +@pytest.fixture(scope="module") +def pa_starts_col(): + return pa.array([0, 1, 3, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_starts_col(pa_starts_col): + return plc.interop.from_arrow(pa_starts_col) + + +@pytest.fixture(scope="module") +def pa_stops_col(): + return pa.array([1, 3, 4, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_stops_col(pa_stops_col): + return plc.interop.from_arrow(pa_stops_col) + +def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step): pa_start, pa_stop, pa_step = pa_start_stop_step plc_start, plc_stop, plc_step = plc_start_stop_step @@ -40,7 +63,7 @@ def slice_string(st, start, stop, step): expected = pa.array( [ slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py()) - for x in string_col.to_pylist() + for x in pa_col.to_pylist() ], type=pa.string(), ) @@ -50,3 +73,28 @@ def slice_string(st, start, stop, step): ) assert_column_eq(expected, got) + + +def test_slice_column( + pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col +): + def slice_string(st, start, stop): + return st[start:stop] if st is not None else None + + expected = pa.array( + [ + slice_string(x, start, stop) + for x, start, stop in zip( + pa_col.to_pylist(), + pa_starts_col.to_pylist(), + pa_stops_col.to_pylist(), + ) + ], + type=pa.string(), + ) + + got = plc.strings.slice.slice_strings( + plc_col, plc_starts_col, plc_stops_col + ) + + assert_column_eq(expected, got) From d4559085f6c0bbd1ae7ce006746565eaad0274f1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 18:33:30 -0700 Subject: [PATCH 06/22] add tests --- python/cudf/cudf/pylibcudf_tests/test_string_slice.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py index d94ab8a7f99..cddebe36a1b 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -19,10 +19,7 @@ def plc_col(pa_col): @pytest.fixture( scope="module", - params=[ - (1, 3, 1), - (0, 3, -1), - ], + params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)], ) def pa_start_stop_step(request): return tuple(pa.scalar(x, type=pa.int32()) for x in request.param) From 8e08880bb81216d86c3366db30929dca884ad3b3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 19:29:05 -0700 Subject: [PATCH 07/22] docs --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/slice.rst | 6 +++++ .../cudf/_lib/pylibcudf/strings/slice.pyx | 27 ++++++++++++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index bfaef732555..cecf1ccc9bb 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,3 +6,4 @@ strings contains replace + slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst new file mode 100644 index 00000000000..0ee5af71c03 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst @@ -0,0 +1,6 @@ +===== +slice +===== + +.. automodule:: cudf._lib.pylibcudf.strings.slice + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx index bc02100bd07..46ed1e29c0a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -20,8 +20,33 @@ cpdef Column slice_strings( Column input, ColumnOrScalar start=None, ColumnOrScalar stop=None, - ColumnOrScalar step=None + Scalar step=None ): + """Perform a slice operation on a strings column. + + ``start`` and ``stop`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a + :py:class:`~cudf._lib.pylibcudf.column.Scalar`. + + For details, see :cpp:func:`cudf::strings::slice_strings`. + + Parameters + ---------- + input : Column + Strings column for this operation + start : Union[Column, Scalar] + The start character position or positions. + stop : Union[Column, Scalar] + The end character position or positions + step : Scalar + Distance between input characters retrieved + + Returns + ------- + pylibcudf.Column + The result of the slice operation + """ cdef unique_ptr[column] c_result cdef numeric_scalar[size_type]* cpp_start cdef numeric_scalar[size_type]* cpp_stop From 5d223b89ce50d9c092e99036d5948643392e944e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 11 Jun 2024 19:32:01 -0700 Subject: [PATCH 08/22] cleanup --- python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd index ef93e97435a..7d8d0006ef4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd @@ -11,5 +11,5 @@ cpdef Column slice_strings( Column input, ColumnOrScalar start=*, ColumnOrScalar stop=*, - ColumnOrScalar step=* + Scalar step=* ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx index 46ed1e29c0a..f10a890c9ab 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -27,7 +27,7 @@ cpdef Column slice_strings( ``start`` and ``stop`` may be a :py:class:`~cudf._lib.pylibcudf.column.Column` or a :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a - :py:class:`~cudf._lib.pylibcudf.column.Scalar`. + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. For details, see :cpp:func:`cudf::strings::slice_strings`. From fb1a0a10cb165257c983e74dbc4db33c9801d95d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 17 Jun 2024 11:30:23 -0700 Subject: [PATCH 09/22] address some reviews --- .../cudf/_lib/pylibcudf/strings/slice.pyx | 6 +-- python/cudf/cudf/_lib/strings/substring.pyx | 38 +++++-------------- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx index f10a890c9ab..50f53230608 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -65,15 +65,15 @@ cpdef Column slice_strings( elif ColumnOrScalar is Scalar: if start is None: - delimiters = Scalar.from_libcudf( + start = Scalar.from_libcudf( cpp_make_fixed_width_scalar(0) ) if stop is None: - delimiters = Scalar.from_libcudf( + stop = Scalar.from_libcudf( cpp_make_fixed_width_scalar(0) ) if step is None: - delimiters = Scalar.from_libcudf( + step = Scalar.from_libcudf( cpp_make_fixed_width_scalar(1) ) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx index ccdd4d8623d..706c21c0634 100644 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ b/python/cudf/cudf/_lib/strings/substring.pyx @@ -2,22 +2,12 @@ import numpy as np -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.substring cimport ( - slice_strings as cpp_slice_strings, -) -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar import as_device_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from cudf._lib.scalar cimport DeviceScalar import cudf._lib.pylibcudf as plc @@ -74,8 +64,7 @@ def get(Column source_strings, character from each input string. The index of characters required can be controlled by passing `index`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() + if index < 0: next_index = index - 1 step = -1 @@ -86,20 +75,11 @@ def get(Column source_strings, cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) From 78fe2672a766630e541913424aacaf82e6ad1196 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 24 Jun 2024 18:27:59 -0700 Subject: [PATCH 10/22] slice edge case --- python/cudf/cudf/pylibcudf_tests/test_string_slice.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py index cddebe36a1b..27edfff0036 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -76,6 +76,8 @@ def test_slice_column( pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col ): def slice_string(st, start, stop): + if stop < 0: + stop = len(st) return st[start:stop] if st is not None else None expected = pa.array( From 4e07bf50d8029c3bca05425bb359c1069f0084d4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 25 Jun 2024 07:37:00 -0700 Subject: [PATCH 11/22] plumb polars string slice --- python/cudf_polars/cudf_polars/dsl/expr.py | 25 +++++++++++++++++++ .../tests/expressions/test_stringfunction.py | 6 +++++ 2 files changed, 31 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c92e0714d54..2dda3b95c4a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -669,6 +669,7 @@ def _validate_input(self): pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.Slice, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -710,6 +711,30 @@ def do_evaluate( flags=plc.strings.regex_flags.RegexFlags.DEFAULT, ) return Column(plc.strings.contains.contains_re(column.obj, prog)) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_start, expr_length = self.children + assert isinstance(expr_start, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Do this math on the host + stop = Literal( + expr_length.dtype, + pa.scalar( + expr_start.value.as_py() + expr_length.value.as_py(), + type=pa.int32(), + ), + ).evaluate(df, context=context, mapping=mapping) + start = expr_start.evaluate(df, context=context, mapping=mapping) + + return Column( + plc.strings.slice.slice_strings( + column.obj, start.obj_scalar, stop.obj_scalar + ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 3c498fe7286..ef611dff26e 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -104,3 +104,9 @@ def test_contains_invalid(ldf): query.collect() with pytest.raises(pl.exceptions.ComputeError): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) + + +@pytest.mark.parametrize("offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5)]) +def test_slice(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) From b690193572909f0248f289609d93baed20bf6b30 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 28 Jun 2024 13:32:32 -0700 Subject: [PATCH 12/22] add column logic --- .../cudf/_lib/pylibcudf/column_factories.pyx | 15 ++++ python/cudf_polars/cudf_polars/dsl/expr.py | 85 +++++++++++++++---- .../tests/expressions/test_stringfunction.py | 32 +++++++ 3 files changed, 114 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx index ef7f512f0e5..94d9590757e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -2,8 +2,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cython.operator import dereference + from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( + make_column_from_scalar as cpp_make_column_from_scalar, make_duration_column as cpp_make_duration_column, make_empty_column as cpp_make_empty_column, make_fixed_point_column as cpp_make_fixed_point_column, @@ -13,6 +16,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( ) from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type +from .scalar cimport Scalar from .types cimport DataType, type_id from .types import MaskState, TypeId @@ -203,3 +207,14 @@ cpdef Column make_fixed_width_column( ) return Column.from_libcudf(move(result)) + +cpdef Column make_column_from_scalar(Scalar input, size_type size): + cdef unique_ptr[column] result + with nogil: + result = move( + cpp_make_column_from_scalar( + dereference(input.c_obj), + size + ) + ) + return Column.from_libcudf(move(result)) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index ad79a39819c..8551258fed0 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -713,28 +713,77 @@ def do_evaluate( return Column(plc.strings.contains.contains_re(column.obj, prog)) elif self.name == pl_expr.StringFunction.Slice: child, expr_start, expr_length = self.children - assert isinstance(expr_start, Literal) - assert isinstance(expr_length, Literal) - column = child.evaluate(df, context=context, mapping=mapping) + # Case 1 - both are literals + if isinstance(expr_start, Literal) and isinstance(expr_length, Literal): + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Do this math on the host + start = expr_start.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # -1 is not inclusive of the end element in libcudf + # However a null scalar scans to the end + stop = start + length if length else None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif isinstance(expr_start, Col) and isinstance(expr_length, Col): + offsets_col = expr_start.evaluate(df, context=context, mapping=mapping) + length_col = expr_length.evaluate(df, context=context, mapping=mapping) + breakpoint() + if_ = plc.binaryop.binary_operation( + offsets_col.obj, + plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), + plc.binaryop.BinaryOperator.LESS, + plc.DataType(plc.TypeId.BOOL8), + ) + otherwise = plc.binaryop.binary_operation( + offsets_col.obj, + length_col.obj, + plc.binaryop.BinaryOperator.ADD, + plc.DataType(plc.TypeId.INT32), + ) + new_starts = plc.copying.copy_if_else(otherwise, offsets_col.obj, if_) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Do this math on the host - stop = Literal( - expr_length.dtype, - pa.scalar( - expr_start.value.as_py() + expr_length.value.as_py(), - type=pa.int32(), - ), - ).evaluate(df, context=context, mapping=mapping) - start = expr_start.evaluate(df, context=context, mapping=mapping) + stops = plc.binaryop.binary_operation( + new_starts, + length_col.obj, + plc.binaryop.BinaryOperator.ADD, + plc.DataType(plc.TypeId.INT32), + ) - return Column( - plc.strings.slice.slice_strings( - column.obj, start.obj_scalar, stop.obj_scalar + stops_lt_0 = plc.binaryop.binary_operation( + stops, + plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), + plc.binaryop.BinaryOperator.LESS, + plc.DataType(plc.TypeId.BOOL8), ) - ) + + stops = plc.copying.copy_if_else( + plc.column_factories.make_column_from_scalar( + plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), + df.num_rows, + ), + stops, + stops_lt_0, + ) + return Column( + plc.strings.slice.slice_strings(column.obj, new_starts, stops) + ) + + elif isinstance(expr_start, Col) and isinstance(expr_length, Literal): + raise NotImplementedError # TODO: finish + else: + raise NotImplementedError # TODO: finish + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index ef611dff26e..640e7d11f14 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -110,3 +110,35 @@ def test_contains_invalid(ldf): def test_slice(ldf, offset, length): query = ldf.select(pl.col("a").str.slice(offset, length)) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) +def test_slice_scalars_offset(ldf, offset): + query = ldf.select(pl.col("a").str.slice(offset)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100)] +) +def test_slice_scalars_length_and_offset(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) + + +def test_slice_column(): + df = pl.LazyFrame( + { + "a": ["a", "bcdefesf", "gsdfasd"], + "b": pl.Series([1, 2, 3], dtype=pl.Int32()), + "c": pl.Series([3, 5, 3], dtype=pl.Int32()), + } + ) + query = df.select(pl.col.a.str.slice(pl.col.b, pl.col.c)) + assert_gpu_result_equal(query) + + +# def test_slice_column_and_literal(): +# df = pl.LazyFrame({"a": ["a", "bcdefesf", "gsdfasd"], "b": pl.Series([1, 2, -2], dtype=pl.Int32())}) +# query = df.select(pl.col.a.str.slice(pl.col.b, 2)) +# assert_gpu_result_equal(query) From 02072b01edb19584124f994f2184fb54f3c415d1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 28 Jun 2024 14:38:01 -0700 Subject: [PATCH 13/22] funnel scalar cases --- python/cudf_polars/cudf_polars/dsl/expr.py | 34 ++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 8551258fed0..38e557e8ed9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -712,14 +712,14 @@ def do_evaluate( ) return Column(plc.strings.contains.contains_re(column.obj, prog)) elif self.name == pl_expr.StringFunction.Slice: - child, expr_start, expr_length = self.children + child, expr_offset, expr_length = self.children column = child.evaluate(df, context=context, mapping=mapping) # Case 1 - both are literals - if isinstance(expr_start, Literal) and isinstance(expr_length, Literal): + if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal): # libcudf slices via [start,stop). # polars slices with offset + length where start == offset # stop = start + length. Do this math on the host - start = expr_start.value.as_py() + start = expr_offset.value.as_py() length = expr_length.value.as_py() if length == 0: @@ -735,10 +735,25 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) - elif isinstance(expr_start, Col) and isinstance(expr_length, Col): - offsets_col = expr_start.evaluate(df, context=context, mapping=mapping) - length_col = expr_length.evaluate(df, context=context, mapping=mapping) - breakpoint() + else: + # funnel to column overload + if isinstance(expr_offset, Literal): + offsets_col = plc.column_factories.make_column_from_scalar( + expr_offset.value, df.num_rows + ) + else: + offsets_col = expr_offset.evaluate( + df, context=context, mapping=mapping + ) + if isinstance(expr_length, Literal): + length_col = plc.column_factories.make_column_from_scalar( + expr_length.value, df.num_rows + ) + else: + length_col = expr_length.evaluate( + df, context=context, mapping=mapping + ) + if_ = plc.binaryop.binary_operation( offsets_col.obj, plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), @@ -779,11 +794,6 @@ def do_evaluate( plc.strings.slice.slice_strings(column.obj, new_starts, stops) ) - elif isinstance(expr_start, Col) and isinstance(expr_length, Literal): - raise NotImplementedError # TODO: finish - else: - raise NotImplementedError # TODO: finish - columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children From 45e2fc491d706996606a489f3cd062a48347b3c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Jul 2024 07:24:54 -0700 Subject: [PATCH 14/22] updates --- python/cudf_polars/cudf_polars/dsl/expr.py | 64 ++----------------- .../tests/expressions/test_stringfunction.py | 24 ------- 2 files changed, 5 insertions(+), 83 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 38e557e8ed9..65318f161ad 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -683,6 +683,11 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) def do_evaluate( self, @@ -735,65 +740,6 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) - else: - # funnel to column overload - if isinstance(expr_offset, Literal): - offsets_col = plc.column_factories.make_column_from_scalar( - expr_offset.value, df.num_rows - ) - else: - offsets_col = expr_offset.evaluate( - df, context=context, mapping=mapping - ) - if isinstance(expr_length, Literal): - length_col = plc.column_factories.make_column_from_scalar( - expr_length.value, df.num_rows - ) - else: - length_col = expr_length.evaluate( - df, context=context, mapping=mapping - ) - - if_ = plc.binaryop.binary_operation( - offsets_col.obj, - plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), - plc.binaryop.BinaryOperator.LESS, - plc.DataType(plc.TypeId.BOOL8), - ) - otherwise = plc.binaryop.binary_operation( - offsets_col.obj, - length_col.obj, - plc.binaryop.BinaryOperator.ADD, - plc.DataType(plc.TypeId.INT32), - ) - new_starts = plc.copying.copy_if_else(otherwise, offsets_col.obj, if_) - - stops = plc.binaryop.binary_operation( - new_starts, - length_col.obj, - plc.binaryop.BinaryOperator.ADD, - plc.DataType(plc.TypeId.INT32), - ) - - stops_lt_0 = plc.binaryop.binary_operation( - stops, - plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), - plc.binaryop.BinaryOperator.LESS, - plc.DataType(plc.TypeId.BOOL8), - ) - - stops = plc.copying.copy_if_else( - plc.column_factories.make_column_from_scalar( - plc.interop.from_arrow(pa.scalar(0, type=pa.int32())), - df.num_rows, - ), - stops, - stops_lt_0, - ) - return Column( - plc.strings.slice.slice_strings(column.obj, new_starts, stops) - ) - columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 640e7d11f14..951f6f4f8c6 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -106,12 +106,6 @@ def test_contains_invalid(ldf): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) -@pytest.mark.parametrize("offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5)]) -def test_slice(ldf, offset, length): - query = ldf.select(pl.col("a").str.slice(offset, length)) - assert_gpu_result_equal(query) - - @pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) def test_slice_scalars_offset(ldf, offset): query = ldf.select(pl.col("a").str.slice(offset)) @@ -124,21 +118,3 @@ def test_slice_scalars_offset(ldf, offset): def test_slice_scalars_length_and_offset(ldf, offset, length): query = ldf.select(pl.col("a").str.slice(offset, length)) assert_gpu_result_equal(query) - - -def test_slice_column(): - df = pl.LazyFrame( - { - "a": ["a", "bcdefesf", "gsdfasd"], - "b": pl.Series([1, 2, 3], dtype=pl.Int32()), - "c": pl.Series([3, 5, 3], dtype=pl.Int32()), - } - ) - query = df.select(pl.col.a.str.slice(pl.col.b, pl.col.c)) - assert_gpu_result_equal(query) - - -# def test_slice_column_and_literal(): -# df = pl.LazyFrame({"a": ["a", "bcdefesf", "gsdfasd"], "b": pl.Series([1, 2, -2], dtype=pl.Int32())}) -# query = df.select(pl.col.a.str.slice(pl.col.b, 2)) -# assert_gpu_result_equal(query) From 326c3219956e59d4971a3a88c1c885e70c74855d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Jul 2024 07:31:22 -0700 Subject: [PATCH 15/22] add back future column tests --- .../tests/expressions/test_stringfunction.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 951f6f4f8c6..236ec33f47d 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -34,6 +34,28 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) +@pytest.fixture( + params=[ + (1, None), + (-2, None), + (-100, None), + (1, 1), + (-2, 2), + (-100, 3), + (0, 0), + (0, 1000), + ] +) +def slice_column_data(ldf, request): + start, length = request.param + if length: + return ldf.with_columns( + pl.lit(start).alias("start"), pl.lit(length).alias("length") + ) + else: + return ldf.with_columns(pl.lit(start).alias("start")) + + def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), @@ -118,3 +140,14 @@ def test_slice_scalars_offset(ldf, offset): def test_slice_scalars_length_and_offset(ldf, offset, length): query = ldf.select(pl.col("a").str.slice(offset, length)) assert_gpu_result_equal(query) + + +def test_slice_column(slice_column_data): + if "length" in slice_column_data.columns: + query = slice_column_data.select( + pl.col("a").str.slice(pl.col("start"), pl.col("length")) + ) + else: + query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) + with pytest.raises(pl.exceptions.ComputeError): + assert_gpu_result_equal(query) From ed9b119205f3bb4f83d0cfc742a2c4de05474df1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Jul 2024 07:33:03 -0700 Subject: [PATCH 16/22] cleanup --- .../cudf/cudf/_lib/pylibcudf/column_factories.pyx | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx index 94d9590757e..ef7f512f0e5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -2,11 +2,8 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cython.operator import dereference - from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( - make_column_from_scalar as cpp_make_column_from_scalar, make_duration_column as cpp_make_duration_column, make_empty_column as cpp_make_empty_column, make_fixed_point_column as cpp_make_fixed_point_column, @@ -16,7 +13,6 @@ from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( ) from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type -from .scalar cimport Scalar from .types cimport DataType, type_id from .types import MaskState, TypeId @@ -207,14 +203,3 @@ cpdef Column make_fixed_width_column( ) return Column.from_libcudf(move(result)) - -cpdef Column make_column_from_scalar(Scalar input, size_type size): - cdef unique_ptr[column] result - with nogil: - result = move( - cpp_make_column_from_scalar( - dereference(input.c_obj), - size - ) - ) - return Column.from_libcudf(move(result)) From 920a2e1cc73ac97096af9d169f20efa637d10820 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Jul 2024 07:34:43 -0700 Subject: [PATCH 17/22] s :) --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 65318f161ad..a3ade347d95 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -723,7 +723,7 @@ def do_evaluate( if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal): # libcudf slices via [start,stop). # polars slices with offset + length where start == offset - # stop = start + length. Do this math on the host + # stop = start + length. Do this maths on the host start = expr_offset.value.as_py() length = expr_length.value.as_py() From dcfa6c611b70eb4b512ed9decda7543d92ae3451 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 1 Jul 2024 07:37:00 -0700 Subject: [PATCH 18/22] update comments --- python/cudf_polars/cudf_polars/dsl/expr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a3ade347d95..607620f5f1b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -719,7 +719,6 @@ def do_evaluate( elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children column = child.evaluate(df, context=context, mapping=mapping) - # Case 1 - both are literals if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal): # libcudf slices via [start,stop). # polars slices with offset + length where start == offset @@ -730,8 +729,8 @@ def do_evaluate( if length == 0: stop = start else: - # -1 is not inclusive of the end element in libcudf - # However a null scalar scans to the end + # No length indicates a scan to the end + # The libcudf equivalent is a null stop stop = start + length if length else None return Column( plc.strings.slice.slice_strings( From 2628ddeb7aa3a0e6ddbfa5afcc38181450e8cc61 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 2 Jul 2024 09:59:23 -0700 Subject: [PATCH 19/22] address reviews --- python/cudf_polars/cudf_polars/dsl/expr.py | 45 +++++++++++-------- .../tests/expressions/test_stringfunction.py | 25 +++-------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 29c6d43042b..c54650c4fb5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -752,27 +752,34 @@ def do_evaluate( return Column(plc.strings.contains.contains_re(column.obj, prog)) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + column = child.evaluate(df, context=context, mapping=mapping) - if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal): - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. + # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if start < 0 and length > -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 191fabaef7f..512ef228753 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -37,18 +37,10 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) -@pytest.fixture( - params=[ - (1, None), - (-2, None), - (-100, None), - (1, 1), - (-2, 2), - (-100, 3), - (0, 0), - (0, 1000), - ] -) +slice_cases = [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100), (-3, 4)] + + +@pytest.fixture(params=slice_cases) def slice_column_data(ldf, request): start, length = request.param if length: @@ -134,20 +126,17 @@ def test_slice_scalars_offset(ldf, offset): assert_gpu_result_equal(query) -@pytest.mark.parametrize( - "offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100)] -) +@pytest.mark.parametrize("offset,length", slice_cases) def test_slice_scalars_length_and_offset(ldf, offset, length): query = ldf.select(pl.col("a").str.slice(offset, length)) assert_gpu_result_equal(query) def test_slice_column(slice_column_data): - if "length" in slice_column_data.columns: + if "length" in slice_column_data.collect_schema(): query = slice_column_data.select( pl.col("a").str.slice(pl.col("start"), pl.col("length")) ) else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) - with pytest.raises(pl.exceptions.ComputeError): - assert_gpu_result_equal(query) + assert_ir_translation_raises(query, NotImplementedError) From cc2cc7c1bcf69bfb6c262623e251811b2f543090 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 3 Jul 2024 05:36:43 -0700 Subject: [PATCH 20/22] update logic --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c54650c4fb5..2fb2bc43b3c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -771,7 +771,7 @@ def do_evaluate( # No length indicates a scan to the end # The libcudf equivalent is a null stop stop = start + length if length else None - if start < 0 and length > -start: + if length and start < 0 and length > -start: stop = None return Column( plc.strings.slice.slice_strings( From 27dfef8d0db1147e50c8c893d6d2e5401326898c Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:24:02 -0500 Subject: [PATCH 21/22] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- python/cudf_polars/tests/expressions/test_stringfunction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 2fb2bc43b3c..cfc2947f8de 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -771,7 +771,7 @@ def do_evaluate( # No length indicates a scan to the end # The libcudf equivalent is a null stop stop = start + length if length else None - if length and start < 0 and length > -start: + if length and start < 0 and length >= -start: stop = None return Column( plc.strings.slice.slice_strings( diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 512ef228753..4359b849629 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -37,7 +37,7 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) -slice_cases = [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100), (-3, 4)] +slice_cases = [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100), (-3, 4), (-3, 3)] @pytest.fixture(params=slice_cases) From 96376361255d3d76028aff26d1339a76e0e88f30 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 3 Jul 2024 09:40:12 -0700 Subject: [PATCH 22/22] style --- .../tests/expressions/test_stringfunction.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 4359b849629..8cf65dd51ac 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -37,7 +37,17 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) -slice_cases = [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100), (-3, 4), (-3, 3)] +slice_cases = [ + (1, 3), + (0, 3), + (0, 0), + (-3, 1), + (-100, 5), + (1, 1), + (100, 100), + (-3, 4), + (-3, 3), +] @pytest.fixture(params=slice_cases)