Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf-polars string slicing #16082

Merged
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
12a6b2c
building
brandon-b-miller Jun 11, 2024
2c33cf7
plumbing to cuDF cython
brandon-b-miller Jun 11, 2024
65e0011
add tests
brandon-b-miller Jun 11, 2024
a6e9ed2
excize improper pyarrow function
brandon-b-miller Jun 12, 2024
e52079c
column tests
brandon-b-miller Jun 12, 2024
d455908
add tests
brandon-b-miller Jun 12, 2024
8e08880
docs
brandon-b-miller Jun 12, 2024
5d223b8
cleanup
brandon-b-miller Jun 12, 2024
c53d07a
Merge branch 'branch-24.08' into pylibcudf-strings-slice
galipremsagar Jun 12, 2024
8605bd2
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 14, 2024
866fefe
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 17, 2024
fb1a0a1
address some reviews
brandon-b-miller Jun 17, 2024
238a583
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 18, 2024
9d70bb2
Merge branch 'branch-24.08' into pylibcudf-strings-slice
brandon-b-miller Jun 24, 2024
78fe267
slice edge case
brandon-b-miller Jun 25, 2024
4e07bf5
plumb polars string slice
brandon-b-miller Jun 25, 2024
4ada5c9
merge latest and resolve conflicts
brandon-b-miller Jun 26, 2024
b690193
add column logic
brandon-b-miller Jun 28, 2024
02072b0
funnel scalar cases
brandon-b-miller Jun 28, 2024
45e2fc4
updates
brandon-b-miller Jul 1, 2024
326c321
add back future column tests
brandon-b-miller Jul 1, 2024
ed9b119
cleanup
brandon-b-miller Jul 1, 2024
920a2e1
s :)
brandon-b-miller Jul 1, 2024
dcfa6c6
update comments
brandon-b-miller Jul 1, 2024
adcf5f8
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 1, 2024
a7c17cf
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 2, 2024
fdaa556
Merge branch 'branch-24.08' into cudf-polars-str-slice
brandon-b-miller Jul 2, 2024
2628dde
address reviews
brandon-b-miller Jul 2, 2024
cc2cc7c
update logic
brandon-b-miller Jul 3, 2024
27dfef8
Apply suggestions from code review
brandon-b-miller Jul 3, 2024
9637636
style
brandon-b-miller Jul 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions python/cudf_polars/cudf_polars/dsl/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,7 @@ def _validate_input(self):
pl_expr.StringFunction.EndsWith,
pl_expr.StringFunction.StartsWith,
pl_expr.StringFunction.Contains,
pl_expr.StringFunction.Slice,
):
raise NotImplementedError(f"String function {self.name}")
if self.name == pl_expr.StringFunction.Contains:
Expand All @@ -716,6 +717,11 @@ def _validate_input(self):
raise NotImplementedError(
"Regex contains only supports a scalar pattern"
)
elif self.name == pl_expr.StringFunction.Slice:
if not all(isinstance(child, Literal) for child in self.children[1:]):
raise NotImplementedError(
"Slice only supports literal start and stop values"
)

def do_evaluate(
self,
Expand Down Expand Up @@ -744,6 +750,29 @@ def do_evaluate(
flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
)
return Column(plc.strings.contains.contains_re(column.obj, prog))
elif self.name == pl_expr.StringFunction.Slice:
child, expr_offset, expr_length = self.children
column = child.evaluate(df, context=context, mapping=mapping)
if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal):
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
# libcudf slices via [start,stop).
# polars slices with offset + length where start == offset
# stop = start + length. Do this maths on the host
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
start = expr_offset.value.as_py()
length = expr_length.value.as_py()

if length == 0:
stop = start
else:
# No length indicates a scan to the end
# The libcudf equivalent is a null stop
stop = start + length if length else None
return Column(
plc.strings.slice.slice_strings(
column.obj,
plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
)
)
columns = [
child.evaluate(df, context=context, mapping=mapping)
for child in self.children
Expand Down
47 changes: 47 additions & 0 deletions python/cudf_polars/tests/expressions/test_stringfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,28 @@ def ldf(with_nulls):
return pl.LazyFrame({"a": a, "b": range(len(a))})


@pytest.fixture(
params=[
(1, None),
(-2, None),
(-100, None),
(1, 1),
(-2, 2),
(-100, 3),
(0, 0),
(0, 1000),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a case where the computed start is negative and the computed stop is positive to ensure that does the right thing? e.g. suppose you have (-3, 4), I think that will produce a start stop pair of [-3, 1) which will be empty for long strings, whereas in polars it would slice from strlen - 3 to the end of the string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some logic for this.

]
)
def slice_column_data(ldf, request):
start, length = request.param
if length:
return ldf.with_columns(
pl.lit(start).alias("start"), pl.lit(length).alias("length")
)
else:
return ldf.with_columns(pl.lit(start).alias("start"))


def test_supported_stringfunction_expression(ldf):
query = ldf.select(
pl.col("a").str.starts_with("Z"),
Expand Down Expand Up @@ -104,3 +126,28 @@ def test_contains_invalid(ldf):
query.collect()
with pytest.raises(pl.exceptions.ComputeError):
query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))


@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100])
def test_slice_scalars_offset(ldf, offset):
query = ldf.select(pl.col("a").str.slice(offset))
assert_gpu_result_equal(query)


@pytest.mark.parametrize(
"offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these parametrizations different to those with columns above? As there, can we particularly add a case where the computed start is negative but the computed stop is non-negative (e.g. (-3, 4) or (-2, 2)? I think those will not do the right thing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merged these

)
def test_slice_scalars_length_and_offset(ldf, offset, length):
query = ldf.select(pl.col("a").str.slice(offset, length))
assert_gpu_result_equal(query)


def test_slice_column(slice_column_data):
if "length" in slice_column_data.columns:
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
query = slice_column_data.select(
pl.col("a").str.slice(pl.col("start"), pl.col("length"))
)
else:
query = slice_column_data.select(pl.col("a").str.slice(pl.col("start")))
with pytest.raises(pl.exceptions.ComputeError):
assert_gpu_result_equal(query)
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
Loading