rapidsai · rapids-bot · Jul 3, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024
@@ -703,6 +703,7 @@ def _validate_input(self):
             pl_expr.StringFunction.EndsWith,
             pl_expr.StringFunction.StartsWith,
             pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.Slice,
         ):
             raise NotImplementedError(f"String function {self.name}")
         if self.name == pl_expr.StringFunction.Contains:
@@ -716,6 +717,11 @@ def _validate_input(self):
                     raise NotImplementedError(
                         "Regex contains only supports a scalar pattern"
                     )
+        elif self.name == pl_expr.StringFunction.Slice:
+            if not all(isinstance(child, Literal) for child in self.children[1:]):
+                raise NotImplementedError(
+                    "Slice only supports literal start and stop values"
+                )
 
     def do_evaluate(
         self,
@@ -744,6 +750,29 @@ def do_evaluate(
                 flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
             )
             return Column(plc.strings.contains.contains_re(column.obj, prog))
+        elif self.name == pl_expr.StringFunction.Slice:
+            child, expr_offset, expr_length = self.children
+            column = child.evaluate(df, context=context, mapping=mapping)
+            if isinstance(expr_offset, Literal) and isinstance(expr_length, Literal):
+                # libcudf slices via [start,stop).
+                # polars slices with offset + length where start == offset
+                # stop = start + length. Do this maths on the host
+                start = expr_offset.value.as_py()
+                length = expr_length.value.as_py()
+
+                if length == 0:
+                    stop = start
+                else:
+                    # No length indicates a scan to the end
+                    # The libcudf equivalent is a null stop
+                    stop = start + length if length else None
+                return Column(
+                    plc.strings.slice.slice_strings(
+                        column.obj,
+                        plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
+                        plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
+                    )
+                )
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children

@@ -37,6 +37,28 @@ def ldf(with_nulls):
     return pl.LazyFrame({"a": a, "b": range(len(a))})
 
 
+@pytest.fixture(
+    params=[
+        (1, None),
+        (-2, None),
+        (-100, None),
+        (1, 1),
+        (-2, 2),
+        (-100, 3),
+        (0, 0),
+        (0, 1000),
+    ]
+)
+def slice_column_data(ldf, request):
+    start, length = request.param
+    if length:
+        return ldf.with_columns(
+            pl.lit(start).alias("start"), pl.lit(length).alias("length")
+        )
+    else:
+        return ldf.with_columns(pl.lit(start).alias("start"))
+
+
 def test_supported_stringfunction_expression(ldf):
     query = ldf.select(
         pl.col("a").str.starts_with("Z"),
@@ -104,3 +126,28 @@ def test_contains_invalid(ldf):
         query.collect()
     with pytest.raises(pl.exceptions.ComputeError):
         query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
+
+
+@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100])
+def test_slice_scalars_offset(ldf, offset):
+    query = ldf.select(pl.col("a").str.slice(offset))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "offset,length", [(1, 3), (0, 3), (0, 0), (-3, 1), (-100, 5), (1, 1), (100, 100)]
+)
+def test_slice_scalars_length_and_offset(ldf, offset, length):
+    query = ldf.select(pl.col("a").str.slice(offset, length))
+    assert_gpu_result_equal(query)
+
+
+def test_slice_column(slice_column_data):
+    if "length" in slice_column_data.columns:
+        query = slice_column_data.select(
+            pl.col("a").str.slice(pl.col("start"), pl.col("length"))
+        )
+    else:
+        query = slice_column_data.select(pl.col("a").str.slice(pl.col("start")))
+    with pytest.raises(pl.exceptions.ComputeError):
+        assert_gpu_result_equal(query)