Skip to content

Commit 3a62314

Browse files
authored
Upgrade to polars 1.11 in cudf-polars (#17154)
Polars 1.11 is out, with slight updates to the IR, so we can correctly raise for dynamic groupbys and see inequality joins. These changes adapt to that and do a first pass at supporting inequality joins (by translating to cross join + filter). A follow-up (#17000) will use libcudf's conditional joins. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Sarahan (https://github.com/msarahan) URL: #17154
1 parent d7cdf44 commit 3a62314

File tree

9 files changed

+172
-29
lines changed

9 files changed

+172
-29
lines changed

conda/environments/all_cuda-118_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ dependencies:
6565
- pandas
6666
- pandas>=2.0,<2.2.4dev0
6767
- pandoc
68-
- polars>=1.8,<1.9
68+
- polars>=1.11,<1.12
6969
- pre-commit
7070
- ptxcompiler
7171
- pyarrow>=14.0.0,<18.0.0a0

conda/environments/all_cuda-125_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ dependencies:
6363
- pandas
6464
- pandas>=2.0,<2.2.4dev0
6565
- pandoc
66-
- polars>=1.8,<1.9
66+
- polars>=1.11,<1.12
6767
- pre-commit
6868
- pyarrow>=14.0.0,<18.0.0a0
6969
- pydata-sphinx-theme!=0.14.2

conda/recipes/cudf-polars/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ requirements:
4343
run:
4444
- python
4545
- pylibcudf ={{ version }}
46-
- polars >=1.8,<1.9
46+
- polars >=1.11,<1.12
4747
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
4848

4949
test:

dependencies.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ dependencies:
727727
common:
728728
- output_types: [conda, requirements, pyproject]
729729
packages:
730-
- polars>=1.8,<1.9
730+
- polars>=1.11,<1.12
731731
run_dask_cudf:
732732
common:
733733
- output_types: [conda, requirements, pyproject]

python/cudf_polars/cudf_polars/dsl/ir.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -666,11 +666,11 @@ def __init__(
666666
raise NotImplementedError(
667667
"rolling window/groupby"
668668
) # pragma: no cover; rollingwindow constructor has already raised
669+
if self.options.dynamic:
670+
raise NotImplementedError("dynamic group by")
669671
if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
670672
raise NotImplementedError("Nested aggregations in groupby")
671673
self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
672-
if len(self.keys) == 0:
673-
raise NotImplementedError("dynamic groupby")
674674

675675
@staticmethod
676676
def check_agg(agg: expr.Expr) -> int:
@@ -802,10 +802,10 @@ class Join(IR):
802802
right_on: tuple[expr.NamedExpr, ...]
803803
"""List of expressions used as keys in the right frame."""
804804
options: tuple[
805-
Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"],
805+
Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
806806
bool,
807807
tuple[int, int] | None,
808-
str | None,
808+
str,
809809
bool,
810810
]
811811
"""
@@ -840,7 +840,7 @@ def __init__(
840840
@staticmethod
841841
@cache
842842
def _joiners(
843-
how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"],
843+
how: Literal["inner", "left", "right", "full", "semi", "anti"],
844844
) -> tuple[
845845
Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
846846
]:
@@ -862,13 +862,13 @@ def _joiners(
862862
plc.copying.OutOfBoundsPolicy.NULLIFY,
863863
plc.copying.OutOfBoundsPolicy.NULLIFY,
864864
)
865-
elif how == "leftsemi":
865+
elif how == "semi":
866866
return (
867867
plc.join.left_semi_join,
868868
plc.copying.OutOfBoundsPolicy.DONT_CHECK,
869869
None,
870870
)
871-
elif how == "leftanti":
871+
elif how == "anti":
872872
return (
873873
plc.join.left_anti_join,
874874
plc.copying.OutOfBoundsPolicy.DONT_CHECK,
@@ -933,7 +933,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
933933
"""Evaluate and return a dataframe."""
934934
left, right = (c.evaluate(cache=cache) for c in self.children)
935935
how, join_nulls, zlice, suffix, coalesce = self.options
936-
suffix = "_right" if suffix is None else suffix
937936
if how == "cross":
938937
# Separate implementation, since cross_join returns the
939938
# result, not the gather maps
@@ -955,7 +954,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
955954
columns[left.num_columns :], right.column_names, strict=True
956955
)
957956
]
958-
return DataFrame([*left_cols, *right_cols])
957+
return DataFrame([*left_cols, *right_cols]).slice(zlice)
959958
# TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
960959
left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on)))
961960
right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on)))

python/cudf_polars/cudf_polars/dsl/translate.py

Lines changed: 72 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55

66
from __future__ import annotations
77

8+
import functools
89
import json
910
from contextlib import AbstractContextManager, nullcontext
1011
from functools import singledispatch
11-
from typing import Any
12+
from typing import TYPE_CHECKING, Any
1213

1314
import pyarrow as pa
1415
import pylibcudf as plc
@@ -19,9 +20,13 @@
1920
from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
2021

2122
from cudf_polars.dsl import expr, ir
23+
from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
2224
from cudf_polars.typing import NodeTraverser
2325
from cudf_polars.utils import dtypes, sorting
2426

27+
if TYPE_CHECKING:
28+
from cudf_polars.typing import ExprTransformer
29+
2530
__all__ = ["translate_ir", "translate_named_expr"]
2631

2732

@@ -182,7 +187,71 @@ def _(
182187
with set_node(visitor, node.input_right):
183188
inp_right = translate_ir(visitor, n=None)
184189
right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
185-
return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
190+
if (how := node.options[0]) in {
191+
"inner",
192+
"left",
193+
"right",
194+
"full",
195+
"cross",
196+
"semi",
197+
"anti",
198+
}:
199+
return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
200+
else:
201+
how, op1, op2 = how
202+
if how != "ie_join":
203+
raise NotImplementedError(
204+
f"Unsupported join type {how}"
205+
) # pragma: no cover; asof joins not yet exposed
206+
# No exposure of mixed/conditional joins in pylibcudf yet, so in
207+
# the first instance, implement by doing a cross join followed by
208+
# a filter.
209+
_, join_nulls, zlice, suffix, coalesce = node.options
210+
cross = ir.Join(
211+
schema,
212+
[],
213+
[],
214+
("cross", join_nulls, None, suffix, coalesce),
215+
inp_left,
216+
inp_right,
217+
)
218+
dtype = plc.DataType(plc.TypeId.BOOL8)
219+
if op2 is None:
220+
ops = [op1]
221+
else:
222+
ops = [op1, op2]
223+
suffix = cross.options[3]
224+
225+
# Column references in the right table refer to the post-join
226+
# names, so with suffixes.
227+
def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr:
228+
if isinstance(e, expr.Col) and e.name in inp_left.schema:
229+
return type(e)(e.dtype, f"{e.name}{suffix}")
230+
return reuse_if_unchanged(e, rec)
231+
232+
mapper = make_recursive(_rename)
233+
right_on = [
234+
expr.NamedExpr(
235+
f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new
236+
)
237+
for new, old in zip(
238+
(mapper(e.value) for e in right_on), right_on, strict=True
239+
)
240+
]
241+
mask = functools.reduce(
242+
functools.partial(
243+
expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND
244+
),
245+
(
246+
expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value)
247+
for op, left, right in zip(ops, left_on, right_on, strict=True)
248+
),
249+
)
250+
filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross)
251+
if zlice is not None:
252+
offset, length = zlice
253+
return ir.Slice(schema, offset, length, filtered)
254+
return filtered
186255

187256

188257
@_translate_ir.register
@@ -319,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
319388
# IR is versioned with major.minor, minor is bumped for backwards
320389
# compatible changes (e.g. adding new nodes), major is bumped for
321390
# incompatible changes (e.g. renaming nodes).
322-
# Polars 1.7 changes definition of the CSV reader options schema name.
323-
if (version := visitor.version()) >= (3, 0):
391+
if (version := visitor.version()) >= (4, 0):
324392
raise NotImplementedError(
325393
f"No support for polars IR {version=}"
326394
) # pragma: no cover; no such version for now.

python/cudf_polars/cudf_polars/testing/plugin.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config):
5353
"tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
5454
"tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
5555
"tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
56+
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error",
57+
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error",
58+
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error",
59+
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error",
5660
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
61+
"tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
62+
"tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
5763
"tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
5864
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
5965
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
6066
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
6167
"tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
68+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
69+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
70+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394",
71+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394",
72+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394",
73+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394",
74+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394",
75+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394",
76+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394",
77+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394",
78+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394",
79+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394",
80+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394",
81+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
82+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
83+
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
6284
"tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
6385
"tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
6486
"tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
@@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config):
107129
"tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
108130
"tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
109131
"tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
132+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
133+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero",
134+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero",
135+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero",
136+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero",
137+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero",
138+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero",
139+
"tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero",
110140
"tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
111141
"tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
112142
"tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
@@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config):
124154
"tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
125155
"tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
126156
"tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
127-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information",
128-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information",
129-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information",
130-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information",
131-
"tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information",
132-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information",
133-
"tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information",
134157
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
135158
"tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
136159
"tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
@@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config):
140163
"tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
141164
"tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
142165
"tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
166+
"tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
143167
# Maybe flaky, order-dependent?
144168
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
145169
"tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",

python/cudf_polars/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ authors = [
1919
license = { text = "Apache 2.0" }
2020
requires-python = ">=3.10"
2121
dependencies = [
22-
"polars>=1.8,<1.9",
22+
"polars>=1.11,<1.12",
2323
"pylibcudf==24.12.*,>=0.0.0a0",
2424
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
2525
classifiers = [

0 commit comments

Comments (0)