Use a balanced tree instead of an unbalanced one (#1830)

koenvo · web-flow · commit bae62dfee6bb · 2025-03-31T14:24:48.000+02:00
**Use a balanced tree instead of an unbalanced one to prevent recursion error in `create_match_filter`**

## Rationale for this change

In the `create_match_filter` function, the previous implementation used `functools.reduce(operator.or_, filters)` to combine expressions. Because `reduce` folds from the left, this approach constructed a left-deep, unbalanced tree whose depth grows linearly with the number of expressions, which could lead to a `RecursionError` when dealing with a large number of expressions (e.g., over 1,000).

To address this, we've introduced the `_build_balanced_tree` function. This utility constructs a balanced binary tree of expressions, reducing the maximum depth to O(log n) and thereby preventing potential recursion errors. This makes expression construction more stable and scalable, especially when working with large datasets.

```python
# Past behavior
Or(*[A, B, C, D]) = Or(Or(Or(A, B), C), D)

# New behavior
Or(*[A, B, C, D]) = Or(Or(A, B), Or(C, D))
```

## Are these changes tested?

Yes, existing tests cover the functionality of `Or`. Additional testing was done with large expression sets (e.g., 10,000 items) to ensure that balanced tree construction avoids recursion errors.

## Are there any user-facing changes?

No, there are no user-facing changes. This is an internal implementation improvement that does not affect the public API.

Closes #1759
Closes #1785
diff --git a/pyiceberg/expressions/__init__.py b/pyiceberg/expressions/__init__.py
@@ -18,11 +18,13 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from functools import cached_property, reduce
+from functools import cached_property
 from typing import (
     Any,
+    Callable,
     Generic,
     Iterable,
+    Sequence,
     Set,
     Tuple,
     Type,
@@ -79,6 +81,45 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression:
         return Or(self, other)
 
 
+def _build_balanced_tree(
+    operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression]
+) -> BooleanExpression:
+    """
+    Recursively constructs a balanced binary tree of BooleanExpressions using the provided binary operator.
+
+    This function is a safer and more scalable alternative to:
+        reduce(operator_, items)
+
+    Using `reduce` creates a deeply nested, left-leaning tree (e.g., operator_(operator_(operator_(a, b), c), ...)),
+    whose depth grows linearly with the number of items. This can lead to RecursionError exceptions in Python
+    when the number of expressions is large (e.g., >1000).
+
+    In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)),
+    helping avoid recursion issues and ensuring that expression trees remain stable, predictable,
+    and safe to traverse — especially in tools like PyIceberg that operate on large logical trees.
+
+    Parameters:
+        operator_ (Callable): A binary operator function (e.g., pyiceberg.expressions.Or, And) that takes two
+            BooleanExpressions and returns a combined BooleanExpression.
+        items (Sequence[BooleanExpression]): A sequence of BooleanExpression objects to combine.
+
+    Returns:
+        BooleanExpression: The balanced combination of all input BooleanExpressions.
+
+    Raises:
+        ValueError: If the input sequence is empty.
+    """
+    if not items:
+        raise ValueError("No expressions to combine")
+    if len(items) == 1:
+        return items[0]
+    mid = len(items) // 2
+
+    left = _build_balanced_tree(operator_, items[:mid])
+    right = _build_balanced_tree(operator_, items[mid:])
+    return operator_(left, right)
+
+
 class Term(Generic[L], ABC):
     """A simple expression that evaluates to a value."""
 
@@ -214,7 +255,7 @@ class And(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(And, (left, right, *rest))
+            return _build_balanced_tree(And, (left, right, *rest))
         if left is AlwaysFalse() or right is AlwaysFalse():
             return AlwaysFalse()
         elif left is AlwaysTrue():
@@ -257,7 +298,7 @@ class Or(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(Or, (left, right, *rest))
+            return _build_balanced_tree(Or, (left, right, *rest))
         if left is AlwaysTrue() or right is AlwaysTrue():
             return AlwaysTrue()
         elif left is AlwaysFalse():
diff --git a/pyiceberg/table/upsert_util.py b/pyiceberg/table/upsert_util.py
@@ -26,6 +26,7 @@
     BooleanExpression,
     EqualTo,
     In,
+    Or,
 )
 
 
@@ -39,7 +40,12 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpre
             functools.reduce(operator.and_, [EqualTo(col, row[col]) for col in join_cols]) for row in unique_keys.to_pylist()
         ]
 
-        return AlwaysFalse() if len(filters) == 0 else functools.reduce(operator.or_, filters)
+        if len(filters) == 0:
+            return AlwaysFalse()
+        elif len(filters) == 1:
+            return filters[0]
+        else:
+            return Or(*filters)
 
 
 def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:
diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py
@@ -591,11 +591,11 @@ def test_negate(lhs: BooleanExpression, rhs: BooleanExpression) -> None:
     [
         (
             And(ExpressionA(), ExpressionB(), ExpressionA()),
-            And(And(ExpressionA(), ExpressionB()), ExpressionA()),
+            And(ExpressionA(), And(ExpressionB(), ExpressionA())),
         ),
         (
             Or(ExpressionA(), ExpressionB(), ExpressionA()),
-            Or(Or(ExpressionA(), ExpressionB()), ExpressionA()),
+            Or(ExpressionA(), Or(ExpressionB(), ExpressionA())),
         ),
         (Not(Not(ExpressionA())), ExpressionA()),
     ],
diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py
@@ -230,14 +230,14 @@ def test_boolean_expression_visitor() -> None:
         "NOT",
         "OR",
         "EQUALTO",
-        "OR",
         "NOTEQUALTO",
         "OR",
+        "OR",
         "EQUALTO",
         "NOT",
-        "AND",
         "NOTEQUALTO",
         "AND",
+        "AND",
     ]
 
 
@@ -335,28 +335,28 @@ def test_always_false_or_always_true_expression_binding(table_schema_simple: Sch
                 ),
             ),
             And(
-                And(
-                    BoundIn(
-                        BoundReference(
-                            field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
-                            accessor=Accessor(position=0, inner=None),
-                        ),
-                        {literal("bar"), literal("baz")},
+                BoundIn(
+                    BoundReference(
+                        field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
+                        accessor=Accessor(position=0, inner=None),
                     ),
+                    {literal("bar"), literal("baz")},
+                ),
+                And(
                     BoundEqualTo[int](
                         BoundReference(
                             field=NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True),
                             accessor=Accessor(position=1, inner=None),
                         ),
                         literal(1),
                     ),
-                ),
-                BoundEqualTo(
-                    BoundReference(
-                        field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
-                        accessor=Accessor(position=0, inner=None),
+                    BoundEqualTo(
+                        BoundReference(
+                            field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
+                            accessor=Accessor(position=0, inner=None),
+                        ),
+                        literal("baz"),
                     ),
-                    literal("baz"),
                 ),
             ),
         ),
@@ -408,28 +408,28 @@ def test_and_expression_binding(
                 ),
             ),
             Or(
+                BoundIn(
+                    BoundReference(
+                        field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
+                        accessor=Accessor(position=0, inner=None),
+                    ),
+                    {literal("bar"), literal("baz")},
+                ),
                 Or(
                     BoundIn(
                         BoundReference(
                             field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
                             accessor=Accessor(position=0, inner=None),
                         ),
-                        {literal("bar"), literal("baz")},
+                        {literal("bar")},
                     ),
                     BoundIn(
                         BoundReference(
                             field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
                             accessor=Accessor(position=0, inner=None),
                         ),
-                        {literal("bar")},
-                    ),
-                ),
-                BoundIn(
-                    BoundReference(
-                        field=NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
-                        accessor=Accessor(position=0, inner=None),
+                        {literal("baz")},
                     ),
-                    {literal("baz")},
                 ),
             ),
         ),
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
@@ -836,5 +836,5 @@ def test_expression_to_complementary_pyarrow(
     # Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr.
     assert (
         repr(result)
-        == """<pyarrow.compute.Expression (((invert((((((string_field == "hello") and (float_field > 100)) or (is_nan(float_field) and (double_field == 0))) or (float_field > 100)) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
+        == """<pyarrow.compute.Expression (((invert(((((string_field == "hello") and (float_field > 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
     )

Original file line number	Diff line number	Diff line change
`@@ -591,11 +591,11 @@ def test_negate(lhs: BooleanExpression, rhs: BooleanExpression) -> None:`
`591`	`591`	`[`
`592`	`592`	`(`
`593`	`593`	`And(ExpressionA(), ExpressionB(), ExpressionA()),`
`594`		`- And(And(ExpressionA(), ExpressionB()), ExpressionA()),`
	`594`	`+ And(ExpressionA(), And(ExpressionB(), ExpressionA())),`
`595`	`595`	`),`
`596`	`596`	`(`
`597`	`597`	`Or(ExpressionA(), ExpressionB(), ExpressionA()),`
`598`		`- Or(Or(ExpressionA(), ExpressionB()), ExpressionA()),`
	`598`	`+ Or(ExpressionA(), Or(ExpressionB(), ExpressionA())),`
`599`	`599`	`),`
`600`	`600`	`(Not(Not(ExpressionA())), ExpressionA()),`
`601`	`601`	`],`
Original file line number	Diff line number	Diff line change
`@@ -836,5 +836,5 @@ def test_expression_to_complementary_pyarrow(`
`836`	`836`	`# Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr.`
`837`	`837`	`assert (`
`838`	`838`	`repr(result)`
`839`		`- == """<pyarrow.compute.Expression (((invert((((((string_field == "hello") and (float_field > 100)) or (is_nan(float_field) and (double_field == 0))) or (float_field > 100)) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""`
	`839`	`+ == """<pyarrow.compute.Expression (((invert(((((string_field == "hello") and (float_field > 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""`
`840`	`840`	`)`