|
18 | 18 | from __future__ import annotations
|
19 | 19 |
|
20 | 20 | from abc import ABC, abstractmethod
|
21 |
| -from functools import cached_property, reduce |
| 21 | +from functools import cached_property |
22 | 22 | from typing import (
|
23 | 23 | Any,
|
| 24 | + Callable, |
24 | 25 | Generic,
|
25 | 26 | Iterable,
|
| 27 | + Sequence, |
26 | 28 | Set,
|
27 | 29 | Tuple,
|
28 | 30 | Type,
|
@@ -79,6 +81,45 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression:
|
79 | 81 | return Or(self, other)
|
80 | 82 |
|
81 | 83 |
|
| 84 | +def _build_balanced_tree( |
| 85 | + operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression] |
| 86 | +) -> BooleanExpression: |
| 87 | + """ |
| 88 | + Recursively constructs a balanced binary tree of BooleanExpressions using the provided binary operator. |
| 89 | +
|
| 90 | + This function is a safer and more scalable alternative to: |
| 91 | + reduce(operator_, items) |
| 92 | +
|
| 93 | + Using `reduce` creates a deeply nested, unbalanced tree (e.g., operator_(a, operator_(b, operator_(c, ...)))), |
| 94 | + which grows linearly with the number of items. This can lead to RecursionError exceptions in Python |
| 95 | + when the number of expressions is large (e.g., >1000). |
| 96 | +
|
| 97 | + In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)), |
| 98 | + helping avoid recursion issues and ensuring that expression trees remain stable, predictable, |
| 99 | + and safe to traverse — especially in tools like PyIceberg that operate on large logical trees. |
| 100 | +
|
| 101 | + Parameters: |
| 102 | + operator_ (Callable): A binary operator function (e.g., pyiceberg.expressions.Or, And) that takes two |
| 103 | + BooleanExpressions and returns a combined BooleanExpression. |
| 104 | + items (Sequence[BooleanExpression]): A sequence of BooleanExpression objects to combine. |
| 105 | +
|
| 106 | + Returns: |
| 107 | + BooleanExpression: The balanced combination of all input BooleanExpressions. |
| 108 | +
|
| 109 | + Raises: |
| 110 | + ValueError: If the input sequence is empty. |
| 111 | + """ |
| 112 | + if not items: |
| 113 | + raise ValueError("No expressions to combine") |
| 114 | + if len(items) == 1: |
| 115 | + return items[0] |
| 116 | + mid = len(items) // 2 |
| 117 | + |
| 118 | + left = _build_balanced_tree(operator_, items[:mid]) |
| 119 | + right = _build_balanced_tree(operator_, items[mid:]) |
| 120 | + return operator_(left, right) |
| 121 | + |
| 122 | + |
82 | 123 | class Term(Generic[L], ABC):
|
83 | 124 | """A simple expression that evaluates to a value."""
|
84 | 125 |
|
@@ -214,7 +255,7 @@ class And(BooleanExpression):
|
214 | 255 |
|
215 | 256 | def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore
|
216 | 257 | if rest:
|
217 |
| - return reduce(And, (left, right, *rest)) |
| 258 | + return _build_balanced_tree(And, (left, right, *rest)) |
218 | 259 | if left is AlwaysFalse() or right is AlwaysFalse():
|
219 | 260 | return AlwaysFalse()
|
220 | 261 | elif left is AlwaysTrue():
|
@@ -257,7 +298,7 @@ class Or(BooleanExpression):
|
257 | 298 |
|
258 | 299 | def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore
|
259 | 300 | if rest:
|
260 |
| - return reduce(Or, (left, right, *rest)) |
| 301 | + return _build_balanced_tree(Or, (left, right, *rest)) |
261 | 302 | if left is AlwaysTrue() or right is AlwaysTrue():
|
262 | 303 | return AlwaysTrue()
|
263 | 304 | elif left is AlwaysFalse():
|
|
0 commit comments