Skip to content

Commit c2f3d65

Browse files
sdf-jkl and alamb authored
Support API for "pre-image" for pruning predicate evaluation (#19722)
## Which issue does this PR close?

- Closes #18320

## Rationale for this change

Splitting the PR to make it more readable.

## What changes are included in this PR?

Adds the `udf_preimage` logic without the `date_part` implementation.

## Are these changes tested?

Yes: unit tests were added using a test-specific function.

## Are there any user-facing changes?

No

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 736fa7c commit c2f3d65

File tree

6 files changed

+521
-2
lines changed

6 files changed

+521
-2
lines changed

datafusion/expr/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ pub mod statistics {
7777
pub use datafusion_expr_common::statistics::*;
7878
}
7979
mod predicate_bounds;
80+
pub mod preimage;
8081
pub mod ptr_eq;
8182
pub mod test;
8283
pub mod tree_node;

datafusion/expr/src/preimage.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use datafusion_expr_common::interval_arithmetic::Interval;
19+
20+
use crate::Expr;
21+
22+
/// Return from [`crate::ScalarUDFImpl::preimage`]
23+
pub enum PreimageResult {
24+
/// No preimage exists for the specified value
25+
None,
26+
/// The expression always evaluates to the specified constant
27+
/// given that `expr` is within the interval
28+
Range { expr: Expr, interval: Box<Interval> },
29+
}

datafusion/expr/src/udf.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
2020
use crate::async_udf::AsyncScalarUDF;
2121
use crate::expr::schema_name_from_exprs_comma_separated_without_space;
22+
use crate::preimage::PreimageResult;
2223
use crate::simplify::{ExprSimplifyResult, SimplifyContext};
2324
use crate::sort_properties::{ExprProperties, SortProperties};
2425
use crate::udf_eq::UdfEq;
@@ -232,6 +233,18 @@ impl ScalarUDF {
232233
self.inner.is_nullable(args, schema)
233234
}
234235

236+
/// Return a preimage
237+
///
238+
/// See [`ScalarUDFImpl::preimage`] for more details.
239+
pub fn preimage(
240+
&self,
241+
args: &[Expr],
242+
lit_expr: &Expr,
243+
info: &SimplifyContext,
244+
) -> Result<PreimageResult> {
245+
self.inner.preimage(args, lit_expr, info)
246+
}
247+
235248
/// Invoke the function on `args`, returning the appropriate result.
236249
///
237250
/// See [`ScalarUDFImpl::invoke_with_args`] for details.
@@ -696,6 +709,32 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync {
696709
Ok(ExprSimplifyResult::Original(args))
697710
}
698711

712+
/// Returns the [preimage] for this function and the specified scalar value, if any.
713+
///
714+
/// A preimage is a single contiguous [`Interval`] of values where the function
715+
/// will always return `lit_value`
716+
///
717+
/// Implementations should return intervals with an inclusive lower bound and
718+
/// exclusive upper bound.
719+
///
720+
/// This rewrite is described in the [ClickHouse Paper] and is particularly
721+
/// useful for simplifying expressions `date_part` or equivalent functions. The
722+
/// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you
723+
/// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates
724+
/// covering the entire year of 2024. Thus, you can rewrite the expression to `k
725+
/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable.
726+
///
727+
/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf
728+
/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image
729+
fn preimage(
730+
&self,
731+
_args: &[Expr],
732+
_lit_expr: &Expr,
733+
_info: &SimplifyContext,
734+
) -> Result<PreimageResult> {
735+
Ok(PreimageResult::None)
736+
}
737+
699738
/// Returns true if some of this `exprs` subexpressions may not be evaluated
700739
/// and thus any side effects (like divide by zero) may not be encountered.
701740
///
@@ -926,6 +965,15 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
926965
self.inner.simplify(args, info)
927966
}
928967

968+
fn preimage(
969+
&self,
970+
args: &[Expr],
971+
lit_expr: &Expr,
972+
info: &SimplifyContext,
973+
) -> Result<PreimageResult> {
974+
self.inner.preimage(args, lit_expr, info)
975+
}
976+
929977
fn conditional_arguments<'a>(
930978
&self,
931979
args: &'a [Expr],

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ use datafusion_common::{
3939
};
4040
use datafusion_expr::{
4141
BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and,
42-
binary::BinaryTypeCoercer, lit, or,
42+
binary::BinaryTypeCoercer, lit, or, preimage::PreimageResult,
4343
};
4444
use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult};
4545
use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval};
@@ -51,14 +51,17 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP
5151

5252
use super::inlist_simplifier::ShortenInListSimplifier;
5353
use super::utils::*;
54-
use crate::analyzer::type_coercion::TypeCoercionRewriter;
5554
use crate::simplify_expressions::SimplifyContext;
5655
use crate::simplify_expressions::regex::simplify_regex_expr;
5756
use crate::simplify_expressions::unwrap_cast::{
5857
is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary,
5958
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist,
6059
unwrap_cast_in_comparison_for_binary,
6160
};
61+
use crate::{
62+
analyzer::type_coercion::TypeCoercionRewriter,
63+
simplify_expressions::udf_preimage::rewrite_with_preimage,
64+
};
6265
use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map;
6366
use datafusion_expr_common::casts::try_cast_literal_to_type;
6467
use indexmap::IndexSet;
@@ -1969,12 +1972,85 @@ impl TreeNodeRewriter for Simplifier<'_> {
19691972
}))
19701973
}
19711974

1975+
// =======================================
1976+
// preimage_in_comparison
1977+
// =======================================
1978+
//
1979+
// For case:
1980+
// date_part('YEAR', expr) op literal
1981+
//
1982+
// For details see datafusion_expr::ScalarUDFImpl::preimage
1983+
Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
1984+
use datafusion_expr::Operator::*;
1985+
let is_preimage_op = matches!(
1986+
op,
1987+
Eq | NotEq
1988+
| Lt
1989+
| LtEq
1990+
| Gt
1991+
| GtEq
1992+
| IsDistinctFrom
1993+
| IsNotDistinctFrom
1994+
);
1995+
if !is_preimage_op || is_null(&right) {
1996+
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
1997+
left,
1998+
op,
1999+
right,
2000+
})));
2001+
}
2002+
2003+
if let PreimageResult::Range { interval, expr } =
2004+
get_preimage(left.as_ref(), right.as_ref(), info)?
2005+
{
2006+
rewrite_with_preimage(*interval, op, expr)?
2007+
} else if let Some(swapped) = op.swap() {
2008+
if let PreimageResult::Range { interval, expr } =
2009+
get_preimage(right.as_ref(), left.as_ref(), info)?
2010+
{
2011+
rewrite_with_preimage(*interval, swapped, expr)?
2012+
} else {
2013+
Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
2014+
}
2015+
} else {
2016+
Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
2017+
}
2018+
}
2019+
19722020
// no additional rewrites possible
19732021
expr => Transformed::no(expr),
19742022
})
19752023
}
19762024
}
19772025

2026+
fn get_preimage(
2027+
left_expr: &Expr,
2028+
right_expr: &Expr,
2029+
info: &SimplifyContext,
2030+
) -> Result<PreimageResult> {
2031+
let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else {
2032+
return Ok(PreimageResult::None);
2033+
};
2034+
if !is_literal_or_literal_cast(right_expr) {
2035+
return Ok(PreimageResult::None);
2036+
}
2037+
if func.signature().volatility != Volatility::Immutable {
2038+
return Ok(PreimageResult::None);
2039+
}
2040+
func.preimage(args, right_expr, info)
2041+
}
2042+
2043+
fn is_literal_or_literal_cast(expr: &Expr) -> bool {
2044+
match expr {
2045+
Expr::Literal(_, _) => true,
2046+
Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)),
2047+
Expr::TryCast(TryCast { expr, .. }) => {
2048+
matches!(expr.as_ref(), Expr::Literal(_, _))
2049+
}
2050+
_ => false,
2051+
}
2052+
}
2053+
19782054
fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
19792055
match expr {
19802056
Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)),

datafusion/optimizer/src/simplify_expressions/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ mod regex;
2424
pub mod simplify_exprs;
2525
pub mod simplify_literal;
2626
mod simplify_predicates;
27+
mod udf_preimage;
2728
mod unwrap_cast;
2829
mod utils;
2930

0 commit comments

Comments
 (0)