Commit 42afc06

Implement row group skipping for the default engine parquet readers (#362)
Previous PR #357 implemented the logic of stats-based skipping for a parquet reader, but in an abstract form that doesn't actually depend on parquet footers. With that in place, we can now wire up the kernel's default parquet readers to use row group skipping. Also fixes #380.
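
For orientation, the split works roughly like this: PR #357 defined an abstract stats-skipping trait, and this commit backs its required methods with parquet footer data. A sketch of the trait surface as it appears in the diff below (not a complete listing):

    pub(crate) trait ParquetStatsSkippingFilter {
        // Per-column stats, converted to kernel scalars of the requested logical type.
        fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar>;
        fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar>;
        fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option<i64>;
        fn get_rowcount_stat_value(&self) -> i64;
        // Provided logic such as apply_sql_where() evaluates a predicate against these stats.
    }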
1 parent 1c4b9ce commit 42afc06

9 files changed: +793, -29 lines
kernel/src/engine/default/parquet.rs

Lines changed: 13 additions & 4 deletions
@@ -14,6 +14,7 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream
 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
 use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array};
 use crate::engine::default::executor::TaskExecutor;
+use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
 use crate::schema::SchemaRef;
 use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};
 
@@ -89,7 +90,7 @@ struct ParquetOpener {
     // projection: Arc<[usize]>,
     batch_size: usize,
     table_schema: SchemaRef,
-    _predicate: Option<Expression>,
+    predicate: Option<Expression>,
     limit: Option<usize>,
     store: Arc<DynObjectStore>,
 }
@@ -104,7 +105,7 @@ impl ParquetOpener {
         Self {
             batch_size,
             table_schema,
-            _predicate: predicate,
+            predicate,
             limit: None,
             store,
         }
@@ -119,6 +120,7 @@ impl FileOpener for ParquetOpener {
         let batch_size = self.batch_size;
         // let projection = self.projection.clone();
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
        let limit = self.limit;
 
         Ok(Box::pin(async move {
@@ -141,6 +143,9 @@
                 builder = builder.with_projection(mask)
             }
 
+            if let Some(ref predicate) = predicate {
+                builder = builder.with_row_group_filter(predicate);
+            }
             if let Some(limit) = limit {
                 builder = builder.with_limit(limit)
             }
@@ -161,7 +166,7 @@
 /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL
 struct PresignedUrlOpener {
     batch_size: usize,
-    _predicate: Option<Expression>,
+    predicate: Option<Expression>,
     limit: Option<usize>,
     table_schema: SchemaRef,
     client: reqwest::Client,
@@ -172,7 +177,7 @@ impl PresignedUrlOpener {
         Self {
             batch_size,
             table_schema: schema,
-            _predicate: predicate,
+            predicate,
             limit: None,
             client: reqwest::Client::new(),
         }
@@ -183,6 +188,7 @@ impl FileOpener for PresignedUrlOpener {
     fn open(&self, file_meta: FileMeta, _range: Option<Range<i64>>) -> DeltaResult<FileOpenFuture> {
         let batch_size = self.batch_size;
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
         let limit = self.limit;
         let client = self.client.clone(); // uses Arc internally according to reqwest docs
 
@@ -206,6 +212,9 @@
             builder = builder.with_projection(mask)
         }
 
+        if let Some(ref predicate) = predicate {
+            builder = builder.with_row_group_filter(predicate);
+        }
         if let Some(limit) = limit {
             builder = builder.with_limit(limit)
         }
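
Both openers now follow the same builder pipeline: project, filter row groups, limit. Below is a simplified in-crate sketch using the synchronous reader (the actual openers use the async ParquetRecordBatchStreamBuilder and the kernel's reordering mask; open_with_skipping is a hypothetical helper, not part of the commit):

    use std::fs::File;
    use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
    use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
    use crate::{DeltaResult, Expression};

    fn open_with_skipping(
        file: File,
        predicate: Option<&Expression>,
        limit: Option<usize>,
    ) -> DeltaResult<()> {
        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
        if let Some(predicate) = predicate {
            // Drops every row group whose footer stats disprove the predicate.
            builder = builder.with_row_group_filter(predicate);
        }
        if let Some(limit) = limit {
            builder = builder.with_limit(limit);
        }
        for batch in builder.build()? {
            let _batch = batch?; // only batches from surviving row groups arrive here
        }
        Ok(())
    }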

kernel/src/engine/mod.rs

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,9 @@ pub mod arrow_expression;
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod arrow_data;
 
+#[cfg(any(feature = "default-engine", feature = "sync-engine"))]
+pub mod parquet_row_group_skipping;
+
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod parquet_stats_skipping;
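
The new module is gated behind the same engine features as its sibling parquet_stats_skipping, so it compiles with either the default or the sync engine enabled, e.g. (assuming the kernel crate's package name, delta_kernel):

    cargo test -p delta_kernel --features default-engine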

kernel/src/engine/parquet_row_group_skipping.rs

Lines changed: 244 additions & 0 deletions
@@ -0,0 +1,244 @@
+//! An implementation of parquet row group skipping using data skipping predicates over footer stats.
+use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter};
+use crate::expressions::{Expression, Scalar};
+use crate::schema::{DataType, PrimitiveType};
+use chrono::{DateTime, Days};
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::file::metadata::RowGroupMetaData;
+use parquet::file::statistics::Statistics;
+use parquet::schema::types::{ColumnDescPtr, ColumnPath};
+use std::collections::{HashMap, HashSet};
+use tracing::debug;
+
+#[cfg(test)]
+mod tests;
+
+/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability.
+pub(crate) trait ParquetRowGroupSkipping {
+    /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose
+    /// stats prove that none of the group's rows can satisfy the given `predicate`.
+    fn with_row_group_filter(self, predicate: &Expression) -> Self;
+}
+impl<T> ParquetRowGroupSkipping for ArrowReaderBuilder<T> {
+    fn with_row_group_filter(self, predicate: &Expression) -> Self {
+        let indices = self
+            .metadata()
+            .row_groups()
+            .iter()
+            .enumerate()
+            .filter_map(|(index, row_group)| {
+                // If the group survives the filter, return Some(index) so filter_map keeps it.
+                RowGroupFilter::apply(row_group, predicate).then_some(index)
+            })
+            .collect();
+        debug!("with_row_group_filter({predicate:#?}) = {indices:?})");
+        self.with_row_groups(indices)
+    }
+}
+
+/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet
+/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its
+/// corresponding field index, for O(1) stats lookups.
+struct RowGroupFilter<'a> {
+    row_group: &'a RowGroupMetaData,
+    field_indices: HashMap<ColumnPath, usize>,
+}
+
+impl<'a> RowGroupFilter<'a> {
+    /// Creates a new row group filter for the given row group and predicate.
+    fn new(row_group: &'a RowGroupMetaData, predicate: &Expression) -> Self {
+        Self {
+            row_group,
+            field_indices: compute_field_indices(row_group.schema_descr().columns(), predicate),
+        }
+    }
+
+    /// Applies a filtering predicate to a row group. Return value false means to skip it.
+    fn apply(row_group: &'a RowGroupMetaData, predicate: &Expression) -> bool {
+        RowGroupFilter::new(row_group, predicate).apply_sql_where(predicate) != Some(false)
+    }
+
+    /// Returns `None` if the column doesn't exist and `Some(None)` if the column has no stats.
+    fn get_stats(&self, col: &ColumnPath) -> Option<Option<&Statistics>> {
+        self.field_indices
+            .get(col)
+            .map(|&i| self.row_group.column(i).statistics())
+    }
+
+    fn decimal_from_bytes(bytes: Option<&[u8]>, precision: u8, scale: u8) -> Option<Scalar> {
+        // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes.
+        let bytes = bytes.filter(|b| b.len() <= 16)?;
+        let mut bytes = Vec::from(bytes);
+        bytes.reverse();
+        bytes.resize(16, 0u8);
+        let bytes: [u8; 16] = bytes.try_into().ok()?;
+        Some(Scalar::Decimal(
+            i128::from_le_bytes(bytes),
+            precision,
+            scale,
+        ))
+    }
+
+    fn timestamp_from_date(days: Option<&i32>) -> Option<Scalar> {
+        let days = u64::try_from(*days?).ok()?;
+        let timestamp = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?;
+        let timestamp = timestamp.signed_duration_since(DateTime::UNIX_EPOCH);
+        Some(Scalar::TimestampNtz(timestamp.num_microseconds()?))
+    }
+}
+
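To make the two conversions concrete, here is a standalone sketch of the same byte and day arithmetic (plain std plus chrono, with the Scalar wrappers elided):

    use chrono::{DateTime, Days};

    fn main() {
        // decimal_from_bytes: parquet stores decimal stats as big-endian bytes.
        // 0x04D2 = 1234, so at scale 2 this is the decimal 12.34.
        let mut bytes = vec![0x04u8, 0xD2];
        bytes.reverse();       // big-endian -> little-endian
        bytes.resize(16, 0u8); // NOTE: 0-padding is only correct for non-negative values;
                               // a negative two's-complement value needs 0xFF sign extension.
        let unscaled = i128::from_le_bytes(bytes.try_into().unwrap());
        assert_eq!(unscaled, 1234);

        // timestamp_from_date: a date stat counts days since the Unix epoch; the helper
        // widens it to the microsecond precision that TimestampNtz comparisons expect.
        let ts = DateTime::UNIX_EPOCH.checked_add_days(Days::new(19_000)).unwrap();
        let micros = ts
            .signed_duration_since(DateTime::UNIX_EPOCH)
            .num_microseconds()
            .unwrap();
        assert_eq!(micros, 19_000i64 * 86_400 * 1_000_000);
    }
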
+impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> {
+    // Extracts a stat value, converting from its physical type to the requested logical type.
+    //
+    // NOTE: This code is highly redundant with [`get_max_stat_value`] below, but parquet
+    // ValueStatistics<T> requires T to impl a private trait, so we can't factor out any kind of
+    // helper method. And macros are hard enough to read that it's not worth defining one.
+    fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
+        use PrimitiveType::*;
+        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) {
+            (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
+            (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
+            (String, _) => return None,
+            (Long, Statistics::Int64(s)) => s.min_opt()?.into(),
+            (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(),
+            (Long, _) => return None,
+            (Integer, Statistics::Int32(s)) => s.min_opt()?.into(),
+            (Integer, _) => return None,
+            (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(),
+            (Short, _) => return None,
+            (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(),
+            (Byte, _) => return None,
+            (Float, Statistics::Float(s)) => s.min_opt()?.into(),
+            (Float, _) => return None,
+            (Double, Statistics::Double(s)) => s.min_opt()?.into(),
+            (Double, Statistics::Float(s)) => (*s.min_opt()? as f64).into(),
+            (Double, _) => return None,
+            (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(),
+            (Boolean, _) => return None,
+            (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(),
+            (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(),
+            (Binary, _) => return None,
+            (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?),
+            (Date, _) => return None,
+            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?),
+            (Timestamp, _) => return None, // TODO: Int96 timestamps
+            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?),
+            (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.min_opt())?,
+            (TimestampNtz, _) => return None, // TODO: Int96 timestamps
+            (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s),
+            (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s),
+            (Decimal(p, s), Statistics::FixedLenByteArray(b)) => {
+                Self::decimal_from_bytes(b.min_bytes_opt(), *p, *s)?
+            }
+            (Decimal(..), _) => return None,
+        };
+        Some(value)
+    }
+
+    fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
+        use PrimitiveType::*;
+        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) {
+            (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
+            (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
+            (String, _) => return None,
+            (Long, Statistics::Int64(s)) => s.max_opt()?.into(),
+            (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(),
+            (Long, _) => return None,
+            (Integer, Statistics::Int32(s)) => s.max_opt()?.into(),
+            (Integer, _) => return None,
+            (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(),
+            (Short, _) => return None,
+            (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(),
+            (Byte, _) => return None,
+            (Float, Statistics::Float(s)) => s.max_opt()?.into(),
+            (Float, _) => return None,
+            (Double, Statistics::Double(s)) => s.max_opt()?.into(),
+            (Double, Statistics::Float(s)) => (*s.max_opt()? as f64).into(),
+            (Double, _) => return None,
+            (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(),
+            (Boolean, _) => return None,
+            (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(),
+            (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(),
+            (Binary, _) => return None,
+            (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?),
+            (Date, _) => return None,
+            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?),
+            (Timestamp, _) => return None, // TODO: Int96 timestamps
+            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?),
+            (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.max_opt())?,
+            (TimestampNtz, _) => return None, // TODO: Int96 timestamps
+            (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s),
+            (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s),
+            (Decimal(p, s), Statistics::FixedLenByteArray(b)) => {
+                Self::decimal_from_bytes(b.max_bytes_opt(), *p, *s)?
+            }
+            (Decimal(..), _) => return None,
+        };
+        Some(value)
+    }
+
+    fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option<i64> {
+        // NOTE: Stats for any given column are optional, which may produce a NULL nullcount. But if
+        // the column itself is missing, then we know all values are implied to be NULL.
+        let Some(stats) = self.get_stats(col) else {
+            return Some(self.get_rowcount_stat_value());
+        };
+
+        // WARNING: [`Statistics::null_count_opt`] returns Some(0) when the underlying stat is
+        // missing, causing an IS NULL predicate to wrongly skip the file if it contains any NULL
+        // values. Manually drill into each arm's [`ValueStatistics`] for the stat's true value.
+        let nullcount = match stats? {
+            Statistics::Boolean(s) => s.null_count_opt(),
+            Statistics::Int32(s) => s.null_count_opt(),
+            Statistics::Int64(s) => s.null_count_opt(),
+            Statistics::Int96(s) => s.null_count_opt(),
+            Statistics::Float(s) => s.null_count_opt(),
+            Statistics::Double(s) => s.null_count_opt(),
+            Statistics::ByteArray(s) => s.null_count_opt(),
+            Statistics::FixedLenByteArray(s) => s.null_count_opt(),
+        };
+
+        // Parquet nullcount stats are always u64, so we can directly return the value instead of
+        // wrapping it in a Scalar. We can safely cast it from u64 to i64 because the nullcount can
+        // never be larger than the rowcount and the parquet rowcount stat is i64.
+        Some(nullcount? as i64)
+    }
+
+    fn get_rowcount_stat_value(&self) -> i64 {
+        self.row_group.num_rows()
+    }
+}
+
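The missing-column convention matters for NULL checks: a column absent from the file reports nullcount == rowcount, so an IS NULL predicate on it stays satisfiable, while ordinary comparisons see NULL min/max stats and fail under SQL WHERE semantics. A sketch of two predicates that exercise this, assuming the kernel's Expression builder helpers (is_null, eq):

    // `missing_col IS NULL` cannot be disproven for a column the file lacks
    // (all its values are implicitly NULL), so the row group is kept.
    let keep = Expression::column("missing_col").is_null();

    // `missing_col = 42` compares against NULL min/max stats, so the predicate is
    // unsatisfiable and the row group can be skipped.
    let skip = Expression::column("missing_col").eq(Expression::literal(42i64));
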
+/// Given a filter expression of interest and a set of parquet column descriptors, build a column ->
+/// index mapping for columns the expression references. This ensures O(1) lookup times, for an
+/// overall O(n) cost to evaluate an expression tree with n nodes.
+pub(crate) fn compute_field_indices(
+    fields: &[ColumnDescPtr],
+    expression: &Expression,
+) -> HashMap<ColumnPath, usize> {
+    fn do_recurse(expression: &Expression, cols: &mut HashSet<ColumnPath>) {
+        use Expression::*;
+        let mut recurse = |expr| do_recurse(expr, cols); // simplifies the call sites below
+        match expression {
+            Literal(_) => {}
+            Column(name) => cols.extend([col_name_to_path(name)]), // returns `()`, unlike `insert`
+            Struct(fields) => fields.iter().for_each(recurse),
+            UnaryOperation { expr, .. } => recurse(expr),
+            BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)),
+            VariadicOperation { exprs, .. } => exprs.iter().for_each(recurse),
+        }
+    }
+
+    // Build up a set of requested column paths, then take each found path as the corresponding map
+    // key (avoids unnecessary cloning).
+    //
+    // NOTE: If a requested column was not available, it is silently ignored. These missing columns
+    // are implied all-null, so we will infer their min/max stats as NULL and nullcount == rowcount.
+    let mut requested_columns = HashSet::new();
+    do_recurse(expression, &mut requested_columns);
+    fields
+        .iter()
+        .enumerate()
+        .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i)))
+        .collect()
+}
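
To see the shape of the mapping, a hedged in-crate sketch (compute_field_indices is pub(crate); the Expression helpers lt/and/eq are assumed from the kernel's expression builder, and `row_group` is a RowGroupMetaData already in scope):

    // Suppose the file's leaf columns, in schema order, are ["b"], ["a","x"], ["a","y"].
    // For `a.x < 10 AND b = 'hi'`, do_recurse collects the set {["a","x"], ["b"]}, and the
    // result maps ColumnPath(["b"]) => 0 and ColumnPath(["a","x"]) => 1. The unreferenced
    // ["a","y"] never appears, and a referenced column missing from the file is silently
    // dropped, to be treated as all-NULL by the stats getters above.
    let predicate = Expression::column("a.x")
        .lt(Expression::literal(10i64))
        .and(Expression::column("b").eq(Expression::literal("hi")));
    let field_indices = compute_field_indices(row_group.schema_descr().columns(), &predicate);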
