
Commit 6c98441

parquet reader now uses row group skipping

1 parent 18b33cf commit 6c98441

10 files changed: +241 -20 lines changed

ffi/src/engine_funcs.rs

Lines changed: 1 addition & 0 deletions

@@ -122,6 +122,7 @@ fn read_parquet_file_impl(
         last_modified: file.last_modified,
         size: file.size,
     };
+    // TODO: Plumb the predicate through the FFI?
     let data = parquet_handler.read_parquet_files(&[delta_fm], physical_schema, None)?;
     let res = Box::new(FileReadResultIterator {
         data,

kernel/src/engine/default/parquet.rs

Lines changed: 24 additions & 4 deletions

@@ -14,6 +14,7 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream
 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
 use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array};
 use crate::engine::default::executor::TaskExecutor;
+use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
 use crate::schema::SchemaRef;
 use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};

@@ -47,7 +48,7 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
         &self,
         files: &[FileMeta],
         physical_schema: SchemaRef,
-        _predicate: Option<Expression>,
+        predicate: Option<Expression>,
     ) -> DeltaResult<FileDataReadResultIterator> {
         if files.is_empty() {
             return Ok(Box::new(std::iter::empty()));

@@ -62,10 +63,15 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
         // -> parse to parquet
         // SAFETY: we did is_empty check above, this is ok.
         let file_opener: Box<dyn FileOpener> = match files[0].location.scheme() {
-            "http" | "https" => Box::new(PresignedUrlOpener::new(1024, physical_schema.clone())),
+            "http" | "https" => Box::new(PresignedUrlOpener::new(
+                1024,
+                physical_schema.clone(),
+                predicate,
+            )),
             _ => Box::new(ParquetOpener::new(
                 1024,
                 physical_schema.clone(),
+                predicate,
                 self.store.clone(),
             )),
         };

@@ -83,20 +89,23 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
 struct ParquetOpener {
     // projection: Arc<[usize]>,
     batch_size: usize,
-    limit: Option<usize>,
     table_schema: SchemaRef,
+    predicate: Option<Expression>,
+    limit: Option<usize>,
     store: Arc<DynObjectStore>,
 }

 impl ParquetOpener {
     pub(crate) fn new(
         batch_size: usize,
         table_schema: SchemaRef,
+        predicate: Option<Expression>,
         store: Arc<DynObjectStore>,
     ) -> Self {
         Self {
             batch_size,
             table_schema,
+            predicate,
             limit: None,
             store,
         }

@@ -111,6 +120,7 @@ impl FileOpener for ParquetOpener {
         let batch_size = self.batch_size;
         // let projection = self.projection.clone();
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
         let limit = self.limit;

         Ok(Box::pin(async move {

@@ -133,6 +143,9 @@ impl FileOpener for ParquetOpener {
                 builder = builder.with_projection(mask)
             }

+            if let Some(ref predicate) = predicate {
+                builder = builder.with_row_group_filter(predicate);
+            }
             if let Some(limit) = limit {
                 builder = builder.with_limit(limit)
             }

@@ -153,16 +166,18 @@ impl FileOpener for ParquetOpener {
 /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL
 struct PresignedUrlOpener {
     batch_size: usize,
+    predicate: Option<Expression>,
     limit: Option<usize>,
     table_schema: SchemaRef,
     client: reqwest::Client,
 }

 impl PresignedUrlOpener {
-    pub(crate) fn new(batch_size: usize, schema: SchemaRef) -> Self {
+    pub(crate) fn new(batch_size: usize, schema: SchemaRef, predicate: Option<Expression>) -> Self {
         Self {
             batch_size,
             table_schema: schema,
+            predicate,
             limit: None,
             client: reqwest::Client::new(),
         }

@@ -173,6 +188,7 @@ impl FileOpener for PresignedUrlOpener {
     fn open(&self, file_meta: FileMeta, _range: Option<Range<i64>>) -> DeltaResult<FileOpenFuture> {
         let batch_size = self.batch_size;
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
         let limit = self.limit;
         let client = self.client.clone(); // uses Arc internally according to reqwest docs

@@ -196,6 +212,9 @@ impl FileOpener for PresignedUrlOpener {
                 builder = builder.with_projection(mask)
             }

+            if let Some(ref predicate) = predicate {
+                builder = builder.with_row_group_filter(predicate);
+            }
             if let Some(limit) = limit {
                 builder = builder.with_limit(limit)
             }

@@ -261,6 +280,7 @@ mod tests {
             size: meta.size,
         }];

+        // TODO: add a test that uses predicate skipping?
         let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new()));
         let data: Vec<RecordBatch> = handler
             .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None)
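For context, the predicate an engine would pass through this new code path might look like the following sketch. It is illustrative, not part of the commit: the variant shapes mirror the `Expression` enum the new filter walks (`Column`, `Literal`, `BinaryOperation`), and the exact `BinaryOperator` spelling is an assumption.

// Hypothetical, inside the kernel crate: builds the predicate `x > 10`.
use crate::expressions::{BinaryOperator, Expression, Scalar};

fn example_predicate() -> Expression {
    Expression::BinaryOperation {
        op: BinaryOperator::GreaterThan, // assumed operator name; not shown in this diff
        left: Box::new(Expression::Column("x".to_string())),
        right: Box::new(Expression::Literal(Scalar::Integer(10))),
    }
}

// An engine could then request row group skipping when reading data files:
// handler.read_parquet_files(files, physical_schema, Some(example_predicate()))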

kernel/src/engine/mod.rs

Lines changed: 3 additions & 0 deletions

@@ -11,6 +11,9 @@ pub mod arrow_expression;
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod arrow_data;

+#[cfg(any(feature = "default-engine", feature = "sync-engine"))]
+pub mod parquet_row_group_skipping;
+
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod parquet_stats_skipping;

kernel/src/engine/parquet_row_group_skipping.rs

Lines changed: 177 additions & 0 deletions

@@ -0,0 +1,177 @@
+//! An implementation of parquet row group skipping using data skipping predicates over footer stats.
+use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter};
+use crate::expressions::{Expression, Scalar};
+use crate::schema::{DataType, PrimitiveType};
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::file::metadata::RowGroupMetaData;
+use parquet::file::statistics::Statistics;
+use parquet::schema::types::{ColumnDescPtr, ColumnPath};
+use std::collections::{HashMap, HashSet};
+
+/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability.
+pub(crate) trait ParquetRowGroupSkipping {
+    /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose
+    /// stats prove that none of the group's rows can satisfy the given `predicate`.
+    fn with_row_group_filter(self, predicate: &Expression) -> Self;
+}
+impl<T> ParquetRowGroupSkipping for ArrowReaderBuilder<T> {
+    fn with_row_group_filter(self, predicate: &Expression) -> Self {
+        let indices = self
+            .metadata()
+            .row_groups()
+            .iter()
+            .enumerate()
+            .filter_map(|(index, row_group)| {
+                RowGroupFilter::apply(predicate, row_group).then_some(index)
+            })
+            .collect();
+        self.with_row_groups(indices)
+    }
+}
+
+/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet
+/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its
+/// corresponding field index, for O(1) stats lookups.
+struct RowGroupFilter<'a> {
+    row_group: &'a RowGroupMetaData,
+    field_indices: HashMap<ColumnPath, usize>,
+}
+
+impl<'a> RowGroupFilter<'a> {
+    /// Applies a filtering expression to a row group. A return value of `false` means the row
+    /// group should be skipped.
+    fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> bool {
+        let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter);
+        let result = Self {
+            row_group,
+            field_indices,
+        }
+        .apply_sql_where(filter);
+        !matches!(result, Some(false))
+    }
+
+    fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> {
+        let field_index = self.field_indices.get(col)?;
+        self.row_group.column(*field_index).statistics()
+    }
+}
+
+impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> {
+    // Extracts a stat value, converting from its physical type to the requested logical type.
+    //
+    // NOTE: This code is highly redundant with [`get_max_stat_value`] below, but parquet
+    // ValueStatistics<T> requires T to impl a private trait, so we can't factor out any kind of
+    // helper method. And macros are hard enough to read that it's not worth defining one.
+    fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
+        use PrimitiveType::*;
+        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) {
+            (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
+            (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
+            (String, _) => None?,
+            (Long, Statistics::Int64(s)) => s.min_opt()?.into(),
+            (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(),
+            (Long, _) => None?,
+            (Integer, Statistics::Int32(s)) => s.min_opt()?.into(),
+            (Integer, _) => None?,
+            (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(),
+            (Short, _) => None?,
+            (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(),
+            (Byte, _) => None?,
+            (Float, Statistics::Float(s)) => s.min_opt()?.into(),
+            (Float, _) => None?,
+            (Double, Statistics::Double(s)) => s.min_opt()?.into(),
+            (Double, _) => None?,
+            (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(),
+            (Boolean, _) => None?,
+            (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(),
+            (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(),
+            (Binary, _) => None?,
+            (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?),
+            (Date, _) => None?,
+            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?),
+            (Timestamp, _) => None?, // TODO: Int96 timestamps
+            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?),
+            (TimestampNtz, _) => None?, // TODO: Int96 timestamps
+            (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray)
+        };
+        Some(value)
+    }
+
+    fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
+        use PrimitiveType::*;
+        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) {
+            (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
+            (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
+            (String, _) => None?,
+            (Long, Statistics::Int64(s)) => s.max_opt()?.into(),
+            (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(),
+            (Long, _) => None?,
+            (Integer, Statistics::Int32(s)) => s.max_opt()?.into(),
+            (Integer, _) => None?,
+            (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(),
+            (Short, _) => None?,
+            (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(),
+            (Byte, _) => None?,
+            (Float, Statistics::Float(s)) => s.max_opt()?.into(),
+            (Float, _) => None?,
+            (Double, Statistics::Double(s)) => s.max_opt()?.into(),
+            (Double, _) => None?,
+            (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(),
+            (Boolean, _) => None?,
+            (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(),
+            (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(),
+            (Binary, _) => None?,
+            (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?),
+            (Date, _) => None?,
+            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?),
+            (Timestamp, _) => None?, // TODO: Int96 timestamps
+            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?),
+            (TimestampNtz, _) => None?, // TODO: Int96 timestamps
+            (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray)
+        };
+        Some(value)
+    }
+
+    // Parquet nullcount stats always have the same type (u64), so we can directly return the value
+    // instead of wrapping it in a Scalar. We can safely cast it from u64 to i64, because the
+    // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64.
+    fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option<i64> {
+        Some(self.get_stats(col)?.null_count_opt()? as i64)
+    }
+
+    fn get_rowcount_stat_value(&self) -> i64 {
+        self.row_group.num_rows()
+    }
+}
+
+/// Given a filter expression of interest and a set of parquet column descriptors, build a column ->
+/// index mapping for columns the expression references. This ensures O(1) lookup times, for an
+/// overall O(n) cost to evaluate an expression tree with n nodes.
+pub(crate) fn compute_field_indices(
+    fields: &[ColumnDescPtr],
+    expression: &Expression,
+) -> HashMap<ColumnPath, usize> {
+    fn do_recurse(expression: &Expression, cols: &mut HashSet<ColumnPath>) {
+        use Expression::*;
+        let mut recurse = |expr| do_recurse(expr, cols); // less arg passing below
+        match expression {
+            Literal(_) => {}
+            Column(name) => drop(cols.insert(col_name_to_path(name))),
+            Struct(fields) => fields.iter().for_each(recurse),
+            UnaryOperation { expr, .. } => recurse(expr),
+            BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)),
+            VariadicOperation { exprs, .. } => exprs.iter().for_each(recurse),
+        }
+    }
+
+    // Build up a set of requested column paths, then take each found path as the corresponding map
+    // key (avoids unnecessary cloning).
+    //
+    // NOTE: If a requested column was not available, it is silently ignored.
+    let mut requested_columns = HashSet::new();
+    do_recurse(expression, &mut requested_columns);
+    fields
+        .iter()
+        .enumerate()
+        .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i)))
+        .collect()
+}
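
To make the skip/keep semantics concrete: `RowGroupFilter::apply` keeps a row group unless stats evaluation returns a definitive `false`, and missing stats always keep the group. Below is a minimal self-contained sketch of that three-valued decision for a predicate like `x > 10` (illustrative names, not the crate's API).

// Some(false) = stats prove no row can satisfy `x > literal`; Some(true) = some
// row might match; None = stats unavailable, so nothing can be proven.
#[derive(Clone, Copy)]
struct ColumnStats {
    max: i64,
}

fn eval_gt(stats: Option<ColumnStats>, literal: i64) -> Option<bool> {
    Some(stats?.max > literal)
}

// Mirrors `!matches!(result, Some(false))` above: skip only on a proven false.
fn keep_row_group(stats: Option<ColumnStats>, literal: i64) -> bool {
    !matches!(eval_gt(stats, literal), Some(false))
}

fn main() {
    assert!(!keep_row_group(Some(ColumnStats { max: 5 }), 10)); // skipped: max(x) <= 10
    assert!(keep_row_group(Some(ColumnStats { max: 50 }), 10)); // kept: some row might match
    assert!(keep_row_group(None, 10)); // no stats: must keep the group
}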

kernel/src/engine/parquet_stats_skipping.rs

Lines changed: 0 additions & 1 deletion

@@ -12,7 +12,6 @@ use std::cmp::Ordering;
 /// a SET of rows -- has different semantics than row-based predicate evaluation. The provided
 /// methods of this class convert various supported expressions into data skipping predicates, and
 /// then return the result of evaluating the translated filter.
-#[allow(unused)] // temporary, until we wire up the parquet reader to actually use this
 pub(crate) trait ParquetStatsSkippingFilter {
     /// Retrieves the minimum value of a column, if it exists and has the requested type.
     fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar>;

kernel/src/engine/sync/parquet.rs

Lines changed: 11 additions & 2 deletions

@@ -6,12 +6,17 @@ use url::Url;

 use crate::engine::arrow_data::ArrowEngineData;
 use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array};
+use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
 use crate::schema::SchemaRef;
 use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};

 pub(crate) struct SyncParquetHandler;

-fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult<ArrowEngineData> {
+fn try_create_from_parquet(
+    schema: SchemaRef,
+    location: Url,
+    predicate: Option<&Expression>,
+) -> DeltaResult<ArrowEngineData> {
     let file = File::open(
         location
             .to_file_path()

@@ -25,6 +30,9 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult<Arro
     {
         builder = builder.with_projection(mask);
     }
+    if let Some(predicate) = predicate {
+        builder = builder.with_row_group_filter(predicate);
+    }
     let mut reader = builder.build()?;
     let data = reader
         .next()

@@ -46,7 +54,8 @@ impl ParquetHandler for SyncParquetHandler {
         }
         let locations: Vec<_> = files.iter().map(|file| file.location.clone()).collect();
         Ok(Box::new(locations.into_iter().map(move |location| {
-            try_create_from_parquet(schema.clone(), location).map(|d| Box::new(d) as _)
+            try_create_from_parquet(schema.clone(), location, predicate.as_ref())
+                .map(|d| Box::new(d) as _)
         })))
     }
 }
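
Both the sync and default handlers drive the same parquet-crate machinery. As a standalone illustration (assumed file path and data; not part of this commit), the sketch below does by hand what `with_row_group_filter` automates: read per-row-group stats from the footer, then hand the surviving indices to `with_row_groups`.

use std::fs::File;
use std::sync::Arc;

use arrow_array::{Int64Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
use parquet::file::statistics::Statistics;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write six rows as three row groups of two rows each.
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![1, 2, 11, 12, 21, 22]))],
    )?;
    let props = WriterProperties::builder().set_max_row_group_size(2).build();
    let mut writer =
        ArrowWriter::try_new(File::create("/tmp/demo.parquet")?, schema, Some(props))?;
    writer.write(&batch)?;
    writer.close()?;

    // Keep only row groups whose max(x) stat does not rule out `x > 10`.
    let builder = ParquetRecordBatchReaderBuilder::try_new(File::open("/tmp/demo.parquet")?)?;
    let keep: Vec<usize> = builder
        .metadata()
        .row_groups()
        .iter()
        .enumerate()
        .filter_map(|(i, rg)| match rg.column(0).statistics()? {
            Statistics::Int64(s) => (*s.max_opt()? > 10).then_some(i),
            _ => Some(i), // unexpected stats type: keep the group
        })
        .collect();
    let reader = builder.with_row_groups(keep).build()?;
    let rows: usize = reader
        .map(|batch| batch.map(|b| b.num_rows()))
        .sum::<Result<usize, _>>()?;
    assert_eq!(rows, 4); // the [1, 2] row group was pruned
    Ok(())
}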

kernel/src/lib.rs

Lines changed: 0 additions & 2 deletions

@@ -193,8 +193,6 @@ pub trait JsonHandler: Send + Sync {
         &self,
         files: &[FileMeta],
         physical_schema: SchemaRef,
-        // TODO: This should really be an Option<Arc<Expression>>, because otherwise we have to
-        // clone the (potentially large) expression every time we call this function.
         predicate: Option<Expression>,
     ) -> DeltaResult<FileDataReadResultIterator>;
 }

kernel/src/scan/mod.rs

Lines changed: 4 additions & 2 deletions

@@ -199,11 +199,13 @@ impl Scan {
         let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?;
         let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?;

+        // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping
+        // when ~every checkpoint file will contain the adds and removes we are looking for.
         let log_iter = self.snapshot.log_segment.replay(
             engine,
             commit_read_schema,
             checkpoint_read_schema,
-            self.predicate.clone(),
+            None,
         )?;

         Ok(scan_action_iter(

@@ -285,7 +287,7 @@ impl Scan {
         let read_result_iter = engine.get_parquet_handler().read_parquet_files(
             &[meta],
             global_state.read_schema.clone(),
-            None,
+            self.predicate().clone(),
         )?;
         let gs = global_state.clone(); // Arc clone
         Ok(read_result_iter.into_iter().map(move |read_result| {
