|
| 1 | +//! An implementation of parquet row group skipping using data skipping predicates over footer stats. |
| 2 | +use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter}; |
| 3 | +use crate::expressions::{Expression, Scalar}; |
| 4 | +use crate::schema::{DataType, PrimitiveType}; |
| 5 | +use chrono::{DateTime, Days}; |
| 6 | +use parquet::arrow::arrow_reader::ArrowReaderBuilder; |
| 7 | +use parquet::file::metadata::RowGroupMetaData; |
| 8 | +use parquet::file::statistics::Statistics; |
| 9 | +use parquet::schema::types::{ColumnDescPtr, ColumnPath}; |
| 10 | +use std::collections::{HashMap, HashSet}; |
| 11 | +use tracing::debug; |
| 12 | + |
| 13 | +#[cfg(test)] |
| 14 | +mod tests; |
| 15 | + |
/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability.
///
/// Implemented as an extension trait (rather than a free function) so call sites can chain it
/// fluently into the builder pipeline: `builder.with_row_group_filter(pred).build()`.
pub(crate) trait ParquetRowGroupSkipping {
    /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose
    /// stats prove that none of the group's rows can satisfy the given `predicate`.
    ///
    /// Consumes and returns the builder (standard builder-pattern ownership flow).
    fn with_row_group_filter(self, predicate: &Expression) -> Self;
}
| 22 | +impl<T> ParquetRowGroupSkipping for ArrowReaderBuilder<T> { |
| 23 | + fn with_row_group_filter(self, predicate: &Expression) -> Self { |
| 24 | + let indices = self |
| 25 | + .metadata() |
| 26 | + .row_groups() |
| 27 | + .iter() |
| 28 | + .enumerate() |
| 29 | + .filter_map(|(index, row_group)| { |
| 30 | + // If the group survives the filter, return Some(index) so filter_map keeps it. |
| 31 | + RowGroupFilter::apply(row_group, predicate).then_some(index) |
| 32 | + }) |
| 33 | + .collect(); |
| 34 | + debug!("with_row_group_filter({predicate:#?}) = {indices:?})"); |
| 35 | + self.with_row_groups(indices) |
| 36 | + } |
| 37 | +} |
| 38 | + |
/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet
/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its
/// corresponding field index, for O(1) stats lookups.
struct RowGroupFilter<'a> {
    // The row group whose column statistics back this filter's stat lookups.
    row_group: &'a RowGroupMetaData,
    // Maps each column path referenced by the predicate to its index in the row group's
    // schema, built once by `compute_field_indices`. Columns the predicate does not
    // reference are intentionally absent.
    field_indices: HashMap<ColumnPath, usize>,
}
| 46 | + |
| 47 | +impl<'a> RowGroupFilter<'a> { |
| 48 | + /// Creates a new row group filter for the given row group and predicate. |
| 49 | + fn new(row_group: &'a RowGroupMetaData, predicate: &Expression) -> Self { |
| 50 | + Self { |
| 51 | + row_group, |
| 52 | + field_indices: compute_field_indices(row_group.schema_descr().columns(), predicate), |
| 53 | + } |
| 54 | + } |
| 55 | + |
| 56 | + /// Applies a filtering predicate to a row group. Return value false means to skip it. |
| 57 | + fn apply(row_group: &'a RowGroupMetaData, predicate: &Expression) -> bool { |
| 58 | + RowGroupFilter::new(row_group, predicate).apply_sql_where(predicate) != Some(false) |
| 59 | + } |
| 60 | + |
| 61 | + /// Returns `None` if the column doesn't exist and `Some(None)` if the column has no stats. |
| 62 | + fn get_stats(&self, col: &ColumnPath) -> Option<Option<&Statistics>> { |
| 63 | + self.field_indices |
| 64 | + .get(col) |
| 65 | + .map(|&i| self.row_group.column(i).statistics()) |
| 66 | + } |
| 67 | + |
| 68 | + fn decimal_from_bytes(bytes: Option<&[u8]>, precision: u8, scale: u8) -> Option<Scalar> { |
| 69 | + // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes. |
| 70 | + let bytes = bytes.filter(|b| b.len() <= 16)?; |
| 71 | + let mut bytes = Vec::from(bytes); |
| 72 | + bytes.reverse(); |
| 73 | + bytes.resize(16, 0u8); |
| 74 | + let bytes: [u8; 16] = bytes.try_into().ok()?; |
| 75 | + Some(Scalar::Decimal( |
| 76 | + i128::from_le_bytes(bytes), |
| 77 | + precision, |
| 78 | + scale, |
| 79 | + )) |
| 80 | + } |
| 81 | + |
| 82 | + fn timestamp_from_date(days: Option<&i32>) -> Option<Scalar> { |
| 83 | + let days = u64::try_from(*days?).ok()?; |
| 84 | + let timestamp = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?; |
| 85 | + let timestamp = timestamp.signed_duration_since(DateTime::UNIX_EPOCH); |
| 86 | + Some(Scalar::TimestampNtz(timestamp.num_microseconds()?)) |
| 87 | + } |
| 88 | +} |
| 89 | + |
impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> {
    // Extracts a stat value, converting from its physical type to the requested logical type.
    //
    // NOTE: This code is highly redundant with [`get_max_stat_value`] below, but parquet
    // ValueStatistics<T> requires T to impl a private trait, so we can't factor out any kind of
    // helper method. And macros are hard enough to read that it's not worth defining one.
    //
    // Any (logical type, physical type) pairing not matched below falls through to `return None`
    // ("stat unavailable"), which the skipping logic must treat as inconclusive, never as a
    // reason to skip.
    fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
        use PrimitiveType::*;
        // `get_stats(col)??` peels both layers: missing column and missing stats each yield None.
        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) {
            (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
            (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(),
            (String, _) => return None,
            // Long accepts Int32 stats too: widening i32 -> i64 is lossless.
            (Long, Statistics::Int64(s)) => s.min_opt()?.into(),
            (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(),
            (Long, _) => return None,
            (Integer, Statistics::Int32(s)) => s.min_opt()?.into(),
            (Integer, _) => return None,
            // Short/Byte are physically stored as Int32; narrow back to the logical width.
            (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(),
            (Short, _) => return None,
            (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(),
            (Byte, _) => return None,
            (Float, Statistics::Float(s)) => s.min_opt()?.into(),
            (Float, _) => return None,
            // Double accepts Float stats too: widening f32 -> f64 is lossless.
            (Double, Statistics::Double(s)) => s.min_opt()?.into(),
            (Double, Statistics::Float(s)) => (*s.min_opt()? as f64).into(),
            (Double, _) => return None,
            (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(),
            (Boolean, _) => return None,
            (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(),
            (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(),
            (Binary, _) => return None,
            (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?),
            (Date, _) => return None,
            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?),
            (Timestamp, _) => return None, // TODO: Int96 timestamps
            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?),
            // An Int32-backed column read as TimestampNtz is a Date32: midnight of that day.
            (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.min_opt())?,
            (TimestampNtz, _) => return None, // TODO: Int96 timestamps
            (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s),
            (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s),
            (Decimal(p, s), Statistics::FixedLenByteArray(b)) => {
                Self::decimal_from_bytes(b.min_bytes_opt(), *p, *s)?
            }
            (Decimal(..), _) => return None,
        };
        Some(value)
    }

    // Mirror image of `get_min_stat_value` above, reading the max stat instead of the min;
    // every conversion rule is identical.
    fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
        use PrimitiveType::*;
        let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) {
            (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
            (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(),
            (String, _) => return None,
            (Long, Statistics::Int64(s)) => s.max_opt()?.into(),
            (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(),
            (Long, _) => return None,
            (Integer, Statistics::Int32(s)) => s.max_opt()?.into(),
            (Integer, _) => return None,
            (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(),
            (Short, _) => return None,
            (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(),
            (Byte, _) => return None,
            (Float, Statistics::Float(s)) => s.max_opt()?.into(),
            (Float, _) => return None,
            (Double, Statistics::Double(s)) => s.max_opt()?.into(),
            (Double, Statistics::Float(s)) => (*s.max_opt()? as f64).into(),
            (Double, _) => return None,
            (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(),
            (Boolean, _) => return None,
            (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(),
            (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(),
            (Binary, _) => return None,
            (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?),
            (Date, _) => return None,
            (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?),
            (Timestamp, _) => return None, // TODO: Int96 timestamps
            (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?),
            (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.max_opt())?,
            (TimestampNtz, _) => return None, // TODO: Int96 timestamps
            (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s),
            (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s),
            (Decimal(p, s), Statistics::FixedLenByteArray(b)) => {
                Self::decimal_from_bytes(b.max_bytes_opt(), *p, *s)?
            }
            (Decimal(..), _) => return None,
        };
        Some(value)
    }

    // Returns the number of NULL values in the given column, if known.
    fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option<i64> {
        // NOTE: Stats for any given column are optional, which may produce a NULL nullcount. But if
        // the column itself is missing, then we know all values are implied to be NULL.
        let Some(stats) = self.get_stats(col) else {
            return Some(self.get_rowcount_stat_value());
        };

        // WARNING: [`Statistics::null_count_opt`] returns Some(0) when the underlying stat is
        // missing, causing an IS NULL predicate to wrongly skip the file if it contains any NULL
        // values. Manually drill into each arm's [`ValueStatistics`] for the stat's true value.
        let nullcount = match stats? {
            Statistics::Boolean(s) => s.null_count_opt(),
            Statistics::Int32(s) => s.null_count_opt(),
            Statistics::Int64(s) => s.null_count_opt(),
            Statistics::Int96(s) => s.null_count_opt(),
            Statistics::Float(s) => s.null_count_opt(),
            Statistics::Double(s) => s.null_count_opt(),
            Statistics::ByteArray(s) => s.null_count_opt(),
            Statistics::FixedLenByteArray(s) => s.null_count_opt(),
        };

        // Parquet nullcount stats are always u64, so we can directly return the value instead of
        // wrapping it in a Scalar. We can safely cast it from u64 to i64 because the nullcount can
        // never be larger than the rowcount and the parquet rowcount stat is i64.
        Some(nullcount? as i64)
    }

    // Row counts are always available in row group metadata, so this accessor is infallible.
    fn get_rowcount_stat_value(&self) -> i64 {
        self.row_group.num_rows()
    }
}
| 211 | + |
| 212 | +/// Given a filter expression of interest and a set of parquet column descriptors, build a column -> |
| 213 | +/// index mapping for columns the expression references. This ensures O(1) lookup times, for an |
| 214 | +/// overall O(n) cost to evaluate an expression tree with n nodes. |
| 215 | +pub(crate) fn compute_field_indices( |
| 216 | + fields: &[ColumnDescPtr], |
| 217 | + expression: &Expression, |
| 218 | +) -> HashMap<ColumnPath, usize> { |
| 219 | + fn do_recurse(expression: &Expression, cols: &mut HashSet<ColumnPath>) { |
| 220 | + use Expression::*; |
| 221 | + let mut recurse = |expr| do_recurse(expr, cols); // simplifies the call sites below |
| 222 | + match expression { |
| 223 | + Literal(_) => {} |
| 224 | + Column(name) => cols.extend([col_name_to_path(name)]), // returns `()`, unlike `insert` |
| 225 | + Struct(fields) => fields.iter().for_each(recurse), |
| 226 | + UnaryOperation { expr, .. } => recurse(expr), |
| 227 | + BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)), |
| 228 | + VariadicOperation { exprs, .. } => exprs.iter().for_each(recurse), |
| 229 | + } |
| 230 | + } |
| 231 | + |
| 232 | + // Build up a set of requested column paths, then take each found path as the corresponding map |
| 233 | + // key (avoids unnecessary cloning). |
| 234 | + // |
| 235 | + // NOTE: If a requested column was not available, it is silently ignored. These missing columns |
| 236 | + // are implied all-null, so we will infer their min/max stats as NULL and nullcount == rowcount. |
| 237 | + let mut requested_columns = HashSet::new(); |
| 238 | + do_recurse(expression, &mut requested_columns); |
| 239 | + fields |
| 240 | + .iter() |
| 241 | + .enumerate() |
| 242 | + .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) |
| 243 | + .collect() |
| 244 | +} |
0 commit comments