Commit 0c9ea1d

stats

1 parent bc9a5d8 commit 0c9ea1d

File tree: 2 files changed (+79, -66 lines)

kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 53 additions & 60 deletions
```diff
@@ -20,11 +20,22 @@ use crate::{
 /// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included.
 /// - all fields are made nullable.
 ///
-/// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields
-/// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case.
+/// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy.
+/// It tracks the count of null values for each column. All leaf fields from the base schema
+/// are converted to LONG type (since null counts are always integers). Maps, arrays, and
+/// variants are considered leaf fields. Unlike `minValues`/`maxValues`, `nullCount` includes
+/// all columns from the base schema regardless of data type - every column can have nulls counted.
 ///
-/// For the min / max schemas, we non-eligible leaf fields from the base schema.
-/// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`].
+/// Note: `nullCount` still respects the column limit from `dataSkippingNumIndexedCols` or
+/// `dataSkippingStatsColumns` (via the base schema). The difference from `minValues`/`maxValues`
+/// is only that `nullCount` does not filter by data type eligibility.
+///
+/// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's
+/// column hierarchy. They additionally filter out leaf fields with non-eligible data types
+/// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`].
+///
+/// See the Delta protocol for more details on statistics:
+/// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
 ///
 /// The overall schema is then:
 /// ```ignored
```
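
As a point of reference for the statistics layout described in this hunk, here is a hedged sketch (not part of the commit) of what per-file statistics matching that layout might look like, for a hypothetical table with columns `id: long` and `user: {name: string}`. It assumes the `serde_json` crate purely for illustration.

```rust
// Hypothetical per-file statistics in the shape the Delta protocol describes: `nullCount`
// mirrors the column hierarchy with integer (LONG) leaves, while `minValues`/`maxValues`
// keep the original leaf types and only cover skipping-eligible columns.
fn main() {
    let stats = serde_json::json!({
        "numRecords": 3,
        "nullCount": { "id": 0, "user": { "name": 1 } },
        "minValues": { "id": 1, "user": { "name": "alice" } },
        "maxValues": { "id": 42, "user": { "name": "zoe" } }
    });
    println!("{stats}");
}
```
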
```diff
@@ -111,36 +122,23 @@ pub(crate) fn expected_stats_schema(
     StructType::try_new(fields)
 }
 
-// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
+/// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
+///
+/// The nullCount struct field tracks the number of null values for each column.
+/// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
+/// since null counts are always integers, while struct fields are recursed into
+/// to preserve the nested structure.
 #[allow(unused)]
 pub(crate) struct NullCountStatsTransform;
 impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
-    fn transform_primitive(&mut self, _ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
-        Some(Cow::Owned(PrimitiveType::Long))
-    }
     fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
-        use Cow::*;
-
-        if matches!(
-            &field.data_type,
-            DataType::Array(_) | DataType::Map(_) | DataType::Variant(_)
-        ) {
-            return Some(Cow::Owned(StructField {
-                name: field.name.clone(),
-                data_type: DataType::LONG,
-                nullable: true,
-                metadata: Default::default(),
-            }));
-        }
-
-        match self.transform(&field.data_type)? {
-            Borrowed(_) => Some(Borrowed(field)),
-            dt => Some(Owned(StructField {
-                name: field.name.clone(),
-                data_type: dt.into_owned(),
-                nullable: true,
-                metadata: Default::default(),
-            })),
+        // Only recurse into struct fields; convert all other types (leaf fields) to LONG
+        match &field.data_type {
+            DataType::Struct(_) => self.recurse_into_struct_field(field),
+            _ => Some(Cow::Owned(StructField::nullable(
+                &field.name,
+                DataType::LONG,
+            ))),
         }
     }
 }
```
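
The same idea can be shown as a small standalone sketch that uses a hypothetical `Ty` enum instead of the kernel's `DataType`/`SchemaTransform` machinery: struct fields are recursed into, and every other field collapses to LONG. In the real code above, the recursion is supplied by the trait's `recurse_into_struct_field`.

```rust
// Standalone sketch of the nullCount transform idea; `Ty` is a made-up stand-in for the
// kernel's DataType, not the crate's real type.
#[derive(Debug)]
enum Ty {
    Long,
    Str,
    Array(Box<Ty>),
    Struct(Vec<(String, Ty)>),
}

fn to_null_count_ty(ty: &Ty) -> Ty {
    match ty {
        // Recurse into structs so the nested shape is preserved.
        Ty::Struct(fields) => Ty::Struct(
            fields
                .iter()
                .map(|(name, t)| (name.clone(), to_null_count_ty(t)))
                .collect(),
        ),
        // Everything else (including arrays, maps, variants) is a leaf and becomes LONG.
        _ => Ty::Long,
    }
}

fn main() {
    let schema = Ty::Struct(vec![
        ("id".into(), Ty::Long),
        ("tags".into(), Ty::Array(Box::new(Ty::Str))),
        ("user".into(), Ty::Struct(vec![("name".into(), Ty::Str)])),
    ]);
    // Prints a struct where `id`, `tags`, and `user.name` are all Long.
    println!("{:?}", to_null_count_ty(&schema));
}
```
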
```diff
@@ -169,22 +167,21 @@ struct BaseStatsTransform {
 impl BaseStatsTransform {
     #[allow(unused)]
     fn new(props: &TableProperties) -> Self {
-        // if data_skipping_stats_columns is specified, it takes precedence
-        // over data_skipping_num_indexed_cols, even if that is also specified
-        if let Some(columns_names) = &props.data_skipping_stats_columns {
+        // If data_skipping_stats_columns is specified, it takes precedence
+        // over data_skipping_num_indexed_cols, even if that is also specified.
+        if let Some(column_names) = &props.data_skipping_stats_columns {
             Self {
                 n_columns: None,
                 added_columns: 0,
-                column_names: Some(columns_names.clone()),
+                column_names: Some(column_names.clone()),
                 path: Vec::new(),
             }
         } else {
+            let n_cols = props
+                .data_skipping_num_indexed_cols
+                .unwrap_or(DataSkippingNumIndexedCols::NumColumns(32));
             Self {
-                n_columns: Some(
-                    props
-                        .data_skipping_num_indexed_cols
-                        .unwrap_or(DataSkippingNumIndexedCols::NumColumns(32)),
-                ),
+                n_columns: Some(n_cols),
                 added_columns: 0,
                 column_names: None,
                 path: Vec::new(),
```
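
The precedence encoded in `new` (an explicit `dataSkippingStatsColumns` list wins; otherwise `dataSkippingNumIndexedCols` applies with a default of 32) can be illustrated without the crate's types. `effective_limit` below is a hypothetical helper, not an API of this kernel.

```rust
// Hypothetical helper mirroring the precedence: an explicit stats-column list disables the
// numeric limit entirely; otherwise the numeric limit applies, defaulting to 32 leaf columns.
fn effective_limit(stats_columns: Option<&[String]>, num_indexed_cols: Option<u64>) -> Option<u64> {
    if stats_columns.is_some() {
        None
    } else {
        Some(num_indexed_cols.unwrap_or(32))
    }
}

fn main() {
    let cols: Vec<String> = vec!["user.name".into()];
    assert_eq!(effective_limit(None, None), Some(32));
    assert_eq!(effective_limit(None, Some(8)), Some(8));
    assert_eq!(effective_limit(Some(cols.as_slice()), Some(8)), None);
}
```
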
```diff
@@ -207,26 +204,23 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
 
         self.path.push(field.name.clone());
         let data_type = field.data_type();
-        let is_struct = matches!(data_type, DataType::Struct(_));
-
-        // keep the field if it:
-        // - is a struct field and we need to traverse its children
-        // - OR it is referenced by the column names
-        // - OR it is a primitive type / leaf field
-        let should_include = is_struct
-            || self
+
+        // We always traverse struct fields (they don't count against the column limit),
+        // but we only include leaf fields if they qualify based on column_names config.
+        // When column_names is None, all leaf fields are included (up to n_columns limit).
+        if !matches!(data_type, DataType::Struct(_)) {
+            let should_include = self
                 .column_names
                 .as_ref()
                 .map(|ns| should_include_column(&ColumnName::new(&self.path), ns))
                 .unwrap_or(true);
 
-        if !should_include {
-            self.path.pop();
-            return None;
-        }
+            if !should_include {
+                self.path.pop();
+                return None;
+            }
 
-        // increment count only for leaf columns.
-        if !is_struct {
+            // Increment count only for leaf columns
             self.added_columns += 1;
         }
 
```
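The traversal in this hunk pushes each field name onto a path stack, recurses into structs without counting them, and counts and filters only leaf fields. A standalone sketch of that pattern follows; `Node` and `collect_leaves` are hypothetical and not the crate's types.

```rust
// Standalone sketch of path-tracking traversal: only leaf fields are collected (and would be
// counted against limits such as dataSkippingNumIndexedCols); struct fields are just walked.
#[derive(Debug)]
enum Node {
    Leaf,
    Struct(Vec<(String, Node)>),
}

fn collect_leaves(node: &Node, path: &mut Vec<String>, out: &mut Vec<String>) {
    match node {
        Node::Struct(children) => {
            for (name, child) in children {
                path.push(name.clone());
                collect_leaves(child, path, out);
                path.pop();
            }
        }
        Node::Leaf => out.push(path.join(".")),
    }
}

fn main() {
    let schema = Node::Struct(vec![
        ("id".into(), Node::Leaf),
        ("user".into(), Node::Struct(vec![("name".into(), Node::Leaf)])),
    ]);
    let (mut path, mut out) = (Vec::new(), Vec::new());
    collect_leaves(&schema, &mut path, &mut out);
    assert_eq!(out, vec!["id".to_string(), "user.name".to_string()]);
}
```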

```diff
@@ -270,11 +264,7 @@ impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
     }
 
     fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
-        if is_skipping_eligible_datatype(ptype) {
-            Some(Cow::Borrowed(ptype))
-        } else {
-            None
-        }
+        is_skipping_eligible_datatype(ptype).then_some(Cow::Borrowed(ptype))
     }
 }
 
```
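The refactor in this hunk uses the standard library's `bool::then_some`, which yields `Some(value)` when the receiver is `true` and `None` otherwise. A quick check of the equivalence:

```rust
// then_some(v) condenses the if/else that the old code spelled out.
fn main() {
    let eligible = true;
    let verbose = if eligible { Some("keep") } else { None };
    assert_eq!(verbose, eligible.then_some("keep"));
    assert_eq!(false.then_some("keep"), None);
}
```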

```diff
@@ -291,7 +281,11 @@ fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName])
 }
 
 /// Checks if a data type is eligible for min/max file skipping.
-/// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61
+///
+/// Note: Boolean and Binary are intentionally excluded as min/max statistics provide minimal
+/// skipping benefit for low-cardinality or opaque data types.
+///
+/// See: <https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61>
 #[allow(unused)]
 fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
     matches!(
@@ -306,7 +300,6 @@ fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
         | &PrimitiveType::Timestamp
         | &PrimitiveType::TimestampNtz
         | &PrimitiveType::String
-        // | &PrimitiveType::Boolean
         | PrimitiveType::Decimal(_)
     )
 }
```
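
To see why the excluded types add little value (as the new doc comment notes), consider a Boolean column: with only two possible values, almost every file's min/max pair is `(false, true)`, so a predicate can rarely rule a file out. A hedged toy check, not kernel code:

```rust
// Toy skipping check for the predicate `col = true`: a file can be skipped only when its max
// is false (no row can be true). Most files containing a Boolean column have max = true.
fn can_skip_eq_true(_min: bool, max: bool) -> bool {
    !max
}

fn main() {
    assert!(!can_skip_eq_true(false, true)); // mixed file: cannot be skipped
    assert!(can_skip_eq_true(false, false)); // all-false file: the rare skippable case
}
```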

kernel/src/table_configuration.rs

Lines changed: 26 additions & 6 deletions
```diff
@@ -139,17 +139,37 @@ impl TableConfiguration {
 
     /// Generates the expected schema for file statistics.
     ///
-    /// Engines can decide to provide statistics for files written to the delta table,
-    /// which enables data skipping and other optimizations. While it is not required to
-    /// provide statistics, it is strongly recommended. This method generates the expected
-    /// schema for statistics based on the table configuration. Often times the consfigration
-    /// is based on operator experience or automates systems as to what statistics are most
-    /// useful for a given table.
+    /// Engines can provide statistics for files written to the delta table, enabling
+    /// data skipping and other optimizations. This method generates the expected schema
+    /// for structured statistics based on the table configuration.
+    ///
+    /// The returned schema uses physical column names (respecting column mapping mode) and
+    /// is structured as:
+    /// ```text
+    /// {
+    ///   numRecords: long,
+    ///   nullCount: { <physical columns with LONG type> },
+    ///   minValues: { <physical columns with original types> },
+    ///   maxValues: { <physical columns with original types> },
+    /// }
+    /// ```
+    ///
+    /// The schema is affected by:
+    /// - **Column mapping mode**: Field names use physical names from column mapping metadata.
+    /// - **`delta.dataSkippingStatsColumns`**: If set, only specified columns are included.
+    /// - **`delta.dataSkippingNumIndexedCols`**: Otherwise, includes the first N leaf columns
+    ///   (default 32).
+    ///
+    /// See the Delta protocol for more details on per-file statistics:
+    /// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
     #[allow(unused)]
     #[internal_api]
     pub(crate) fn expected_stats_schema(&self) -> DeltaResult<SchemaRef> {
         let partition_columns = self.metadata().partition_columns();
         let column_mapping_mode = self.column_mapping_mode();
+        // Partition columns are excluded because statistics are only collected for data columns
+        // that are physically stored in the parquet files. Partition values are stored in the
+        // file path, not in the file content, so they don't have file-level statistics.
         let physical_schema = StructType::try_new(
             self.schema()
                 .fields()
```
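
For context on the knobs named in the new doc comment, these are the Delta table properties involved. The snippet only illustrates the property keys with plausible values; it is not how `TableConfiguration` actually receives them.

```rust
use std::collections::HashMap;

fn main() {
    let mut props: HashMap<&str, &str> = HashMap::new();
    // Explicit list: takes precedence; only these leaf columns get min/max/nullCount stats.
    props.insert("delta.dataSkippingStatsColumns", "id, user.name");
    // Numeric limit: used only when no explicit list is set (the default is 32 leaf columns).
    props.insert("delta.dataSkippingNumIndexedCols", "8");
    // Column mapping mode determines whether physical column names appear in the stats schema.
    props.insert("delta.columnMapping.mode", "name");
    for (key, value) in &props {
        println!("{key} = {value}");
    }
}
```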
