Skip to content

Commit daa9a1e

Browse files
authored
chore: stats-schema improvements (#1642)
## 🥞 Stacked PR Use this [link](https://github.com/delta-io/delta-kernel-rs/pull/1642/files) to review incremental changes. - [**stack/stats-schema**](#1642) [[Files changed](https://github.com/delta-io/delta-kernel-rs/pull/1642/files)] - [stack/write-stats-stack](#1656) [[Files changed](https://github.com/delta-io/delta-kernel-rs/pull/1656/files/f218eef7dafc67390b5fa3de7c4beea9e18acff4..38d6f4f61025af9adc10cd19737c3a6fb724350f)] --------- ## What changes are proposed in this pull request? Followups on stats_schema pr from #1594 <!-- **Uncomment** this section if there are any changes affecting public APIs. Else, **delete** this section. ### This PR affects the following public APIs If there are breaking changes, please ensure the `breaking-changes` label gets added by CI, and describe why the changes are needed. Note that _new_ public APIs are not considered breaking. --> ## How was this change tested?
1 parent f218eef commit daa9a1e

File tree

2 files changed

+99
-67
lines changed

2 files changed

+99
-67
lines changed

kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 73 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,27 @@ use crate::{
2020
/// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included.
2121
/// - all fields are made nullable.
2222
///
23-
/// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields
24-
/// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case.
23+
/// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy.
24+
/// It tracks the count of null values for each column. All leaf fields from the base schema
25+
/// are converted to LONG type (since null counts are always integers). Maps, arrays, and
26+
/// variants are considered leaf fields. Unlike `minValues`/`maxValues`, `nullCount` includes
27+
/// all columns from the base schema regardless of data type - every column can have nulls counted.
2528
///
26-
/// For the min / max schemas, we non-eligible leaf fields from the base schema.
27-
/// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`].
29+
/// Note: `nullCount` still respects the column limit from `dataSkippingNumIndexedCols` or
30+
/// `dataSkippingStatsColumns` (via the base schema). The difference from `minValues`/`maxValues`
31+
/// is only that `nullCount` does not filter by data type eligibility.
32+
///
33+
/// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's
34+
/// column hierarchy. They additionally filter out leaf fields with non-eligible data types
35+
/// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`].
36+
///
37+
/// The `tightBounds` field is a boolean indicating whether the min/max statistics are "tight"
38+
/// (accurate) or "wide" (potentially outdated). When `tightBounds` is `true`, the statistics
39+
/// accurately reflect the data in the file. When `false`, the file may have deletion vectors
40+
/// and the statistics haven't been recomputed to exclude deleted rows.
41+
///
42+
/// See the Delta protocol for more details on statistics:
43+
/// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
2844
///
2945
/// The overall schema is then:
3046
/// ```ignored
@@ -33,6 +49,7 @@ use crate::{
3349
/// nullCount: <derived null count schema>,
3450
/// minValues: <derived min/max schema>,
3551
/// maxValues: <derived min/max schema>,
52+
/// tightBounds: boolean,
3653
/// }
3754
/// ```
3855
///
@@ -73,14 +90,15 @@ use crate::{
7390
/// age: integer,
7491
/// },
7592
/// },
93+
/// tightBounds: boolean,
7694
/// }
7795
/// ```
7896
#[allow(unused)]
7997
pub(crate) fn expected_stats_schema(
8098
physical_file_schema: &Schema,
8199
table_properties: &TableProperties,
82100
) -> DeltaResult<Schema> {
83-
let mut fields = Vec::with_capacity(4);
101+
let mut fields = Vec::with_capacity(5);
84102
fields.push(StructField::nullable("numRecords", DataType::LONG));
85103

86104
// generate the base stats schema:
@@ -108,6 +126,10 @@ pub(crate) fn expected_stats_schema(
108126
}
109127
}
110128

129+
// tightBounds indicates whether min/max statistics are accurate (true) or potentially
130+
// outdated due to deletion vectors (false)
131+
fields.push(StructField::nullable("tightBounds", DataType::BOOLEAN));
132+
111133
StructType::try_new(fields)
112134
}
113135

@@ -130,36 +152,23 @@ impl<'a> SchemaTransform<'a> for NullableStatsTransform {
130152
}
131153
}
132154

133-
// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
155+
/// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
156+
///
157+
/// The nullCount struct field tracks the number of null values for each column.
158+
/// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
159+
/// since null counts are always integers, while struct fields are recursed into
160+
/// to preserve the nested structure.
134161
#[allow(unused)]
135162
pub(crate) struct NullCountStatsTransform;
136163
impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
137-
fn transform_primitive(&mut self, _ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
138-
Some(Cow::Owned(PrimitiveType::Long))
139-
}
140164
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
141-
use Cow::*;
142-
143-
if matches!(
144-
&field.data_type,
145-
DataType::Array(_) | DataType::Map(_) | DataType::Variant(_)
146-
) {
147-
return Some(Cow::Owned(StructField {
148-
name: field.name.clone(),
149-
data_type: DataType::LONG,
150-
nullable: true,
151-
metadata: Default::default(),
152-
}));
153-
}
154-
155-
match self.transform(&field.data_type)? {
156-
Borrowed(_) => Some(Borrowed(field)),
157-
dt => Some(Owned(StructField {
158-
name: field.name.clone(),
159-
data_type: dt.into_owned(),
160-
nullable: true,
161-
metadata: Default::default(),
162-
})),
165+
// Only recurse into struct fields; convert all other types (leaf fields) to LONG
166+
match &field.data_type {
167+
DataType::Struct(_) => self.recurse_into_struct_field(field),
168+
_ => Some(Cow::Owned(StructField::nullable(
169+
&field.name,
170+
DataType::LONG,
171+
))),
163172
}
164173
}
165174
}
@@ -188,22 +197,21 @@ struct BaseStatsTransform {
188197
impl BaseStatsTransform {
189198
#[allow(unused)]
190199
fn new(props: &TableProperties) -> Self {
191-
// if data_skipping_stats_columns is specified, it takes precedence
192-
// over data_skipping_num_indexed_cols, even if that is also specified
193-
if let Some(columns_names) = &props.data_skipping_stats_columns {
200+
// If data_skipping_stats_columns is specified, it takes precedence
201+
// over data_skipping_num_indexed_cols, even if that is also specified.
202+
if let Some(column_names) = &props.data_skipping_stats_columns {
194203
Self {
195204
n_columns: None,
196205
added_columns: 0,
197-
column_names: Some(columns_names.clone()),
206+
column_names: Some(column_names.clone()),
198207
path: Vec::new(),
199208
}
200209
} else {
210+
let n_cols = props
211+
.data_skipping_num_indexed_cols
212+
.unwrap_or(DataSkippingNumIndexedCols::NumColumns(32));
201213
Self {
202-
n_columns: Some(
203-
props
204-
.data_skipping_num_indexed_cols
205-
.unwrap_or(DataSkippingNumIndexedCols::NumColumns(32)),
206-
),
214+
n_columns: Some(n_cols),
207215
added_columns: 0,
208216
column_names: None,
209217
path: Vec::new(),
@@ -226,26 +234,23 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
226234

227235
self.path.push(field.name.clone());
228236
let data_type = field.data_type();
229-
let is_struct = matches!(data_type, DataType::Struct(_));
230-
231-
// keep the field if it:
232-
// - is a struct field and we need to traverse its children
233-
// - OR it is referenced by the column names
234-
// - OR it is a primitive type / leaf field
235-
let should_include = is_struct
236-
|| self
237+
238+
// We always traverse struct fields (they don't count against the column limit),
239+
// but we only include leaf fields if they qualify based on column_names config.
240+
// When column_names is None, all leaf fields are included (up to n_columns limit).
241+
if !matches!(data_type, DataType::Struct(_)) {
242+
let should_include = self
237243
.column_names
238244
.as_ref()
239245
.map(|ns| should_include_column(&ColumnName::new(&self.path), ns))
240246
.unwrap_or(true);
241247

242-
if !should_include {
243-
self.path.pop();
244-
return None;
245-
}
248+
if !should_include {
249+
self.path.pop();
250+
return None;
251+
}
246252

247-
// increment count only for leaf columns.
248-
if !is_struct {
253+
// Increment count only for leaf columns
249254
self.added_columns += 1;
250255
}
251256

@@ -289,11 +294,7 @@ impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
289294
}
290295

291296
fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
292-
if is_skipping_eligible_datatype(ptype) {
293-
Some(Cow::Borrowed(ptype))
294-
} else {
295-
None
296-
}
297+
is_skipping_eligible_datatype(ptype).then_some(Cow::Borrowed(ptype))
297298
}
298299
}
299300

@@ -310,7 +311,11 @@ fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName])
310311
}
311312

312313
/// Checks if a data type is eligible for min/max file skipping.
313-
/// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61
314+
///
315+
/// Note: Boolean and Binary are intentionally excluded as min/max statistics provide minimal
316+
/// skipping benefit for low-cardinality or opaque data types.
317+
///
318+
/// See: <https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61>
314319
#[allow(unused)]
315320
fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
316321
matches!(
@@ -325,7 +330,6 @@ fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
325330
| &PrimitiveType::Timestamp
326331
| &PrimitiveType::TimestampNtz
327332
| &PrimitiveType::String
328-
// | &PrimitiveType::Boolean
329333
| PrimitiveType::Decimal(_)
330334
)
331335
}
@@ -365,6 +369,7 @@ mod tests {
365369
StructField::nullable("nullCount", file_schema.clone()),
366370
StructField::nullable("minValues", file_schema.clone()),
367371
StructField::nullable("maxValues", file_schema),
372+
StructField::nullable("tightBounds", DataType::BOOLEAN),
368373
]);
369374

370375
assert_eq!(&expected, &stats_schema);
@@ -400,6 +405,7 @@ mod tests {
400405
StructField::nullable("nullCount", null_count),
401406
StructField::nullable("minValues", expected_min_max.clone()),
402407
StructField::nullable("maxValues", expected_min_max),
408+
StructField::nullable("tightBounds", DataType::BOOLEAN),
403409
]);
404410

405411
assert_eq!(&expected, &stats_schema);
@@ -457,6 +463,7 @@ mod tests {
457463
StructField::nullable("nullCount", expected_null),
458464
StructField::nullable("minValues", expected_fields.clone()),
459465
StructField::nullable("maxValues", expected_fields.clone()),
466+
StructField::nullable("tightBounds", DataType::BOOLEAN),
460467
]);
461468

462469
assert_eq!(&expected, &stats_schema);
@@ -497,6 +504,7 @@ mod tests {
497504
StructField::nullable("nullCount", null_count),
498505
StructField::nullable("minValues", expected_fields.clone()),
499506
StructField::nullable("maxValues", expected_fields.clone()),
507+
StructField::nullable("tightBounds", DataType::BOOLEAN),
500508
]);
501509

502510
assert_eq!(&expected, &stats_schema);
@@ -529,6 +537,7 @@ mod tests {
529537
StructField::nullable("nullCount", null_count),
530538
StructField::nullable("minValues", expected_fields.clone()),
531539
StructField::nullable("maxValues", expected_fields.clone()),
540+
StructField::nullable("tightBounds", DataType::BOOLEAN),
532541
]);
533542

534543
assert_eq!(&expected, &stats_schema);
@@ -566,6 +575,7 @@ mod tests {
566575
StructField::nullable("nullCount", expected_null_count),
567576
StructField::nullable("minValues", expected_min_max.clone()),
568577
StructField::nullable("maxValues", expected_min_max),
578+
StructField::nullable("tightBounds", DataType::BOOLEAN),
569579
]);
570580

571581
assert_eq!(&expected, &stats_schema);
@@ -619,6 +629,7 @@ mod tests {
619629
StructField::nullable("nullCount", expected_null_count),
620630
StructField::nullable("minValues", expected_min_max.clone()),
621631
StructField::nullable("maxValues", expected_min_max),
632+
StructField::nullable("tightBounds", DataType::BOOLEAN),
622633
]);
623634

624635
assert_eq!(&expected, &stats_schema);
@@ -653,6 +664,7 @@ mod tests {
653664
StructField::nullable("numRecords", DataType::LONG),
654665
StructField::nullable("nullCount", expected_null_count),
655666
// No minValues or maxValues fields since no primitive fields are eligible
667+
StructField::nullable("tightBounds", DataType::BOOLEAN),
656668
]);
657669

658670
assert_eq!(&expected, &stats_schema);

kernel/src/table_configuration.rs

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,17 +155,37 @@ impl TableConfiguration {
155155

156156
/// Generates the expected schema for file statistics.
157157
///
158-
/// Engines can decide to provide statistics for files written to the delta table,
159-
/// which enables data skipping and other optimizations. While it is not required to
160-
/// provide statistics, it is strongly recommended. This method generates the expected
161-
/// schema for statistics based on the table configuration. Often times the consfigration
162-
/// is based on operator experience or automates systems as to what statistics are most
163-
/// useful for a given table.
158+
/// Engines can provide statistics for files written to the delta table, enabling
159+
/// data skipping and other optimizations. This method generates the expected schema
160+
/// for structured statistics based on the table configuration.
161+
///
162+
/// The returned schema uses physical column names (respecting column mapping mode) and
163+
/// is structured as:
164+
/// ```text
165+
/// {
166+
/// numRecords: long,
167+
/// nullCount: { <physical columns with LONG type> },
168+
/// minValues: { <physical columns with original types> },
169+
/// maxValues: { <physical columns with original types> },
170+
/// }
171+
/// ```
172+
///
173+
/// The schema is affected by:
174+
/// - **Column mapping mode**: Field names use physical names from column mapping metadata.
175+
/// - **`delta.dataSkippingStatsColumns`**: If set, only specified columns are included.
176+
/// - **`delta.dataSkippingNumIndexedCols`**: Otherwise, includes the first N leaf columns
177+
/// (default 32).
178+
///
179+
/// See the Delta protocol for more details on per-file statistics:
180+
/// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
164181
#[allow(unused)]
165182
#[internal_api]
166183
pub(crate) fn expected_stats_schema(&self) -> DeltaResult<SchemaRef> {
167184
let partition_columns = self.metadata().partition_columns();
168185
let column_mapping_mode = self.column_mapping_mode();
186+
// Partition columns are excluded because statistics are only collected for data columns
187+
// that are physically stored in the parquet files. Partition values are stored in the
188+
// file path, not in the file content, so they don't have file-level statistics.
169189
let physical_schema = StructType::try_new(
170190
self.schema()
171191
.fields()

0 commit comments

Comments
 (0)