@@ -20,11 +20,27 @@ use crate::{
2020/// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included.
2121/// - all fields are made nullable.
2222///
23- /// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields
24- /// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case.
23+ /// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy.
24+ /// It tracks the count of null values for each column. All leaf fields from the base schema
25+ /// are converted to LONG type (since null counts are always integers). Maps, arrays, and
26+ /// variants are considered leaf fields. Unlike `minValues`/`maxValues`, `nullCount` includes
27+ /// all columns from the base schema regardless of data type - every column can have nulls counted.
2528///
26- /// For the min / max schemas, we non-eligible leaf fields from the base schema.
27- /// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`].
29+ /// Note: `nullCount` still respects the column limit from `dataSkippingNumIndexedCols` or
30+ /// `dataSkippingStatsColumns` (via the base schema). The difference from `minValues`/`maxValues`
31+ /// is only that `nullCount` does not filter by data type eligibility.
32+ ///
33+ /// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's
34+ /// column hierarchy. They additionally filter out leaf fields with non-eligible data types
35+ /// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`].
36+ ///
37+ /// The `tightBounds` field is a boolean indicating whether the min/max statistics are "tight"
38+ /// (accurate) or "wide" (potentially outdated). When `tightBounds` is `true`, the statistics
39+ /// accurately reflect the data in the file. When `false`, the file may have deletion vectors
40+ /// and the statistics haven't been recomputed to exclude deleted rows.
41+ ///
42+ /// See the Delta protocol for more details on statistics:
43+ /// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
2844///
2945/// The overall schema is then:
3046/// ```ignored
@@ -33,6 +49,7 @@ use crate::{
3349/// nullCount: <derived null count schema>,
3450/// minValues: <derived min/max schema>,
3551/// maxValues: <derived min/max schema>,
52+ /// tightBounds: boolean,
3653/// }
3754/// ```
3855///
@@ -73,14 +90,15 @@ use crate::{
7390/// age: integer,
7491/// },
7592/// },
93+ /// tightBounds: boolean,
7694/// }
7795/// ```
7896#[ allow( unused) ]
7997pub ( crate ) fn expected_stats_schema (
8098 physical_file_schema : & Schema ,
8199 table_properties : & TableProperties ,
82100) -> DeltaResult < Schema > {
83- let mut fields = Vec :: with_capacity ( 4 ) ;
101+ let mut fields = Vec :: with_capacity ( 5 ) ;
84102 fields. push ( StructField :: nullable ( "numRecords" , DataType :: LONG ) ) ;
85103
86104 // generate the base stats schema:
@@ -108,6 +126,10 @@ pub(crate) fn expected_stats_schema(
108126 }
109127 }
110128
129+ // tightBounds indicates whether min/max statistics are accurate (true) or potentially
130+ // outdated due to deletion vectors (false)
131+ fields. push ( StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ) ;
132+
111133 StructType :: try_new ( fields)
112134}
113135
@@ -130,36 +152,23 @@ impl<'a> SchemaTransform<'a> for NullableStatsTransform {
130152 }
131153}
132154
133- // Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
155+ /// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
156+ ///
157+ /// The nullCount struct field tracks the number of null values for each column.
158+ /// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
159+ /// since null counts are always integers, while struct fields are recursed into
160+ /// to preserve the nested structure.
134161#[ allow( unused) ]
135162pub ( crate ) struct NullCountStatsTransform ;
136163impl < ' a > SchemaTransform < ' a > for NullCountStatsTransform {
137- fn transform_primitive ( & mut self , _ptype : & ' a PrimitiveType ) -> Option < Cow < ' a , PrimitiveType > > {
138- Some ( Cow :: Owned ( PrimitiveType :: Long ) )
139- }
140164 fn transform_struct_field ( & mut self , field : & ' a StructField ) -> Option < Cow < ' a , StructField > > {
141- use Cow :: * ;
142-
143- if matches ! (
144- & field. data_type,
145- DataType :: Array ( _) | DataType :: Map ( _) | DataType :: Variant ( _)
146- ) {
147- return Some ( Cow :: Owned ( StructField {
148- name : field. name . clone ( ) ,
149- data_type : DataType :: LONG ,
150- nullable : true ,
151- metadata : Default :: default ( ) ,
152- } ) ) ;
153- }
154-
155- match self . transform ( & field. data_type ) ? {
156- Borrowed ( _) => Some ( Borrowed ( field) ) ,
157- dt => Some ( Owned ( StructField {
158- name : field. name . clone ( ) ,
159- data_type : dt. into_owned ( ) ,
160- nullable : true ,
161- metadata : Default :: default ( ) ,
162- } ) ) ,
165+ // Only recurse into struct fields; convert all other types (leaf fields) to LONG
166+ match & field. data_type {
167+ DataType :: Struct ( _) => self . recurse_into_struct_field ( field) ,
168+ _ => Some ( Cow :: Owned ( StructField :: nullable (
169+ & field. name ,
170+ DataType :: LONG ,
171+ ) ) ) ,
163172 }
164173 }
165174}
@@ -188,22 +197,21 @@ struct BaseStatsTransform {
188197impl BaseStatsTransform {
189198 #[ allow( unused) ]
190199 fn new ( props : & TableProperties ) -> Self {
191- // if data_skipping_stats_columns is specified, it takes precedence
192- // over data_skipping_num_indexed_cols, even if that is also specified
193- if let Some ( columns_names ) = & props. data_skipping_stats_columns {
200+ // If data_skipping_stats_columns is specified, it takes precedence
201+ // over data_skipping_num_indexed_cols, even if that is also specified.
202+ if let Some ( column_names ) = & props. data_skipping_stats_columns {
194203 Self {
195204 n_columns : None ,
196205 added_columns : 0 ,
197- column_names : Some ( columns_names . clone ( ) ) ,
206+ column_names : Some ( column_names . clone ( ) ) ,
198207 path : Vec :: new ( ) ,
199208 }
200209 } else {
210+ let n_cols = props
211+ . data_skipping_num_indexed_cols
212+ . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ;
201213 Self {
202- n_columns : Some (
203- props
204- . data_skipping_num_indexed_cols
205- . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ,
206- ) ,
214+ n_columns : Some ( n_cols) ,
207215 added_columns : 0 ,
208216 column_names : None ,
209217 path : Vec :: new ( ) ,
@@ -226,26 +234,23 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
226234
227235 self . path . push ( field. name . clone ( ) ) ;
228236 let data_type = field. data_type ( ) ;
229- let is_struct = matches ! ( data_type, DataType :: Struct ( _) ) ;
230-
231- // keep the field if it:
232- // - is a struct field and we need to traverse its children
233- // - OR it is referenced by the column names
234- // - OR it is a primitive type / leaf field
235- let should_include = is_struct
236- || self
237+
238+ // We always traverse struct fields (they don't count against the column limit),
239+ // but we only include leaf fields if they qualify based on column_names config.
240+ // When column_names is None, all leaf fields are included (up to n_columns limit).
241+ if !matches ! ( data_type, DataType :: Struct ( _) ) {
242+ let should_include = self
237243 . column_names
238244 . as_ref ( )
239245 . map ( |ns| should_include_column ( & ColumnName :: new ( & self . path ) , ns) )
240246 . unwrap_or ( true ) ;
241247
242- if !should_include {
243- self . path . pop ( ) ;
244- return None ;
245- }
248+ if !should_include {
249+ self . path . pop ( ) ;
250+ return None ;
251+ }
246252
247- // increment count only for leaf columns.
248- if !is_struct {
253+ // Increment count only for leaf columns
249254 self . added_columns += 1 ;
250255 }
251256
@@ -289,11 +294,7 @@ impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
289294 }
290295
291296 fn transform_primitive ( & mut self , ptype : & ' a PrimitiveType ) -> Option < Cow < ' a , PrimitiveType > > {
292- if is_skipping_eligible_datatype ( ptype) {
293- Some ( Cow :: Borrowed ( ptype) )
294- } else {
295- None
296- }
297+ is_skipping_eligible_datatype ( ptype) . then_some ( Cow :: Borrowed ( ptype) )
297298 }
298299}
299300
@@ -310,7 +311,11 @@ fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName])
310311}
311312
312313/// Checks if a data type is eligible for min/max file skipping.
313- /// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61
314+ ///
315+ /// Note: Boolean and Binary are intentionally excluded as min/max statistics provide minimal
316+ /// skipping benefit for low-cardinality or opaque data types.
317+ ///
318+ /// See: <https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61>
314319#[ allow( unused) ]
315320fn is_skipping_eligible_datatype ( data_type : & PrimitiveType ) -> bool {
316321 matches ! (
@@ -325,7 +330,6 @@ fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
325330 | & PrimitiveType :: Timestamp
326331 | & PrimitiveType :: TimestampNtz
327332 | & PrimitiveType :: String
328- // | &PrimitiveType::Boolean
329333 | PrimitiveType :: Decimal ( _)
330334 )
331335}
@@ -365,6 +369,7 @@ mod tests {
365369 StructField :: nullable ( "nullCount" , file_schema. clone ( ) ) ,
366370 StructField :: nullable ( "minValues" , file_schema. clone ( ) ) ,
367371 StructField :: nullable ( "maxValues" , file_schema) ,
372+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
368373 ] ) ;
369374
370375 assert_eq ! ( & expected, & stats_schema) ;
@@ -400,6 +405,7 @@ mod tests {
400405 StructField :: nullable ( "nullCount" , null_count) ,
401406 StructField :: nullable ( "minValues" , expected_min_max. clone ( ) ) ,
402407 StructField :: nullable ( "maxValues" , expected_min_max) ,
408+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
403409 ] ) ;
404410
405411 assert_eq ! ( & expected, & stats_schema) ;
@@ -457,6 +463,7 @@ mod tests {
457463 StructField :: nullable ( "nullCount" , expected_null) ,
458464 StructField :: nullable ( "minValues" , expected_fields. clone ( ) ) ,
459465 StructField :: nullable ( "maxValues" , expected_fields. clone ( ) ) ,
466+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
460467 ] ) ;
461468
462469 assert_eq ! ( & expected, & stats_schema) ;
@@ -497,6 +504,7 @@ mod tests {
497504 StructField :: nullable ( "nullCount" , null_count) ,
498505 StructField :: nullable ( "minValues" , expected_fields. clone ( ) ) ,
499506 StructField :: nullable ( "maxValues" , expected_fields. clone ( ) ) ,
507+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
500508 ] ) ;
501509
502510 assert_eq ! ( & expected, & stats_schema) ;
@@ -529,6 +537,7 @@ mod tests {
529537 StructField :: nullable ( "nullCount" , null_count) ,
530538 StructField :: nullable ( "minValues" , expected_fields. clone ( ) ) ,
531539 StructField :: nullable ( "maxValues" , expected_fields. clone ( ) ) ,
540+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
532541 ] ) ;
533542
534543 assert_eq ! ( & expected, & stats_schema) ;
@@ -566,6 +575,7 @@ mod tests {
566575 StructField :: nullable ( "nullCount" , expected_null_count) ,
567576 StructField :: nullable ( "minValues" , expected_min_max. clone ( ) ) ,
568577 StructField :: nullable ( "maxValues" , expected_min_max) ,
578+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
569579 ] ) ;
570580
571581 assert_eq ! ( & expected, & stats_schema) ;
@@ -619,6 +629,7 @@ mod tests {
619629 StructField :: nullable ( "nullCount" , expected_null_count) ,
620630 StructField :: nullable ( "minValues" , expected_min_max. clone ( ) ) ,
621631 StructField :: nullable ( "maxValues" , expected_min_max) ,
632+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
622633 ] ) ;
623634
624635 assert_eq ! ( & expected, & stats_schema) ;
@@ -653,6 +664,7 @@ mod tests {
653664 StructField :: nullable ( "numRecords" , DataType :: LONG ) ,
654665 StructField :: nullable ( "nullCount" , expected_null_count) ,
655666 // No minValues or maxValues fields since no primitive fields are eligible
667+ StructField :: nullable ( "tightBounds" , DataType :: BOOLEAN ) ,
656668 ] ) ;
657669
658670 assert_eq ! ( & expected, & stats_schema) ;
0 commit comments