@@ -20,11 +20,22 @@ use crate::{
2020/// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included.
2121/// - all fields are made nullable.
2222///
23- /// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields
24- /// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case.
23+ /// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy.
24+ /// It tracks the count of null values for each column. All leaf fields from the base schema
25+ /// are converted to LONG type (since null counts are always integers). Maps, arrays, and
26+ /// variants are considered leaf fields. Unlike `minValues`/`maxValues`, `nullCount` includes
27+ /// all columns from the base schema regardless of data type, because nulls can be counted for any column.
2528///
26- /// For the min / max schemas, we non-eligible leaf fields from the base schema.
27- /// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`].
29+ /// Note: `nullCount` still respects the column limit from `dataSkippingNumIndexedCols` or
30+ /// `dataSkippingStatsColumns` (via the base schema). The difference from `minValues`/`maxValues`
31+ /// is only that `nullCount` does not filter by data type eligibility.
32+ ///
33+ /// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's
34+ /// column hierarchy. They additionally filter out leaf fields with non-eligible data types
35+ /// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`].
36+ ///
37+ /// See the Delta protocol for more details on statistics:
38+ /// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
2839///
2940/// The overall schema is then:
3041/// ```ignored
@@ -111,36 +122,23 @@ pub(crate) fn expected_stats_schema(
111122 StructType :: try_new ( fields)
112123}
113124
114- // Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
125+ /// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
126+ ///
127+ /// The nullCount struct field tracks the number of null values for each column.
128+ /// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
129+ /// since null counts are always integers, while struct fields are recursed into
130+ /// to preserve the nested structure.
115131#[ allow( unused) ]
116132pub ( crate ) struct NullCountStatsTransform ;
117133impl < ' a > SchemaTransform < ' a > for NullCountStatsTransform {
118- fn transform_primitive ( & mut self , _ptype : & ' a PrimitiveType ) -> Option < Cow < ' a , PrimitiveType > > {
119- Some ( Cow :: Owned ( PrimitiveType :: Long ) )
120- }
121134 fn transform_struct_field ( & mut self , field : & ' a StructField ) -> Option < Cow < ' a , StructField > > {
122- use Cow :: * ;
123-
124- if matches ! (
125- & field. data_type,
126- DataType :: Array ( _) | DataType :: Map ( _) | DataType :: Variant ( _)
127- ) {
128- return Some ( Cow :: Owned ( StructField {
129- name : field. name . clone ( ) ,
130- data_type : DataType :: LONG ,
131- nullable : true ,
132- metadata : Default :: default ( ) ,
133- } ) ) ;
134- }
135-
136- match self . transform ( & field. data_type ) ? {
137- Borrowed ( _) => Some ( Borrowed ( field) ) ,
138- dt => Some ( Owned ( StructField {
139- name : field. name . clone ( ) ,
140- data_type : dt. into_owned ( ) ,
141- nullable : true ,
142- metadata : Default :: default ( ) ,
143- } ) ) ,
135+ // Only recurse into struct fields; convert all other types (leaf fields) to LONG
136+ match & field. data_type {
137+ DataType :: Struct ( _) => self . recurse_into_struct_field ( field) ,
138+ _ => Some ( Cow :: Owned ( StructField :: nullable (
139+ & field. name ,
140+ DataType :: LONG ,
141+ ) ) ) ,
144142 }
145143 }
146144}
@@ -169,22 +167,21 @@ struct BaseStatsTransform {
169167impl BaseStatsTransform {
170168 #[ allow( unused) ]
171169 fn new ( props : & TableProperties ) -> Self {
172- // if data_skipping_stats_columns is specified, it takes precedence
173- // over data_skipping_num_indexed_cols, even if that is also specified
174- if let Some ( columns_names ) = & props. data_skipping_stats_columns {
170+ // If data_skipping_stats_columns is specified, it takes precedence
171+ // over data_skipping_num_indexed_cols, even if that is also specified.
172+ if let Some ( column_names ) = & props. data_skipping_stats_columns {
175173 Self {
176174 n_columns : None ,
177175 added_columns : 0 ,
178- column_names : Some ( columns_names . clone ( ) ) ,
176+ column_names : Some ( column_names . clone ( ) ) ,
179177 path : Vec :: new ( ) ,
180178 }
181179 } else {
180+ let n_cols = props
181+ . data_skipping_num_indexed_cols
182+ . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ;
182183 Self {
183- n_columns : Some (
184- props
185- . data_skipping_num_indexed_cols
186- . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ,
187- ) ,
184+ n_columns : Some ( n_cols) ,
188185 added_columns : 0 ,
189186 column_names : None ,
190187 path : Vec :: new ( ) ,
@@ -207,26 +204,23 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
207204
208205 self . path . push ( field. name . clone ( ) ) ;
209206 let data_type = field. data_type ( ) ;
210- let is_struct = matches ! ( data_type, DataType :: Struct ( _) ) ;
211-
212- // keep the field if it:
213- // - is a struct field and we need to traverse its children
214- // - OR it is referenced by the column names
215- // - OR it is a primitive type / leaf field
216- let should_include = is_struct
217- || self
207+
208+ // We always traverse struct fields (they don't count against the column limit),
209+ // but we only include leaf fields if they qualify based on column_names config.
210+ // When column_names is None, all leaf fields are included (up to n_columns limit).
211+ if !matches ! ( data_type, DataType :: Struct ( _) ) {
212+ let should_include = self
218213 . column_names
219214 . as_ref ( )
220215 . map ( |ns| should_include_column ( & ColumnName :: new ( & self . path ) , ns) )
221216 . unwrap_or ( true ) ;
222217
223- if !should_include {
224- self . path . pop ( ) ;
225- return None ;
226- }
218+ if !should_include {
219+ self . path . pop ( ) ;
220+ return None ;
221+ }
227222
228- // increment count only for leaf columns.
229- if !is_struct {
223+ // Increment count only for leaf columns
230224 self . added_columns += 1 ;
231225 }
232226
@@ -270,11 +264,7 @@ impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
270264 }
271265
272266 fn transform_primitive ( & mut self , ptype : & ' a PrimitiveType ) -> Option < Cow < ' a , PrimitiveType > > {
273- if is_skipping_eligible_datatype ( ptype) {
274- Some ( Cow :: Borrowed ( ptype) )
275- } else {
276- None
277- }
267+ is_skipping_eligible_datatype ( ptype) . then_some ( Cow :: Borrowed ( ptype) )
278268 }
279269}
280270
@@ -291,7 +281,11 @@ fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName])
291281}
292282
293283/// Checks if a data type is eligible for min/max file skipping.
294- /// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61
284+ ///
285+ /// Note: Boolean and Binary are intentionally excluded because min/max statistics provide little
286+ /// skipping benefit for low-cardinality (Boolean) or opaque (Binary) data types.
287+ ///
288+ /// See: <https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61>
295289#[ allow( unused) ]
296290fn is_skipping_eligible_datatype ( data_type : & PrimitiveType ) -> bool {
297291 matches ! (
@@ -306,7 +300,6 @@ fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
306300 | & PrimitiveType :: Timestamp
307301 | & PrimitiveType :: TimestampNtz
308302 | & PrimitiveType :: String
309- // | &PrimitiveType::Boolean
310303 | PrimitiveType :: Decimal ( _)
311304 )
312305}
0 commit comments