Commit 0c9ea1d

stats

1 parent bc9a5d8 commit 0c9ea1d

File tree: 2 files changed (+79, -66 lines)

kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 53 additions & 60 deletions
```diff
@@ -20,11 +20,22 @@ use crate::{
 /// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included.
 /// - all fields are made nullable.
 ///
-/// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields
-/// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case.
+/// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy.
+/// It tracks the count of null values for each column. All leaf fields from the base schema
+/// are converted to LONG type (since null counts are always integers). Maps, arrays, and
+/// variants are considered leaf fields. Unlike `minValues`/`maxValues`, `nullCount` includes
+/// all columns from the base schema regardless of data type - every column can have nulls counted.
 ///
-/// For the min / max schemas, we non-eligible leaf fields from the base schema.
-/// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`].
+/// Note: `nullCount` still respects the column limit from `dataSkippingNumIndexedCols` or
+/// `dataSkippingStatsColumns` (via the base schema). The difference from `minValues`/`maxValues`
+/// is only that `nullCount` does not filter by data type eligibility.
+///
+/// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's
+/// column hierarchy. They additionally filter out leaf fields with non-eligible data types
+/// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`].
+///
+/// See the Delta protocol for more details on statistics:
+/// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
 ///
 /// The overall schema is then:
 /// ```ignored
```
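
As a point of reference for the statistics layout described in this hunk, here is a hedged sketch (not part of the commit) of what per-file statistics matching that layout might look like, for a hypothetical table with columns `id: long` and `user: {name: string}`. It assumes the `serde_json` crate purely for illustration.

```rust
// Hypothetical per-file statistics in the shape the Delta protocol describes: `nullCount`
// mirrors the column hierarchy with integer (LONG) leaves, while `minValues`/`maxValues`
// keep the original leaf types and only cover skipping-eligible columns.
fn main() {
    let stats = serde_json::json!({
        "numRecords": 3,
        "nullCount": { "id": 0, "user": { "name": 1 } },
        "minValues": { "id": 1, "user": { "name": "alice" } },
        "maxValues": { "id": 42, "user": { "name": "zoe" } }
    });
    println!("{stats}");
}
```
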
```diff
@@ -111,36 +122,23 @@ pub(crate) fn expected_stats_schema(
     StructType::try_new(fields)
 }
 
-// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
+/// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
+///
+/// The nullCount struct field tracks the number of null values for each column.
+/// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
+/// since null counts are always integers, while struct fields are recursed into
+/// to preserve the nested structure.
 #[allow(unused)]
 pub(crate) struct NullCountStatsTransform;
 impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
-    fn transform_primitive(&mut self, _ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
-        Some(Cow::Owned(PrimitiveType::Long))
-    }
     fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
-        use Cow::*;
-
-        if matches!(
-            &field.data_type,
-            DataType::Array(_) | DataType::Map(_) | DataType::Variant(_)
-        ) {
-            return Some(Cow::Owned(StructField {
-                name: field.name.clone(),
-                data_type: DataType::LONG,
-                nullable: true,
-                metadata: Default::default(),
-            }));
-        }
-
-        match self.transform(&field.data_type)? {
-            Borrowed(_) => Some(Borrowed(field)),
-            dt => Some(Owned(StructField {
-                name: field.name.clone(),
-                data_type: dt.into_owned(),
-                nullable: true,
-                metadata: Default::default(),
-            })),
+        // Only recurse into struct fields; convert all other types (leaf fields) to LONG
+        match &field.data_type {
+            DataType::Struct(_) => self.recurse_into_struct_field(field),
+            _ => Some(Cow::Owned(StructField::nullable(
+                &field.name,
+                DataType::LONG,
+            ))),
         }
     }
 }
```
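
The same idea can be shown as a small standalone sketch that uses a hypothetical `Ty` enum instead of the kernel's `DataType`/`SchemaTransform` machinery: struct fields are recursed into, and every other field collapses to LONG. In the real code above, the recursion is supplied by the trait's `recurse_into_struct_field`.

```rust
// Standalone sketch of the nullCount transform idea; `Ty` is a made-up stand-in for the
// kernel's DataType, not the crate's real type.
#[derive(Debug)]
enum Ty {
    Long,
    Str,
    Array(Box<Ty>),
    Struct(Vec<(String, Ty)>),
}

fn to_null_count_ty(ty: &Ty) -> Ty {
    match ty {
        // Recurse into structs so the nested shape is preserved.
        Ty::Struct(fields) => Ty::Struct(
            fields
                .iter()
                .map(|(name, t)| (name.clone(), to_null_count_ty(t)))
                .collect(),
        ),
        // Everything else (including arrays, maps, variants) is a leaf and becomes LONG.
        _ => Ty::Long,
    }
}

fn main() {
    let schema = Ty::Struct(vec![
        ("id".into(), Ty::Long),
        ("tags".into(), Ty::Array(Box::new(Ty::Str))),
        ("user".into(), Ty::Struct(vec![("name".into(), Ty::Str)])),
    ]);
    // Prints a struct where `id`, `tags`, and `user.name` are all Long.
    println!("{:?}", to_null_count_ty(&schema));
}
```
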
```diff
@@ -169,22 +167,21 @@ struct BaseStatsTransform {
 impl BaseStatsTransform {
     #[allow(unused)]
     fn new(props: &TableProperties) -> Self {
-        // if data_skipping_stats_columns is specified, it takes precedence
-        // over data_skipping_num_indexed_cols, even if that is also specified
-        if let Some(columns_names) = &props.data_skipping_stats_columns {
+        // If data_skipping_stats_columns is specified, it takes precedence
+        // over data_skipping_num_indexed_cols, even if that is also specified.
+        if let Some(column_names) = &props.data_skipping_stats_columns {
             Self {
                 n_columns: None,
                 added_columns: 0,
-                column_names: Some(columns_names.clone()),
+                column_names: Some(column_names.clone()),
                 path: Vec::new(),
             }
         } else {
+            let n_cols = props
+                .data_skipping_num_indexed_cols
+                .unwrap_or(DataSkippingNumIndexedCols::NumColumns(32));
             Self {
-                n_columns: Some(
-                    props
-                        .data_skipping_num_indexed_cols
-                        .unwrap_or(DataSkippingNumIndexedCols::NumColumns(32)),
-                ),
+                n_columns: Some(n_cols),
                 added_columns: 0,
                 column_names: None,
                 path: Vec::new(),
```
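
The precedence encoded in `new` (an explicit `dataSkippingStatsColumns` list wins; otherwise `dataSkippingNumIndexedCols` applies with a default of 32) can be illustrated without the crate's types. `effective_limit` below is a hypothetical helper, not an API of this kernel.

```rust
// Hypothetical helper mirroring the precedence: an explicit stats-column list disables the
// numeric limit entirely; otherwise the numeric limit applies, defaulting to 32 leaf columns.
fn effective_limit(stats_columns: Option<&[String]>, num_indexed_cols: Option<u64>) -> Option<u64> {
    if stats_columns.is_some() {
        None
    } else {
        Some(num_indexed_cols.unwrap_or(32))
    }
}

fn main() {
    let cols: Vec<String> = vec!["user.name".into()];
    assert_eq!(effective_limit(None, None), Some(32));
    assert_eq!(effective_limit(None, Some(8)), Some(8));
    assert_eq!(effective_limit(Some(cols.as_slice()), Some(8)), None);
}
```
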
```diff
@@ -207,26 +204,23 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
 
         self.path.push(field.name.clone());
         let data_type = field.data_type();
-        let is_struct = matches!(data_type, DataType::Struct(_));
-
-        // keep the field if it:
-        // - is a struct field and we need to traverse its children
-        // - OR it is referenced by the column names
-        // - OR it is a primitive type / leaf field
-        let should_include = is_struct
-            || self
+
+        // We always traverse struct fields (they don't count against the column limit),
+        // but we only include leaf fields if they qualify based on column_names config.
+        // When column_names is None, all leaf fields are included (up to n_columns limit).
+        if !matches!(data_type, DataType::Struct(_)) {
+            let should_include = self
                 .column_names
                 .as_ref()
                 .map(|ns| should_include_column(&ColumnName::new(&self.path), ns))
                 .unwrap_or(true);
 
-        if !should_include {
-            self.path.pop();
-            return None;
-        }
+            if !should_include {
+                self.path.pop();
+                return None;
+            }
 
-        // increment count only for leaf columns.
-        if !is_struct {
+            // Increment count only for leaf columns
             self.added_columns += 1;
         }
 
```
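The traversal in this hunk pushes each field name onto a path stack, recurses into structs without counting them, and counts and filters only leaf fields. A standalone sketch of that pattern follows; `Node` and `collect_leaves` are hypothetical and not the crate's types.

```rust
// Standalone sketch of path-tracking traversal: only leaf fields are collected (and would be
// counted against limits such as dataSkippingNumIndexedCols); struct fields are just walked.
#[derive(Debug)]
enum Node {
    Leaf,
    Struct(Vec<(String, Node)>),
}

fn collect_leaves(node: &Node, path: &mut Vec<String>, out: &mut Vec<String>) {
    match node {
        Node::Struct(children) => {
            for (name, child) in children {
                path.push(name.clone());
                collect_leaves(child, path, out);
                path.pop();
            }
        }
        Node::Leaf => out.push(path.join(".")),
    }
}

fn main() {
    let schema = Node::Struct(vec![
        ("id".into(), Node::Leaf),
        ("user".into(), Node::Struct(vec![("name".into(), Node::Leaf)])),
    ]);
    let (mut path, mut out) = (Vec::new(), Vec::new());
    collect_leaves(&schema, &mut path, &mut out);
    assert_eq!(out, vec!["id".to_string(), "user.name".to_string()]);
}
```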

```diff
@@ -270,11 +264,7 @@ impl<'a> SchemaTransform<'a> for MinMaxStatsTransform {
     }
 
     fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option<Cow<'a, PrimitiveType>> {
-        if is_skipping_eligible_datatype(ptype) {
-            Some(Cow::Borrowed(ptype))
-        } else {
-            None
-        }
+        is_skipping_eligible_datatype(ptype).then_some(Cow::Borrowed(ptype))
     }
 }
 
```
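The refactor in this hunk uses the standard library's `bool::then_some`, which yields `Some(value)` when the receiver is `true` and `None` otherwise. A quick check of the equivalence:

```rust
// then_some(v) condenses the if/else that the old code spelled out.
fn main() {
    let eligible = true;
    let verbose = if eligible { Some("keep") } else { None };
    assert_eq!(verbose, eligible.then_some("keep"));
    assert_eq!(false.then_some("keep"), None);
}
```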

```diff
@@ -291,7 +281,11 @@ fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName])
 }
 
 /// Checks if a data type is eligible for min/max file skipping.
-/// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61
+///
+/// Note: Boolean and Binary are intentionally excluded as min/max statistics provide minimal
+/// skipping benefit for low-cardinality or opaque data types.
+///
+/// See: <https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61>
 #[allow(unused)]
 fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
     matches!(
@@ -306,7 +300,6 @@ fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool {
         | &PrimitiveType::Timestamp
         | &PrimitiveType::TimestampNtz
         | &PrimitiveType::String
-        // | &PrimitiveType::Boolean
         | PrimitiveType::Decimal(_)
     )
 }
```
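
To see why the excluded types add little value (as the new doc comment notes), consider a Boolean column: with only two possible values, almost every file's min/max pair is `(false, true)`, so a predicate can rarely rule a file out. A hedged toy check, not kernel code:

```rust
// Toy skipping check for the predicate `col = true`: a file can be skipped only when its max
// is false (no row can be true). Most files containing a Boolean column have max = true.
fn can_skip_eq_true(_min: bool, max: bool) -> bool {
    !max
}

fn main() {
    assert!(!can_skip_eq_true(false, true)); // mixed file: cannot be skipped
    assert!(can_skip_eq_true(false, false)); // all-false file: the rare skippable case
}
```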

kernel/src/table_configuration.rs

Lines changed: 26 additions & 6 deletions
```diff
@@ -139,17 +139,37 @@ impl TableConfiguration {
 
     /// Generates the expected schema for file statistics.
     ///
-    /// Engines can decide to provide statistics for files written to the delta table,
-    /// which enables data skipping and other optimizations. While it is not required to
-    /// provide statistics, it is strongly recommended. This method generates the expected
-    /// schema for statistics based on the table configuration. Often times the consfigration
-    /// is based on operator experience or automates systems as to what statistics are most
-    /// useful for a given table.
+    /// Engines can provide statistics for files written to the delta table, enabling
+    /// data skipping and other optimizations. This method generates the expected schema
+    /// for structured statistics based on the table configuration.
+    ///
+    /// The returned schema uses physical column names (respecting column mapping mode) and
+    /// is structured as:
+    /// ```text
+    /// {
+    ///   numRecords: long,
+    ///   nullCount: { <physical columns with LONG type> },
+    ///   minValues: { <physical columns with original types> },
+    ///   maxValues: { <physical columns with original types> },
+    /// }
+    /// ```
+    ///
+    /// The schema is affected by:
+    /// - **Column mapping mode**: Field names use physical names from column mapping metadata.
+    /// - **`delta.dataSkippingStatsColumns`**: If set, only specified columns are included.
+    /// - **`delta.dataSkippingNumIndexedCols`**: Otherwise, includes the first N leaf columns
+    ///   (default 32).
+    ///
+    /// See the Delta protocol for more details on per-file statistics:
+    /// <https://github.com/delta-io/delta/blob/master/PROTOCOL.md#per-file-statistics>
     #[allow(unused)]
     #[internal_api]
     pub(crate) fn expected_stats_schema(&self) -> DeltaResult<SchemaRef> {
         let partition_columns = self.metadata().partition_columns();
         let column_mapping_mode = self.column_mapping_mode();
+        // Partition columns are excluded because statistics are only collected for data columns
+        // that are physically stored in the parquet files. Partition values are stored in the
+        // file path, not in the file content, so they don't have file-level statistics.
         let physical_schema = StructType::try_new(
             self.schema()
                 .fields()
```
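
For context on the knobs named in the new doc comment, these are the Delta table properties involved. The snippet only illustrates the property keys with plausible values; it is not how `TableConfiguration` actually receives them.

```rust
use std::collections::HashMap;

fn main() {
    let mut props: HashMap<&str, &str> = HashMap::new();
    // Explicit list: takes precedence; only these leaf columns get min/max/nullCount stats.
    props.insert("delta.dataSkippingStatsColumns", "id, user.name");
    // Numeric limit: used only when no explicit list is set (the default is 32 leaf columns).
    props.insert("delta.dataSkippingNumIndexedCols", "8");
    // Column mapping mode determines whether physical column names appear in the stats schema.
    props.insert("delta.columnMapping.mode", "name");
    for (key, value) in &props {
        println!("{key} = {value}");
    }
}
```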
