
Commit 0e03b17

ps

Parent: 9619ed9

24 files changed: +380 / -93 lines

kernel/src/log_replay.rs

Lines changed: 7 additions & 2 deletions
@@ -310,13 +310,18 @@ pub(crate) trait LogReplayProcessor: Sized {
     ///
     /// # Parameters
     /// - `batch`: A reference to the batch of actions to be processed.
+    /// - `is_log_batch`: Whether this batch is from a commit log (`true`) or checkpoint (`false`).
     ///
     /// # Returns
     /// A `DeltaResult<Vec<bool>>`, where each boolean indicates if the corresponding row should be included.
     /// If no filter is provided, all rows are selected.
-    fn build_selection_vector(&self, batch: &dyn EngineData) -> DeltaResult<Vec<bool>> {
+    fn build_selection_vector(
+        &self,
+        batch: &dyn EngineData,
+        is_log_batch: bool,
+    ) -> DeltaResult<Vec<bool>> {
         match self.data_skipping_filter() {
-            Some(filter) => filter.apply(batch),
+            Some(filter) => filter.apply(batch, is_log_batch),
            None => Ok(vec![true; batch.len()]), // If no filter is provided, select all rows
        }
    }

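For orientation, here is a minimal self-contained sketch of the default-method pattern this hunk extends, using toy stand-ins (a hypothetical `Filter` over `u32` batches) rather than the kernel's `EngineData` and `DataSkippingFilter` types: the default implementation forwards the batch plus the new `is_log_batch` flag to the optional filter, and selects every row when no filter is configured.

// Toy stand-in for DataSkippingFilter; the even/odd rule is illustrative only.
struct Filter;

impl Filter {
    // Toy rule: commit batches keep only even values; checkpoint batches keep everything.
    fn apply(&self, batch: &[u32], is_log_batch: bool) -> Vec<bool> {
        batch.iter().map(|v| !is_log_batch || v % 2 == 0).collect()
    }
}

trait ProcessorSketch {
    fn data_skipping_filter(&self) -> Option<&Filter>;

    // Mirrors build_selection_vector: delegate when a filter exists, else select all rows.
    fn build_selection_vector(&self, batch: &[u32], is_log_batch: bool) -> Vec<bool> {
        match self.data_skipping_filter() {
            Some(filter) => filter.apply(batch, is_log_batch),
            None => vec![true; batch.len()],
        }
    }
}

struct WithFilter;
impl ProcessorSketch for WithFilter {
    fn data_skipping_filter(&self) -> Option<&Filter> {
        Some(&Filter)
    }
}

fn main() {
    // Commit batch (is_log_batch = true): the toy filter keeps only even values.
    assert_eq!(
        WithFilter.build_selection_vector(&[1, 2, 3], true),
        vec![false, true, false]
    );
    // Checkpoint batch (is_log_batch = false): the toy filter keeps everything.
    assert_eq!(WithFilter.build_selection_vector(&[1, 2, 3], false), vec![true; 3]);
}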
kernel/src/parallel/sequential_phase.rs

Lines changed: 16 additions & 3 deletions
@@ -201,10 +201,11 @@ impl<P: LogReplayProcessor> Iterator for SequentialPhase<P> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::actions::get_log_add_schema;
+    use crate::log_segment::CheckpointReadInfo;
     use crate::scan::log_replay::ScanLogReplayProcessor;
     use crate::scan::state_info::StateInfo;
     use crate::utils::test_utils::{assert_result_error_with_message, load_test_table};
-    use std::sync::Arc;
 
     /// Core helper function to verify sequential processing with expected adds and sidecars.
     fn verify_sequential_processing(
@@ -222,7 +223,13 @@ mod tests {
             (),
         )?);
 
-        let processor = ScanLogReplayProcessor::new(engine.as_ref(), state_info)?;
+        // Use base log add schema for tests - no stats_parsed optimization
+        let checkpoint_info = CheckpointReadInfo {
+            has_stats_parsed: false,
+            checkpoint_read_schema: get_log_add_schema().clone(),
+        };
+
+        let processor = ScanLogReplayProcessor::new(engine.as_ref(), state_info, checkpoint_info)?;
         let mut sequential = SequentialPhase::try_new(processor, log_segment, engine.clone())?;
 
         // Process all batches and collect Add file paths
@@ -313,7 +320,13 @@ mod tests {
             (),
         )?);
 
-        let processor = ScanLogReplayProcessor::new(engine.as_ref(), state_info)?;
+        // Use base log add schema for tests - no stats_parsed optimization
+        let checkpoint_info = CheckpointReadInfo {
+            has_stats_parsed: false,
+            checkpoint_read_schema: get_log_add_schema().clone(),
+        };
+
+        let processor = ScanLogReplayProcessor::new(engine.as_ref(), state_info, checkpoint_info)?;
         let mut sequential = SequentialPhase::try_new(processor, log_segment, engine.clone())?;
 
         // Call next() once but don't exhaust the iterator

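For reference, the field shape that `CheckpointReadInfo` must expose can be inferred from the test usage above; the following is only a sketch of that inferred shape (the real definition lives in kernel/src/log_segment.rs and may carry additional fields or derives):

use crate::schema::SchemaRef;

/// Sketch of `CheckpointReadInfo` as implied by these tests.
pub(crate) struct CheckpointReadInfo {
    /// True when the checkpoint carries a compatible pre-parsed `stats_parsed` column.
    pub(crate) has_stats_parsed: bool,
    /// Schema used to read checkpoint files; the base log add schema when the
    /// stats_parsed optimization does not apply.
    pub(crate) checkpoint_read_schema: SchemaRef,
}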
kernel/src/scan/data_skipping.rs

Lines changed: 78 additions & 41 deletions
@@ -3,7 +3,6 @@ use std::sync::{Arc, LazyLock};
 
 use tracing::{debug, error};
 
-use crate::actions::get_log_add_schema;
 use crate::actions::visitors::SelectionVectorVisitor;
 use crate::error::DeltaResult;
 use crate::expressions::{
@@ -14,14 +13,13 @@ use crate::expressions::{
 use crate::kernel_predicates::{
     DataSkippingPredicateEvaluator, KernelPredicateEvaluator, KernelPredicateEvaluatorDefaults,
 };
-use crate::schema::{DataType, SchemaRef, SchemaTransform, StructField, StructType};
+use crate::schema::{DataType, SchemaRef};
 use crate::{
     Engine, EngineData, ExpressionEvaluator, JsonHandler, PredicateEvaluator, RowVisitor as _,
 };
 
 pub(crate) mod stats_schema;
 
-use stats_schema::{NullCountStatsTransform, NullableStatsTransform};
 #[cfg(test)]
 mod tests;
 
@@ -55,46 +53,50 @@ fn as_sql_data_skipping_predicate(pred: &Pred) -> Option<Pred> {
 pub(crate) struct DataSkippingFilter {
     stats_schema: SchemaRef,
     select_stats_evaluator: Arc<dyn ExpressionEvaluator>,
+    /// Evaluator for extracting stats_parsed from checkpoints.
+    /// Only present when the checkpoint has compatible pre-parsed stats.
+    select_stats_parsed_evaluator: Option<Arc<dyn ExpressionEvaluator>>,
     skipping_evaluator: Arc<dyn PredicateEvaluator>,
     filter_evaluator: Arc<dyn PredicateEvaluator>,
     json_handler: Arc<dyn JsonHandler>,
 }
 
 impl DataSkippingFilter {
-    /// Creates a new data skipping filter. Returns None if there is no predicate, or the predicate
-    /// is ineligible for data skipping.
+    /// Creates a new data skipping filter. Returns None if there is no predicate/stats_schema,
+    /// or the predicate is ineligible for data skipping.
     ///
     /// NOTE: None is equivalent to a trivial filter that always returns TRUE (= keeps all files),
     /// but using an Option lets the engine easily avoid the overhead of applying trivial filters.
+    ///
+    /// `checkpoint_read_schema` is the schema used to read checkpoint files, which includes
+    /// `stats_parsed` for data skipping optimization. This schema is needed to create the
+    /// evaluator that extracts stats_parsed from the actions.
+    ///
+    /// `has_compatible_stats_parsed` indicates whether the checkpoint has compatible pre-parsed
+    /// stats. When true, checkpoint batches use stats_parsed directly instead of parsing JSON.
     pub(crate) fn new(
         engine: &dyn Engine,
-        physical_predicate: Option<(PredicateRef, SchemaRef)>,
+        predicate: Option<PredicateRef>,
+        stats_schema: Option<SchemaRef>,
+        checkpoint_read_schema: SchemaRef,
+        has_compatible_stats_parsed: bool,
     ) -> Option<Self> {
         static STATS_EXPR: LazyLock<ExpressionRef> =
             LazyLock::new(|| Arc::new(column_expr!("add.stats")));
+        static STATS_PARSED_EXPR: LazyLock<ExpressionRef> =
+            LazyLock::new(|| Arc::new(column_expr!("add.stats_parsed")));
         static FILTER_PRED: LazyLock<PredicateRef> =
             LazyLock::new(|| Arc::new(column_expr!("output").distinct(Expr::literal(false))));
 
-        let (predicate, referenced_schema) = physical_predicate?;
+        let predicate = predicate?;
+        let stats_schema = stats_schema?;
         debug!("Creating a data skipping filter for {:#?}", predicate);
 
-        let stats_schema = NullableStatsTransform
-            .transform_struct(&referenced_schema)?
-            .into_owned();
-
-        let nullcount_schema = NullCountStatsTransform
-            .transform_struct(&stats_schema)?
-            .into_owned();
-        let stats_schema = Arc::new(StructType::new_unchecked([
-            StructField::nullable("numRecords", DataType::LONG),
-            StructField::nullable("nullCount", nullcount_schema),
-            StructField::nullable("minValues", stats_schema.clone()),
-            StructField::nullable("maxValues", stats_schema),
-        ]));
-
         // Skipping happens in several steps:
         //
-        // 1. The stats selector fetches add.stats from the metadata
+        // 1. The stats selector fetches add.stats or add.stats_parsed from the metadata.
+        //    For checkpoint batches with compatible stats_parsed, we use stats_parsed directly.
+        //    Otherwise, we parse add.stats (JSON string) to a stats struct.
         //
         // 2. The predicate (skipping evaluator) produces false for any file whose stats prove we
         //    can safely skip it. A value of true means the stats say we must keep the file, and
@@ -106,7 +108,7 @@ impl DataSkippingFilter {
         let select_stats_evaluator = engine
             .evaluation_handler()
             .new_expression_evaluator(
-                get_log_add_schema().clone(),
+                checkpoint_read_schema.clone(),
                 STATS_EXPR.clone(),
                 DataType::STRING,
             )
@@ -115,6 +117,23 @@ impl DataSkippingFilter {
             .inspect_err(|e| error!("Failed to create select stats evaluator: {e}"))
             .ok()?;
 
+        // Only create stats_parsed evaluator when checkpoint has compatible pre-parsed stats
+        let select_stats_parsed_evaluator = if has_compatible_stats_parsed {
+            engine
+                .evaluation_handler()
+                .new_expression_evaluator(
+                    checkpoint_read_schema.clone(),
+                    STATS_PARSED_EXPR.clone(),
+                    DataType::Struct(Box::new(stats_schema.as_ref().clone())),
+                )
+                .inspect_err(|e| {
+                    debug!("stats_parsed evaluator not available (falling back to JSON): {e}")
+                })
+                .ok()
+        } else {
+            None
+        };
+
         let skipping_evaluator = engine
             .evaluation_handler()
             .new_predicate_evaluator(
@@ -137,6 +156,7 @@ impl DataSkippingFilter {
         Some(Self {
             stats_schema,
             select_stats_evaluator,
+            select_stats_parsed_evaluator,
            skipping_evaluator,
            filter_evaluator,
            json_handler: engine.json_handler(),
@@ -145,34 +165,51 @@ impl DataSkippingFilter {
 
     /// Apply the DataSkippingFilter to an EngineData batch of actions. Returns a selection vector
     /// which can be applied to the actions to find those that passed data skipping.
-    pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult<Vec<bool>> {
-        // retrieve and parse stats from actions data
-        let stats = self.select_stats_evaluator.evaluate(actions)?;
-        assert_eq!(stats.len(), actions.len());
-        let parsed_stats = self
-            .json_handler
-            .parse_json(stats, self.stats_schema.clone())?;
-        assert_eq!(parsed_stats.len(), actions.len());
+    ///
+    /// `is_log_batch` indicates whether this batch is from a commit log (`true`) or checkpoint (`false`).
+    /// Checkpoint batches may have pre-parsed stats (`stats_parsed`) that can be used directly
+    /// instead of parsing JSON. Commit batches only have JSON stats.
+    pub(crate) fn apply(
+        &self,
+        actions: &dyn EngineData,
+        is_log_batch: bool,
+    ) -> DeltaResult<Vec<bool>> {
+        // Get the final stats by either:
+        // 1. Using stats_parsed directly (for checkpoint batches with compatible stats)
+        // 2. Parsing JSON (for commit batches or when stats_parsed unavailable)
+        let final_stats = if let Some(stats_parsed_evaluator) = (!is_log_batch)
+            .then_some(())
+            .and(self.select_stats_parsed_evaluator.as_ref())
+        {
+            // Checkpoint batch with compatible stats_parsed - use it directly
+            let stats_parsed = stats_parsed_evaluator.evaluate(actions)?;
+            debug!(
+                "Using stats_parsed from checkpoint ({} rows)",
+                stats_parsed.len()
+            );
+            stats_parsed
+        } else {
+            // Commit batch or no stats_parsed evaluator - parse JSON
+            let stats_json = self.select_stats_evaluator.evaluate(actions)?;
+            assert_eq!(stats_json.len(), actions.len());
+            self.json_handler
+                .parse_json(stats_json, self.stats_schema.clone())?
+        };
 
-        // evaluate the predicate on the parsed stats, then convert to selection vector
-        let skipping_predicate = self.skipping_evaluator.evaluate(&*parsed_stats)?;
+        assert_eq!(final_stats.len(), actions.len());
+
+        // Evaluate predicate on the stats
+        let skipping_predicate = self.skipping_evaluator.evaluate(&*final_stats)?;
         assert_eq!(skipping_predicate.len(), actions.len());
         let selection_vector = self
             .filter_evaluator
             .evaluate(skipping_predicate.as_ref())?;
         assert_eq!(selection_vector.len(), actions.len());
 
-        // visit the engine's selection vector to produce a Vec<bool>
+        // Visit the engine's selection vector to produce a Vec<bool>
         let mut visitor = SelectionVectorVisitor::default();
         visitor.visit_rows_of(selection_vector.as_ref())?;
         Ok(visitor.selection_vector)
-
-        // TODO(zach): add some debug info about data skipping that occurred
-        // let before_count = actions.length();
-        // debug!(
-        //     "number of actions before/after data skipping: {before_count} / {}",
-        //     filtered_actions.num_rows()
-        // );
     }
 }

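The `(!is_log_batch).then_some(()).and(...)` dispatch in `apply` is terse; the self-contained toy below (a hypothetical `use_stats_parsed` helper, not kernel code) spells out the truth table it implements: the stats_parsed fast path is taken only when the batch is a checkpoint batch AND the evaluator was successfully built, and every other case falls back to JSON parsing.

fn use_stats_parsed(is_log_batch: bool, evaluator: Option<&str>) -> bool {
    // Mirrors the kernel's (!is_log_batch).then_some(()).and(evaluator) pattern:
    // Option<()> gates on the batch kind, then `and` requires the evaluator too.
    (!is_log_batch).then_some(()).and(evaluator).is_some()
}

fn main() {
    assert!(!use_stats_parsed(true, Some("ev"))); // commit batch: always parse JSON stats
    assert!(use_stats_parsed(false, Some("ev"))); // checkpoint + evaluator: use stats_parsed
    assert!(!use_stats_parsed(false, None)); // checkpoint without evaluator: JSON fallback
}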
kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 4 additions & 0 deletions
@@ -175,13 +175,17 @@ impl<'a> SchemaTransform<'a> for NullableStatsTransform {
     }
 }
 
+<<<<<<< HEAD
 /// Converts a stats schema into a nullCount schema where all leaf fields become LONG.
 ///
 /// The nullCount struct field tracks the number of null values for each column.
 /// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type
 /// since null counts are always integers, while struct fields are recursed into
 /// to preserve the nested structure.
 #[allow(unused)]
+=======
+/// Transforms a schema to convert all primitive fields to LONG (for null count stats).
+>>>>>>> 53871208 (ps)
 pub(crate) struct NullCountStatsTransform;
 impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
     fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {

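As a worked illustration of the doc comment carried in this hunk (a toy type model, not the kernel's `SchemaTransform` machinery): a nullCount schema preserves struct nesting but turns every leaf field into a LONG, since null counts are always integers.

#[derive(Debug, PartialEq)]
enum ToyType {
    Long,
    String,
    Struct(Vec<(String, ToyType)>),
}

// Leaves become Long (null counts are integers); structs are recursed into.
fn to_null_count(t: &ToyType) -> ToyType {
    match t {
        ToyType::Struct(fields) => ToyType::Struct(
            fields
                .iter()
                .map(|(name, f)| (name.clone(), to_null_count(f)))
                .collect(),
        ),
        _ => ToyType::Long,
    }
}

fn main() {
    let stats = ToyType::Struct(vec![
        ("id".into(), ToyType::Long),
        ("user".into(), ToyType::Struct(vec![("name".into(), ToyType::String)])),
    ]);
    // Both leaves are now Long; the nested struct shape is preserved.
    assert_eq!(
        to_null_count(&stats),
        ToyType::Struct(vec![
            ("id".into(), ToyType::Long),
            ("user".into(), ToyType::Struct(vec![("name".into(), ToyType::Long)])),
        ])
    );
}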