From 435302e42e2f2776435b915698af1e6cd0ca7339 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:03:21 -0700 Subject: [PATCH 01/45] introduce visitors --- kernel/src/actions/visitors.rs | 524 +++++++++++++++++++++++++++++++-- kernel/src/scan/log_replay.rs | 10 +- 2 files changed, 510 insertions(+), 24 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 36a2c7faf..9eef22ed5 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,10 +1,12 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -483,6 +485,270 @@ impl RowVisitor for SidecarVisitor { } } +/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and +/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once +/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) +/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove +/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater +/// than the minimum file retention timestamp). +struct CheckpointFileActionsVisitor<'seen> { + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + total_actions: usize, + total_add_actions: usize, + minimum_file_retention_timestamp: i64, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + /// + /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// A remove action includes a timestamp indicating when the deletion occurred. 
Physical files + /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to + /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until + /// it expires, which happens when the current time exceeds the removal timestamp plus the + /// expiration threshold. + fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { + // Ideally this should never be zero, but we are following the same behavior as Delta + // Spark and the Java Kernel. + let mut deletion_timestamp: i64 = 0; + if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { + deletion_timestamp = ts; + } + + Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) + } + + /// Returns true if the row contains a valid file action to be included in the checkpoint. + fn is_valid_file_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Add will have a path at index 0 if it is valid; otherwise we may + // have a remove with a path at index 4. In either case, extract the three dv getters at + // indexes that immediately follow a valid path index. + let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { + (path, &getters[1..4], true) + } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { + (path, &getters[6..9], false) + } else { + return Ok(false); + }; + + let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, + dv_getters[2].get_opt(i, "deletionVector.offset")?, + )), + None => None, + }; + + // Check both adds and removes (skipping already-seen) + let file_key = FileActionKey::new(path, dv_unique_id); + if self.check_and_record_seen(file_key) { + return Ok(false); + } + + // Ignore expired tombstones. + if !is_add && self.is_expired_tombstone(i, getters[5])? { + return Ok(false); + } + + if is_add { + self.total_add_actions += 1; + } + + Ok(true) + } +} + +impl RowVisitor for CheckpointFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. ADD + // 2. 
REMOVE + static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 9, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_file_action(i, getters)?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + +/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions +/// in newest-first order, we only keep the first occurrence of: +/// - a protocol action, +/// - a metadata action, +/// - a transaction (txn) action for a given app ID. +/// +/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions +/// have been seen and includes only the first occurrence of each in the selection vector. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { + // Non-file actions state + pub(crate) seen_protocol: bool, + pub(crate) seen_metadata: bool, + pub(crate) seen_txns: &'seen mut HashSet, + pub(crate) selection_vector: Vec, + pub(crate) total_actions: usize, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointNonFileActionsVisitor<'_> { + /// Returns true if the row contains a protocol action, and we haven’t seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven’t seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + Ok(self.seen_txns.insert(app_id.to_string())) + } +} + +impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. METADATA + // 2. PROTOCOL + // 3. 
TXN + static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 3, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_metadata_action(i, getters[0])? + || self.is_valid_protocol_action(i, getters[1])? + || self.is_valid_txn_action(i, getters[2])?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -537,11 +803,13 @@ mod tests { let handler = SyncJsonHandler {}; let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); let output_schema = get_log_schema().clone(); @@ -551,6 +819,18 @@ mod tests { ArrowEngineData::try_from_engine_data(parsed).unwrap() } + fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = 
engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json( + string_array_to_engine_data(json_strings.into()), + output_schema, + ) + .unwrap() + } + #[test] fn test_parse_protocol() -> DeltaResult<()> { let data = action_batch(); @@ -639,8 +919,6 @@ mod tests { #[test] fn test_parse_add_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -650,10 +928,7 @@ mod tests { r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut add_visitor = AddVisitor::default(); add_visitor.visit_rows_of(batch.as_ref()).unwrap(); let add1 = Add { @@ -697,18 +972,13 @@ mod tests { #[test] fn test_parse_remove_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut remove_visitor = RemoveVisitor::default(); remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); let expected_remove = Remove { @@ -736,8 +1006,6 @@ mod tests { #[test] fn test_parse_txn() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -747,10 +1015,7 @@ mod tests { r#"{"txn":{"appId":"myApp2","version": 4, 
"lastUpdated": 1670892998177}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut txn_visitor = SetTransactionVisitor::default(); txn_visitor.visit_rows_of(batch.as_ref()).unwrap(); let mut actual = txn_visitor.set_transactions; @@ -771,4 +1036,225 @@ mod tests { }) ); } + + #[test] + fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 8], // 8 rows in the action batch + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, // No tombstones are expired + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![true, true, false, false, false, false, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 4], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 100, // Threshold set to 100 + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> + { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: true, // Log batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // First one should be included, second one skipped as a duplicate + let expected = vec![true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + 
} + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: false, // Checkpoint batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // Both should be included since we don't track duplicates in checkpoint batches + let expected = vec![true, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 3], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, false]; // Third one is a duplicate + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 8], + total_actions: 0, + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![false, false, false, true, true, false, false, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_metadata, true); + assert_eq!(visitor.seen_protocol, true); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { + let json_strings: StringArray = + vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + let batch = 
parse_json_batch(json_strings); + + // Pre-populate with app1 + let mut seen_txns = HashSet::new(); + seen_txns.insert("app1".to_string()); + + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut seen_txns, + selection_vector: vec![false; 1], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false]; // Transaction should be skipped as it's already seen + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction + assert_eq!(visitor.total_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + // Set protocol and metadata as already seen + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 2], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false]; // Both should be skipped + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_actions, 0); + Ok(()) + } } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f..b0d3ea8f0 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -19,12 +19,12 @@ use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication /// of adds and removes during log replay. #[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, } impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); Self { path, dv_unique_id } } @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. 
For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From e500a107abe4c818bd1c451a70bf965124857f05 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:04:35 -0700 Subject: [PATCH 02/45] remove pub --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b0d3ea8f0..dbcd056df 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From 19733cd003eb7a72d962f9cf1d1556e26d2f7f77 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:28:59 -0700 Subject: [PATCH 03/45] assert! instead of assert_eq with bool --- kernel/src/actions/visitors.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9eef22ed5..3ade3d914 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -824,10 +824,7 @@ mod tests { let json_handler = engine.get_json_handler(); let output_schema = get_log_schema().clone(); json_handler - .parse_json( - string_array_to_engine_data(json_strings.into()), - output_schema, - ) + .parse_json(string_array_to_engine_data(json_strings), output_schema) .unwrap() } @@ -1197,8 +1194,8 @@ mod tests { let expected = vec![false, false, false, true, true, false, false, true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_metadata, true); - assert_eq!(visitor.seen_protocol, true); + assert!(visitor.seen_metadata); + assert!(visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) From 87c9f31f97a0d7a22e07c337b6c92ee9945c19df Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 13:22:09 -0700 Subject: [PATCH 04/45] log replay for checkpoints --- kernel/src/actions/visitors.rs | 116 +++++--------- kernel/src/checkpoints/log_replay.rs | 229 +++++++++++++++++++++++++++ kernel/src/checkpoints/mod.rs | 1 + kernel/src/lib.rs | 1 + kernel/src/path.rs | 17 ++ kernel/src/utils.rs | 25 ++- 6 files changed, 315 insertions(+), 74 deletions(-) create mode 100644 kernel/src/checkpoints/log_replay.rs create mode 100644 kernel/src/checkpoints/mod.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d914..e0e622b05 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -491,13 +491,13 @@ impl RowVisitor for SidecarVisitor { /// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). 
-struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - total_actions: usize, - total_add_actions: usize, - minimum_file_retention_timestamp: i64, +pub(crate) struct CheckpointFileActionsVisitor<'seen> { + pub(crate) seen_file_keys: &'seen mut HashSet, + pub(crate) selection_vector: &'seen mut Vec, + pub(crate) is_log_batch: bool, + pub(crate) total_actions: usize, + pub(crate) total_add_actions: usize, + pub(crate) minimum_file_retention_timestamp: i64, } #[allow(unused)] // TODO: Remove flag once used for checkpoint writing @@ -653,10 +653,10 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, + pub(crate) seen_protocol: &'seen mut bool, + pub(crate) seen_metadata: &'seen mut bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, + pub(crate) selection_vector: &'seen mut Vec, pub(crate) total_actions: usize, } @@ -668,8 +668,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !*self.seen_protocol { + *self.seen_protocol = true; Ok(true) } else { Ok(false) @@ -682,8 +682,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; + if getter.get_str(i, "metaData.id")?.is_some() && !*self.seen_metadata { + *self.seen_metadata = true; Ok(true) } else { Ok(false) @@ -777,30 +777,13 @@ pub(crate) fn visit_deletion_vector_at<'a>( #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::parse_json_batch; + use crate::EngineData; use super::*; - use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, - }; - - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, 
r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,20 +795,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - fn parse_json_batch(json_strings: StringArray) -> Box { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); - let output_schema = get_log_schema().clone(); - json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap() + parse_json_batch(json_strings) } #[test] @@ -1039,7 +1009,7 @@ mod tests { let data = action_batch(); let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: &mut vec![false; 8], // 8 rows in the action batch is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1049,7 +1019,7 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); @@ -1070,7 +1040,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], + selection_vector: &mut vec![false; 4], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1080,7 +1050,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); @@ -1099,7 +1069,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: true, // Log batch total_actions: 0, total_add_actions: 0, @@ -1110,7 +1080,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 1); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); @@ -1130,7 +1100,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: false, // Checkpoint batch total_actions: 0, total_add_actions: 0, @@ -1141,7 +1111,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches assert_eq!(visitor.total_actions, 2); 
assert_eq!(visitor.total_add_actions, 2); @@ -1162,7 +1132,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], + selection_vector: &mut vec![false; 3], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1172,7 +1142,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); @@ -1183,19 +1153,19 @@ mod tests { fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { let data = action_batch(); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], + selection_vector: &mut vec![false; 8], total_actions: 0, }; visitor.visit_rows_of(data.as_ref())?; let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); - assert!(visitor.seen_protocol); + assert_eq!(*visitor.selection_vector, expected); + assert!(*visitor.seen_metadata); + assert!(*visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) @@ -1212,17 +1182,17 @@ mod tests { seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: &mut vec![false; 1], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false]; // Transaction should be skipped as it's already seen - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); Ok(()) @@ -1240,17 +1210,17 @@ mod tests { // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen + seen_protocol: &mut true, // Already seen + seen_metadata: &mut true, // Already seen seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false]; // Both should be skipped - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.total_actions, 0); Ok(()) } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs new file mode 100644 index 000000000..a632fd336 --- /dev/null +++ b/kernel/src/checkpoints/log_replay.rs @@ -0,0 +1,229 @@ +use std::collections::HashSet; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; +use crate::engine_data::RowVisitor; +use crate::scan::log_replay::FileActionKey; +use crate::{DeltaResult, EngineData}; + +/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// replay to include only those that 
should be included in a V1 checkpoint. +struct LogReplayForCheckpoints { + /// Tracks file actions that have been seen during log replay to avoid duplicates. + /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. + seen_file_keys: HashSet, + + /// Counter for the total number of actions processed during log replay. + total_actions: Arc, + + /// Counter for the total number of add actions processed during log replay. + total_add_actions: Arc, + + /// Indicates whether a protocol action has been seen in the log. + seen_protocol: bool, + + /// Indicates whether a metadata action has been seen in the log. + seen_metadata: bool, + + /// Set of transaction app IDs that have been processed to avoid duplicates. + seen_txns: HashSet, + + /// Minimum timestamp for file retention, used for filtering expired tombstones. + minimum_file_retention_timestamp: i64, +} + +impl LogReplayForCheckpoints { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } + } + + /// Iterates over actions and filters them for inclusion in a V1 checkpoint. + /// + /// This function processes batches of actions in reverse chronological order + /// (from most recent to least recent) and performs the necessary filtering + /// to ensure the checkpoint contains only the actions needed to reconstruct + /// the complete state of the table. + /// + /// # Filtering Rules + /// + /// The following rules apply when filtering actions: + /// + /// 1. Only the most recent protocol and metadata actions are included + /// 2. For each app ID, only the most recent transaction action is included + /// 3. File actions are deduplicated based on path and unique ID + /// 4. 
Tombstones older than `minimum_file_retention_timestamp` are excluded + pub(super) fn process_v1_checkpoint_batch( + &mut self, + actions: Box, + is_log_batch: bool, + ) -> DeltaResult<(Box, Vec)> { + // Initialize selection vector with all rows un-selected + let mut selection_vector = vec![false; actions.len()]; + assert_eq!( + selection_vector.len(), + actions.len(), + "Initial selection vector length does not match actions length" + ); + + // Create the non file actions visitor to process non file actions and update selection vector + let mut non_file_actions_visitor = CheckpointNonFileActionsVisitor { + seen_protocol: &mut self.seen_protocol, + seen_metadata: &mut self.seen_metadata, + seen_txns: &mut self.seen_txns, + selection_vector: &mut selection_vector, + total_actions: 0, + }; + + // Process actions and let visitor update selection vector + non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with non-file action counts from this batch + self.total_actions + .fetch_add(non_file_actions_visitor.total_actions, Ordering::Relaxed); + + // Create the file actions visitor to process file actions and update selection vector + let mut file_actions_visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut self.seen_file_keys, + is_log_batch, + selection_vector: &mut selection_vector, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: self.minimum_file_retention_timestamp, + }; + + // Process actions and let visitor update selection vector + file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with file action counts from this batch + self.total_actions + .fetch_add(file_actions_visitor.total_actions, Ordering::Relaxed); + self.total_add_actions + .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); + + Ok((actions, selection_vector)) + } +} + +/// Given an iterator of (engine_data, bool) tuples, returns an iterator of +/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ +/// be written to the V1 checkpoint file in order to capture the table version's complete state. +/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch +/// is a log or checkpoint batch. +/// +/// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in +/// the log from most recent to least recent. 
+pub(crate) fn v1_checkpoint_actions_iter( + action_iter: impl Iterator, bool)>> + Send + 'static, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, +) -> impl Iterator, Vec)>> + Send + 'static { + let mut log_scanner = LogReplayForCheckpoints::new( + total_actions_counter, + total_add_actions_counter, + minimum_file_retention_timestamp, + ); + + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + }) + // Only yield batches that have at least one selected row + .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use crate::arrow::array::StringArray; + use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::utils::test_utils::parse_json_batch; + use crate::DeltaResult; + + /// Tests the end-to-end processing of multiple batches with various action types + /// This tests the integration of the visitors with the main iterator function. + /// More granular testing is performed in the individual visitor tests. + #[test] + fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { + // Setup counters + let total_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + + // Create first batch with protocol, metadata, and some files + let json_strings1: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + // Create second batch with some duplicates and new files + let json_strings2: StringArray = vec![ + // Protocol and metadata should be skipped as duplicates + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + // New files + r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, + // Duplicate file should be skipped + 
r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# + ].into(); + + // Create third batch with all duplicate actions (should be filtered out completely) + let json_strings3: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + let input_batches = vec![ + Ok((parse_json_batch(json_strings1), true)), + Ok((parse_json_batch(json_strings2), true)), + Ok((parse_json_batch(json_strings3), true)), + ]; + + // Run the iterator + let results: Vec<_> = v1_checkpoint_actions_iter( + input_batches.into_iter(), + total_actions_counter.clone(), + total_add_actions_counter.clone(), + 0, + ) + .collect::, _>>()?; + + // Expect two batches in results (third batch should be filtered)" + assert_eq!(results.len(), 2); + + // First batch should have all rows selected + let (_, selection_vector1) = &results[0]; + assert_eq!(selection_vector1, &vec![true, true, true, true]); + + // Second batch should have only new file and transaction selected + let (_, selection_vector2) = &results[1]; + assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + + // Verify counters + // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); + + // 3 add actions (2 from batch1 + 1 from batch2) + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + + Ok(()) + } +} diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs new file mode 100644 index 000000000..826ff771f --- /dev/null +++ b/kernel/src/checkpoints/mod.rs @@ -0,0 +1 @@ +pub mod log_replay; diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 65a0a6ab5..bf2476921 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,6 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; +pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; diff --git a/kernel/src/path.rs b/kernel/src/path.rs index df372f08e..f9988cc8a 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,6 +196,23 @@ impl ParsedLogPath { } Ok(path) } + + /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version + pub(crate) fn new_v1_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + let filename = format!("{:020}.checkpoint.parquet", version); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? 
+ .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_commit created a non-checkpoint path", + )); + } + Ok(path) + } } #[cfg(test)] diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index fd2db2501..7713e042a 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -22,11 +22,15 @@ pub(crate) mod test_utils { use tempfile::TempDir; use test_utils::delta_path_for_version; + use crate::actions::get_log_schema; + use crate::arrow::array::StringArray; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::engine::sync::SyncEngine; use crate::{ actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, engine::arrow_data::ArrowEngineData, - EngineData, }; + use crate::{Engine, EngineData}; #[derive(Serialize)] pub(crate) enum Action { @@ -97,4 +101,23 @@ pub(crate) mod test_utils { pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { assert_eq!(into_record_batch(actual), into_record_batch(expected)); } + + /// Converts a `StringArray` to an `EngineData` object + pub(crate) fn string_array_to_engine_data(string_array: StringArray) -> Box { + let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); + let schema = Arc::new(ArrowSchema::new(vec![string_field])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) + .expect("Can't convert to record batch"); + Box::new(ArrowEngineData::new(batch)) + } + + /// Parses a batch of JSON strings into an `EngineData` object + pub(crate) fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + } } From db5ccd05ba8be030fd8941d4b0025fcbe1372d49 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:29:15 -0700 Subject: [PATCH 05/45] rename & some clean up --- kernel/src/checkpoints/log_replay.rs | 17 ++++++++++------- kernel/src/engine/arrow_data.rs | 19 +++---------------- kernel/src/engine/default/json.rs | 9 +-------- kernel/src/scan/mod.rs | 15 +++------------ 4 files changed, 17 insertions(+), 43 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index a632fd336..4bd6c3448 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -7,9 +7,10 @@ use crate::engine_data::RowVisitor; use crate::scan::log_replay::FileActionKey; use crate::{DeltaResult, EngineData}; -/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. -struct LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +struct V1CheckpointLogReplayScanner { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. 
seen_file_keys: HashSet, @@ -33,7 +34,8 @@ struct LogReplayForCheckpoints { minimum_file_retention_timestamp: i64, } -impl LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl V1CheckpointLogReplayScanner { pub(super) fn new( total_actions_counter: Arc, total_add_actions_counter: Arc, @@ -65,7 +67,7 @@ impl LogReplayForCheckpoints { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn process_v1_checkpoint_batch( + pub(super) fn filter_v1_checkpoint_actions( &mut self, actions: Box, is_log_batch: bool, @@ -125,13 +127,14 @@ impl LogReplayForCheckpoints { /// /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// the log from most recent to least recent. +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn v1_checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = LogReplayForCheckpoints::new( + let mut log_scanner = V1CheckpointLogReplayScanner::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -140,7 +143,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -156,7 +159,7 @@ mod tests { use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; - /// Tests the end-to-end processing of multiple batches with various action types + /// Tests the end-to-end processing of multiple batches with various action types. /// This tests the integration of the visitors with the main iterator function. /// More granular testing is performed in the individual visitor tests. 
#[test] diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 988380901..b09b27ff9 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -294,27 +294,14 @@ impl ArrowEngineData { #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use crate::{ actions::{get_log_schema, Metadata, Protocol}, engine::sync::SyncEngine, - DeltaResult, Engine, EngineData, + DeltaResult, Engine, }; - use super::ArrowEngineData; - - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_md_extract() -> DeltaResult<()> { let engine = SyncEngine::new(); diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 98a9b0dc7..8b401a3d4 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -257,6 +257,7 @@ mod tests { use crate::engine::default::executor::tokio::{ TokioBackgroundExecutor, TokioMultiThreadExecutor, }; + use crate::utils::test_utils::string_array_to_engine_data; use futures::future; use itertools::Itertools; use object_store::local::LocalFileSystem; @@ -471,14 +472,6 @@ mod tests { } } - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_parse_json() { let store = Arc::new(LocalFileSystem::new()); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index ccdff3d66..689a6eab3 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -663,8 +663,8 @@ pub fn selection_vector( // some utils that are used in file_stream.rs and state.rs tests #[cfg(test)] pub(crate) mod test_utils { - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use itertools::Itertools; use std::sync::Arc; @@ -676,20 +676,11 @@ pub(crate) mod test_utils { }, scan::log_replay::scan_action_iter, schema::SchemaRef, - EngineData, JsonHandler, + JsonHandler, }; use super::{state::ScanCallback, Transform}; - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - // Generates a batch of sidecar actions with the given paths. // The schema is provided as null columns affect equality checks. 
pub(crate) fn sidecar_batch_with_given_paths( From 42c08c1f439a5d20adcba1f56df74b3e65b469ec Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:43:25 -0700 Subject: [PATCH 06/45] remove new path for now --- kernel/src/path.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/src/path.rs b/kernel/src/path.rs index f9988cc8a..df372f08e 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,23 +196,6 @@ impl ParsedLogPath { } Ok(path) } - - /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version - pub(crate) fn new_v1_checkpoint( - table_root: &Url, - version: Version, - ) -> DeltaResult> { - let filename = format!("{:020}.checkpoint.parquet", version); - let location = table_root.join("_delta_log/")?.join(&filename)?; - let path = Self::try_from(location)? - .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; - if !path.is_checkpoint() { - return Err(Error::internal_error( - "ParsedLogPath::new_commit created a non-checkpoint path", - )); - } - Ok(path) - } } #[cfg(test)] From f91baebe5af22c4c01a7529fdf9967ffa04c510f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 21 Mar 2025 19:33:35 -0700 Subject: [PATCH 07/45] merge non file action visitor tests --- kernel/src/actions/visitors.rs | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d914..150beffe6 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -784,10 +784,8 @@ mod tests { use super::*; use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, + actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, + Engine, EngineData, }; // TODO(nick): Merge all copies of this into one "test utils" thing @@ -799,8 +797,7 @@ mod tests { Box::new(ArrowEngineData::new(batch)) } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,11 +809,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() + parse_json_batch(json_strings) } fn parse_json_batch(json_strings: StringArray) -> Box { @@ -1202,26 +1195,30 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { - let json_strings: StringArray = - vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + fn 
test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); - // Pre-populate with app1 + // Pre-populate with txn app1 let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: vec![false; 3], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false]; // Transaction should be skipped as it's already seen + // All actions should be skipped as they have already been seen + let expected = vec![false; 3]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); @@ -1229,29 +1226,32 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( - ) -> DeltaResult<()> { + fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), // Empty set + selection_vector: vec![false; 
6], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false]; // Both should be skipped + let expected = vec![true, false, true, false, true, false]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_actions, 0); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); Ok(()) } } From 9fdfba70f63371a72bc624f7228e6f92f7760ab6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:22:56 -0700 Subject: [PATCH 08/45] mvp for refactor --- kernel/src/actions/visitors.rs | 80 ++++-------- kernel/src/checkpoints/log_replay.rs | 78 ++++++----- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 154 ++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 188 ++++++++++++--------------- 5 files changed, 311 insertions(+), 190 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index e0e622b05..c348c92e2 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -3,10 +3,9 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, FileActionVisitor}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -500,40 +499,30 @@ pub(crate) struct CheckpointFileActionsVisitor<'seen> { pub(crate) minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen_file_keys.insert(key); - } - false - } +impl FileActionVisitor for CheckpointFileActionsVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys } + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + Some(4) + } + + fn add_dv_start_index(&self) -> usize { + 1 + } + + fn remove_dv_start_index(&self) -> Option { + Some(6) + } +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { /// A remove action includes a timestamp indicating when the deletion occurred. Physical files /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until @@ -556,29 +545,14 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.check_and_record_seen(file_key, self.is_log_batch) { return Ok(false); } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 4bd6c3448..98600a821 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,13 +4,13 @@ use std::sync::Arc; use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; use crate::engine_data::RowVisitor; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; -/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log +/// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -struct V1CheckpointLogReplayScanner { +struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. 
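As an aside, the tombstone-expiry rule enforced by is_expired_tombstone above reduces to a single comparison against a retention cutoff. A minimal standalone sketch of that rule (the timestamps, the 7-day figure, and the function names here are illustrative assumptions, not values taken from this patch):

    fn minimum_file_retention_timestamp(now_ms: i64, retention_ms: i64) -> i64 {
        // The cutoff is "now minus the retention period"; anything deleted at or
        // before this instant no longer needs a tombstone in the checkpoint.
        now_ms - retention_ms
    }

    fn is_expired(deletion_timestamp_ms: i64, cutoff_ms: i64) -> bool {
        deletion_timestamp_ms <= cutoff_ms
    }

    fn main() {
        let now_ms: i64 = 1_700_000_000_000;              // hypothetical "current time" in ms
        let retention_ms: i64 = 7 * 24 * 60 * 60 * 1000;  // hypothetical 7-day retention window
        let cutoff = minimum_file_retention_timestamp(now_ms, retention_ms);
        assert!(is_expired(cutoff - 1, cutoff));   // deleted before the cutoff: drop the tombstone
        assert!(!is_expired(cutoff + 1, cutoff));  // deleted after the cutoff: keep the tombstone
    }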
seen_file_keys: HashSet, @@ -34,26 +34,10 @@ struct V1CheckpointLogReplayScanner { minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -impl V1CheckpointLogReplayScanner { - pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, - minimum_file_retention_timestamp: i64, - ) -> Self { - Self { - seen_file_keys: Default::default(), - total_actions: total_actions_counter, - total_add_actions: total_add_actions_counter, - seen_protocol: false, - seen_metadata: false, - seen_txns: Default::default(), - minimum_file_retention_timestamp, - } - } +impl LogReplayProcessor for CheckpointLogReplayProcessor { + // Define the processing result type as a tuple of the data and selection vector + type ProcessingResult = (Box, Vec); - /// Iterates over actions and filters them for inclusion in a V1 checkpoint. - /// /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering /// to ensure the checkpoint contains only the actions needed to reconstruct @@ -67,16 +51,16 @@ impl V1CheckpointLogReplayScanner { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn filter_v1_checkpoint_actions( + fn process_batch( &mut self, - actions: Box, + batch: Box, is_log_batch: bool, - ) -> DeltaResult<(Box, Vec)> { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected - let mut selection_vector = vec![false; actions.len()]; + let mut selection_vector = vec![false; batch.len()]; assert_eq!( selection_vector.len(), - actions.len(), + batch.len(), "Initial selection vector length does not match actions length" ); @@ -90,7 +74,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + non_file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with non-file action counts from this batch self.total_actions @@ -107,7 +91,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - file_actions_visitor.visit_rows_of(actions.as_ref())?; + file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with file action counts from this batch self.total_actions @@ -115,7 +99,31 @@ impl V1CheckpointLogReplayScanner { self.total_add_actions .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); - Ok((actions, selection_vector)) + Ok((batch, selection_vector)) + } + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl CheckpointLogReplayProcessor { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } } } @@ -128,13 +136,13 @@ impl V1CheckpointLogReplayScanner { /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// 
the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -pub(crate) fn v1_checkpoint_actions_iter( +pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = V1CheckpointLogReplayScanner::new( + let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -143,7 +151,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) + log_scanner.process_batch(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -155,7 +163,7 @@ mod tests { use std::sync::Arc; use crate::arrow::array::StringArray; - use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::checkpoints::log_replay::checkpoint_actions_iter; use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; @@ -201,7 +209,7 @@ mod tests { ]; // Run the iterator - let results: Vec<_> = v1_checkpoint_actions_iter( + let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), total_actions_counter.clone(), total_add_actions_counter.clone(), diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index bf2476921..787d2a482 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -78,6 +78,7 @@ pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 000000000..e545f2408 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,154 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, LazyLock, +}; + +use itertools::Itertools; +use tracing::debug; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::actions::get_log_add_schema; +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; + +#[derive(Debug, Hash, Eq, PartialEq)] +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +pub struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} + +impl FileActionKey { + pub fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Trait defining the interface for log replay processors that process and filter +/// Delta Lake log actions based on different strategies. 
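Before the trait definition below, a standalone sketch of the shape this abstraction takes: a processor turns each batch into a (batch, selection vector) pair, and the driving iterator keeps only batches that select at least one row. Types are deliberately simplified (plain strings instead of EngineData, no error handling), so this illustrates the pattern rather than the kernel's actual API:

    trait Processor {
        // Produce the batch together with a selection vector marking the rows to keep.
        fn process_batch(&mut self, batch: Vec<String>, is_log_batch: bool) -> (Vec<String>, Vec<bool>);
    }

    struct KeepNonEmpty;

    impl Processor for KeepNonEmpty {
        fn process_batch(&mut self, batch: Vec<String>, _is_log_batch: bool) -> (Vec<String>, Vec<bool>) {
            let selection = batch.iter().map(|row| !row.is_empty()).collect();
            (batch, selection)
        }
    }

    fn main() {
        let batches = vec![
            (vec!["add".to_string(), String::new()], true),
            (vec![String::new()], false),
        ];
        let mut processor = KeepNonEmpty;
        let kept: Vec<_> = batches
            .into_iter()
            .map(|(batch, is_log_batch)| processor.process_batch(batch, is_log_batch))
            // Mirror of the wiring above: only yield batches with at least one selected row.
            .filter(|(_, selection)| selection.contains(&true))
            .collect();
        assert_eq!(kept.len(), 1);
    }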
+pub trait LogReplayProcessor { + /// The type of results produced by this processor + type ProcessingResult; + + /// Process a batch of actions and return the filtered result + fn process_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; +} + +/// Base trait for visitors that process file actions during log replay +pub trait FileActionVisitor { + /// Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys().contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + if is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys().insert(key); + } + false + } + } + + /// Index in getters array for add.path + fn add_path_index(&self) -> usize; + + /// Index in getters array for remove.path + fn remove_path_index(&self) -> Option; + + /// Starting index for add action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn add_dv_start_index(&self) -> usize; + + /// Starting index for remove action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn remove_dv_start_index(&self) -> Option; + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + is_add: bool, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = if is_add { + self.add_dv_start_index() + } else if let Some(idx) = self.remove_dv_start_index() { + idx + } else { + return Err(Error::GenericError { + source: "DV getters should exist".into(), + }); + }; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? 
{ + let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + let Some(remove_idx) = self.remove_path_index() else { + return Ok(None); + }; + + // Try to extract a remove action path + if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, false))); + } + + // No path found, not a file action + Ok(None) + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056df..b2c56c026 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,41 +3,29 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -45,7 +33,7 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, + seen_file_keys: &'seen mut HashSet, selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, @@ -54,37 +42,37 @@ struct AddRemoveDedupVisitor<'seen> { is_log_batch: bool, } -impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. 
- /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true +impl FileActionVisitor for AddRemoveDedupVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys + } + + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + if self.is_log_batch { + Some(5) } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + None // No remove action getters when not a log batch } } + fn add_dv_start_index(&self) -> usize { + 2 + } + + fn remove_dv_start_index(&self) -> Option { + if self.is_log_batch { + Some(6) + } else { + None // No remove action getters when not a log batch + } + } +} + +impl AddRemoveDedupVisitor<'_> { fn parse_partition_value( &self, field_idx: usize, @@ -162,28 +150,12 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
// @@ -203,8 +175,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.check_and_record_seen(file_key, self.is_log_batch) || !is_add { return Ok(false); } let transform = self @@ -310,48 +281,70 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type ProcessingResult = ScanData; - fn process_scan_batch( + fn process_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + seen_file_keys: &mut self.seen_file_keys(), selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), + partition_filter, row_transform_exprs: Vec::new(), is_log_batch, }; - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
let selection_vector = visitor.selection_vector; - let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -365,22 +358,13 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); + let mut log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) + log_scanner.process_batch(batch, is_log_batch) }) .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) } From d420fd1fd2ad5e3d172052b99698b4929178d1e8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:31:31 -0700 Subject: [PATCH 09/45] these github action checks clog my screen --- kernel/src/log_replay.rs | 13 +++---------- kernel/src/scan/log_replay.rs | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e545f2408..cfd4a10c0 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,16 +1,9 @@ -use std::collections::{HashMap, HashSet}; -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, LazyLock, -}; - -use itertools::Itertools; +use std::collections::HashSet; use tracing::debug; use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::actions::get_log_add_schema; -use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; +use crate::engine_data::{GetData, TypedGetData as _}; +use crate::{DeltaResult, EngineData, Error}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b2c56c026..8dce0ed6f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -11,7 +11,7 @@ use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use 
crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; From 9e0e0483a88f995fd55ac6755caf4bf473325a82 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 14:34:44 -0700 Subject: [PATCH 10/45] base file actions struct --- kernel/src/actions/visitors.rs | 272 ++++++++++++++++++++++++--------- kernel/src/scan/log_replay.rs | 104 +++++-------- 2 files changed, 240 insertions(+), 136 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 150beffe6..9a04411e1 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -492,9 +492,7 @@ impl RowVisitor for SidecarVisitor { /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, + deduplicator: FileActionDeduplicator<'seen>, total_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -502,35 +500,22 @@ struct CheckpointFileActionsVisitor<'seen> { #[allow(unused)] // TODO: Remove flag once used for checkpoint writing impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false + /// Create a new CheckpointFileActionsVisitor + fn new( + seen_file_keys: &mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + minimum_file_retention_timestamp: i64, + ) -> CheckpointFileActionsVisitor<'_> { + CheckpointFileActionsVisitor { + deduplicator: FileActionDeduplicator::new( + seen_file_keys, + selection_vector, + is_log_batch, + ), + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp, } } @@ -556,29 +541,17 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. 
In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, getters, 0, // add_path_index + 4, // remove_path_index + 1, // add_dv_start_index + 6, // remove_dv_start_index + false, // Never skip remove actions (even if we're processing a log batch) + )? + else { return Ok(false); }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -634,7 +607,7 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { let should_select = self.is_valid_file_action(i, getters)?; if should_select { - self.selection_vector[i] = true; + self.deduplicator.selection_vector[i] = true; self.total_actions += 1; } } @@ -642,6 +615,145 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { } } +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the CheckpointVisitor +/// and the ScanDataVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. 
We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_dv_start_index: Option, + remove_dv_start_index: Option, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = add_dv_start_index + .or(remove_dv_start_index) + .ok_or_else(|| Error::GenericError { + source: "starting indices for add/remove DVs should have been passed".into(), + })?; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + // In this case, we should skip remove actions. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action path + if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // If we didn't find an add or remove action, return None + return Ok(None); + } + + /// Get the selection vector + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + /// Get reference to the selection vector + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + /// Get mutable reference to the selection vector + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Get whether we are processing a log batch + pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} + /// A visitor that selects non-file actions for a checkpoint file. 
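To make the deduplication rule above concrete, a minimal standalone sketch of check_and_record_seen: the first sighting of a (path, dv id) key wins, and keys are remembered only for commit (log) batches because checkpoint batches are already the oldest actions. The tuple key here is a simplification, not the kernel's FileActionKey:

    use std::collections::HashSet;

    fn check_and_record_seen(
        seen: &mut HashSet<(String, Option<String>)>,
        key: (String, Option<String>),
        is_log_batch: bool,
    ) -> bool {
        if seen.contains(&key) {
            return true; // a newer action for this file was already processed, so ignore this one
        }
        if is_log_batch {
            // Only commit batches are recorded; checkpoint batches never replace anything.
            seen.insert(key);
        }
        false
    }

    fn main() {
        let mut seen = HashSet::new();
        let key = ("part-0.parquet".to_string(), None);
        assert!(!check_and_record_seen(&mut seen, key.clone(), true)); // first sighting: keep
        assert!(check_and_record_seen(&mut seen, key, true));          // older duplicate: skip
    }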
Since log replay visits actions /// in newest-first order, we only keep the first occurrence of: /// - a protocol action, @@ -1030,10 +1142,13 @@ mod tests { #[test] fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { let data = action_batch(); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: vec![false; 8], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, // No tombstones are expired @@ -1042,8 +1157,8 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1061,10 +1176,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 4], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 100, // Threshold set to 100 @@ -1073,8 +1191,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); Ok(()) @@ -1090,10 +1208,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: true, // Log batch + is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1103,8 +1224,8 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1121,10 +1242,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: false, // Checkpoint batch + is_log_batch: false, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ 
-1134,8 +1258,8 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) @@ -1152,11 +1276,13 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 3], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1165,8 +1291,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056df..59e3e52c1 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,15 +3,15 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; +use crate::actions::visitors::FileActionDeduplicator; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; @@ -45,43 +45,28 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. 
For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new(seen, selection_vector, is_log_batch), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,28 +147,19 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + 0, // add_path_index + 5, // remove_path_index + 2, // add_dv_start_index + 6, // remove_dv_start_index + !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + )? + else { return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
// @@ -203,8 +179,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +218,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +228,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +242,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +315,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 303444b5df466f697722bc85c4f23dd340d6faff Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 15:34:58 -0700 Subject: [PATCH 11/45] combine visitors --- kernel/src/actions/visitors.rs | 457 ++++++++++++++++++++------------- 1 file changed, 281 insertions(+), 176 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9a04411e1..73eb25d93 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -484,38 +484,61 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } - -/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and -/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once -/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) -/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove -/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater -/// than the minimum file retention timestamp). -struct CheckpointFileActionsVisitor<'seen> { - deduplicator: FileActionDeduplicator<'seen>, - total_actions: usize, +/// A visitor that filters actions for inclusion in a checkpoint file. 
+/// +/// This visitor processes actions in newest-to-oldest order (as they appear in log +/// replay) and applies deduplication logic for both file and non-file actions. +/// +/// # File Action Filtering +/// - Keeps only the first occurrence of each unique (path, dvId) pair +/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// +/// # Non-File Action Filtering +/// - Keeps only the first protocol action +/// - Keeps only the first metadata action +/// - Keeps only the first transaction action for each unique app ID +/// +/// This filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointVisitor<'seen> { + // File actions deduplication state + file_deduplicator: FileActionDeduplicator<'seen>, + total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, + + // Non-file actions deduplication state + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, + total_non_file_actions: usize, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Create a new CheckpointFileActionsVisitor - fn new( - seen_file_keys: &mut HashSet, +#[allow(unused)] +impl CheckpointVisitor<'_> { + /// Create a new CheckpointVisitor + fn new<'seen>( + seen_file_keys: &'seen mut HashSet, + seen_txns: &'seen mut HashSet, selection_vector: Vec, is_log_batch: bool, minimum_file_retention_timestamp: i64, - ) -> CheckpointFileActionsVisitor<'_> { - CheckpointFileActionsVisitor { - deduplicator: FileActionDeduplicator::new( + ) -> CheckpointVisitor<'seen> { + CheckpointVisitor { + file_deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, ), - total_actions: 0, + total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, + + seen_protocol: false, + seen_metadata: false, + seen_txns, + total_non_file_actions: 0, } } @@ -541,8 +564,8 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, getters, 0, // add_path_index + let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( + i, &getters, 0, // add_path_index 4, // remove_path_index 1, // add_dv_start_index 6, // remove_dv_start_index @@ -551,11 +574,12 @@ impl CheckpointFileActionsVisitor<'_> { else { return Ok(false); }; - if self.deduplicator.check_and_record_seen(file_key) { + + if self.file_deduplicator.check_and_record_seen(file_key) { return Ok(false); } - // Ignore expired tombstones. + // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. if !is_add && self.is_expired_tombstone(i, getters[5])? { return Ok(false); } @@ -564,39 +588,98 @@ impl CheckpointFileActionsVisitor<'_> { self.total_add_actions += 1; } + self.total_file_actions += 1; Ok(true) } + + /// Returns true if the row contains a protocol action, and we haven't seen one yet. 
+ fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven't seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven't seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + // Attempting to insert the app_id into the set. If it's already present, the insert will + // return false, indicating that we've already seen this app_id. + if self.seen_txns.insert(app_id.to_string()) { + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } } -impl RowVisitor for CheckpointFileActionsVisitor<'_> { +impl RowVisitor for CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD // 2. REMOVE - static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (DataType::LONG, column_name!("remove.deletionTimestamp")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + // 3. METADATA + // 4. PROTOCOL + // 5. 
TXN + static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + // File action columns + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + // Non-file action columns + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + NAMES_AND_TYPES.as_ref() } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 9, + getters.len() == 12, Error::InternalError(format!( "Wrong number of visitor getters: {}", getters.len() @@ -604,11 +687,17 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { ); for i in 0..row_count { - let should_select = self.is_valid_file_action(i, getters)?; + // Check for non-file actions (metadata, protocol, txn) + let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? + || self.is_valid_protocol_action(i, getters[10])? + || self.is_valid_txn_action(i, getters[11])?; - if should_select { - self.deduplicator.selection_vector[i] = true; - self.total_actions += 1; + // Check for file actions (add, remove) + let is_file_action = self.is_valid_file_action(i, getters)?; + + // Mark the row for selection if it's either a valid non-file or file action + if is_non_file_action || is_file_action { + self.file_deduplicator.selection_vector_mut()[i] = true; } } Ok(()) @@ -1140,100 +1229,105 @@ mod tests { } #[test] - fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, // No tombstones are expired - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 8], + true, + 0, // minimum_file_retention_timestamp (no expired tombstones) + ); visitor.visit_rows_of(data.as_ref())?; - let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + // Combined results from both file and non-file actions + // Row 0 is an add action + // Row 1 is a remove action + // Row 3 is a protocol action + // Row 4 is a metadata action + // Row 7 is a txn action + let expected = vec![true, true, false, true, true, false, false, true]; + + // Verify file action results + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); + + // Verify non-file action results + 
assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_non_file_actions, 3); + + assert_eq!(visitor.file_deduplicator.selection_vector, expected); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 100, // Threshold set to 100 - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 4], + true, + 100, // minimum_file_retention_timestamp (threshold set to 100) + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + // Only "one_above_threshold" should be kept + let expected = vec![false, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> - { + fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path - ] + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: true, - }; - let mut visitor = 
CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 2], true, 0); visitor.visit_rows_of(batch.as_ref())?; // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! + // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -1242,31 +1336,29 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: false, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 2], + false, // is_log_batch = false (checkpoint batch) + 0, + ); visitor.visit_rows_of(batch.as_ref())?; // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV @@ -1276,52 +1368,52 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + + let mut seen_file_keys = HashSet::new(); + let 
mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) } #[test] - fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { - let data = action_batch(); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], - total_actions: 0, - }; + fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); + let batch = parse_json_batch(json_strings); - visitor.visit_rows_of(data.as_ref())?; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); - let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, true]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + assert_eq!(visitor.total_non_file_actions, 3); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -1330,54 +1422,67 @@ mod tests { let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 + let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut seen_txns, - selection_vector: vec![false; 3], - total_actions: 0, - }; + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, // Pre-populated transaction + vec![false; 3], + true, + 0, + ); + + // Mark these as already seen + visitor.seen_protocol = true; + visitor.seen_metadata = true; visitor.visit_rows_of(batch.as_ref())?; // All 
actions should be skipped as they have already been seen - let expected = vec![false; 3]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction - assert_eq!(visitor.total_actions, 0); + let expected = vec![false, false, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_non_file_actions, 0); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), // Empty set - selection_vector: vec![false; 6], - total_actions: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 7], + true, // is_log_batch + 0, // minimum_file_retention_timestamp + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, false, true, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + // First occurrence of each type should be included + let expected = vec![true, false, true, true, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs + assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } } From 5dbc924b65eeb2d3a5b34f03059d1d03a9b80f6d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 
2025 17:11:15 -0700 Subject: [PATCH 12/45] fmt --- kernel/src/actions/visitors.rs | 115 ++++++++++++++++++++++----------- kernel/src/scan/log_replay.rs | 8 +-- 2 files changed, 79 insertions(+), 44 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 73eb25d93..a93aa71ec 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -503,7 +503,7 @@ impl RowVisitor for SidecarVisitor { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state - file_deduplicator: FileActionDeduplicator<'seen>, + deduplicator: FileActionDeduplicator<'seen>, total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -526,7 +526,7 @@ impl CheckpointVisitor<'_> { minimum_file_retention_timestamp: i64, ) -> CheckpointVisitor<'seen> { CheckpointVisitor { - file_deduplicator: FileActionDeduplicator::new( + deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, @@ -564,18 +564,18 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( - i, &getters, 0, // add_path_index - 4, // remove_path_index - 1, // add_dv_start_index - 6, // remove_dv_start_index - false, // Never skip remove actions (even if we're processing a log batch) + // Extract file action key and determine if it's an add operation + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + // Do not skip remove actions (even if we're processing a log batch) + FileActionExtractConfig::new(0, 4, 1, 6, false), )? else { return Ok(false); }; - if self.file_deduplicator.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -697,13 +697,46 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.file_deduplicator.selection_vector_mut()[i] = true; + self.deduplicator.selection_vector_mut()[i] = true; } } Ok(()) } } +/// This struct contains indices and configuration options needed to +/// extract file actions from action batches in the Delta log. +pub(crate) struct FileActionExtractConfig { + /// Index of the getter containing the add.path column + pub add_path_index: usize, + /// Index of the getter containing the remove.path column + pub remove_path_index: usize, + /// Starting index for add action deletion vector columns + pub add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + pub remove_dv_start_index: usize, + /// Whether to skip remove actions when extracting file actions + pub skip_removes: bool, +} + +impl FileActionExtractConfig { + pub(crate) fn new( + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> Self { + Self { + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + skip_removes, + } + } +} + /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor /// and the ScanDataVisitor. 
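/// Both visitors in this series construct and drive it the same way; as a minimal sketch
/// (the variable names below are placeholders for fields each visitor already owns):
///
/// ```ignore
/// let mut dedup = FileActionDeduplicator::new(seen_file_keys, selection_vector, is_log_batch);
/// if !dedup.check_and_record_seen(file_key) {
///     // First time this (path, dvId) pair is seen: keep the action.
/// }
/// ```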
@@ -786,58 +819,64 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extract file action key and determine if it's an add operation + /// Extracts a file action key and determines if it's an add operation. + /// + /// This method examines the data at the given index using the provided getters and config + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `config` - Configuration specifying where to find add/remove operations + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( &self, i: usize, getters: &[&'a dyn GetData<'a>], - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, + config: FileActionExtractConfig, ) -> DeltaResult> { // Try to extract an add action path - if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - // In this case, we should skip remove actions. - if skip_removes { + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. + if config.skip_removes { return Ok(None); } // Try to extract a remove action path - if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } - // If we didn't find an add or remove action, return None - return Ok(None); + // No file action found + Ok(None) } - /// Get the selection vector pub(crate) fn selection_vector(self) -> Vec { self.selection_vector } - /// Get reference to the selection vector pub(crate) fn selection_vector_ref(&self) -> &Vec { &self.selection_vector } - /// Get mutable reference to the selection vector pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { &mut self.selection_vector } - /// Get whether we are processing a log batch + /// Returns whether we are currently processing a log batch. 
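/// As an illustration of the pattern used by the scan visitor earlier in this series
/// (`dedup` is a placeholder for the visitor's deduplicator field):
///
/// ```ignore
/// let expected_getters = if dedup.is_log_batch() { 9 } else { 5 };
/// ```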
pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch } @@ -1261,7 +1300,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); Ok(()) } @@ -1291,7 +1330,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1317,7 +1356,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1350,7 +1389,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1377,7 +1416,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1402,7 +1441,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1442,7 +1481,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1478,7 +1517,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 59e3e52c1..392b1511c 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::FileActionDeduplicator; +use 
crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -150,11 +150,7 @@ impl AddRemoveDedupVisitor<'_> { let Some((file_key, is_add)) = self.deduplicator.extract_file_action( i, getters, - 0, // add_path_index - 5, // remove_path_index - 2, // add_dv_start_index - 6, // remove_dv_start_index - !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + FileActionExtractConfig::new(0, 5, 2, 6, !self.deduplicator.is_log_batch()), )? else { return Ok(false); From b7939610ebf92dbbc9825437bac45b99a3b221d1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:39:26 -0700 Subject: [PATCH 13/45] remove old code --- kernel/src/actions/visitors.rs | 109 +-------------------------------- 1 file changed, 1 insertion(+), 108 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index a93aa71ec..c0feb93eb 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -739,7 +739,7 @@ impl FileActionExtractConfig { /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor -/// and the ScanDataVisitor. +/// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication @@ -882,113 +882,6 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions -/// in newest-first order, we only keep the first occurrence of: -/// - a protocol action, -/// - a metadata action, -/// - a transaction (txn) action for a given app ID. -/// -/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions -/// have been seen and includes only the first occurrence of each in the selection vector. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { - // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, - pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, - pub(crate) total_actions: usize, -} - -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointNonFileActionsVisitor<'_> { - /// Returns true if the row contains a protocol action, and we haven’t seen one yet. - fn is_valid_protocol_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a metadata action, and we haven’t seen one yet. - fn is_valid_metadata_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. 
- fn is_valid_txn_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), - }; - - Ok(self.seen_txns.insert(app_id.to_string())) - } -} - -impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: - // 1. METADATA - // 2. PROTOCOL - // 3. TXN - static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("metaData.id")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (STRING, column_name!("txn.appId")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - require!( - getters.len() == 3, - Error::InternalError(format!( - "Wrong number of visitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - let should_select = self.is_valid_metadata_action(i, getters[0])? - || self.is_valid_protocol_action(i, getters[1])? - || self.is_valid_txn_action(i, getters[2])?; - - if should_select { - self.selection_vector[i] = true; - self.total_actions += 1; - } - } - Ok(()) - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( From 508976ff35a8e10da28222c4a33030eba468965e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 18:10:05 -0700 Subject: [PATCH 14/45] move FileActionKey --- kernel/src/actions/visitors.rs | 15 ++++++++++++++- kernel/src/scan/log_replay.rs | 16 +--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index c0feb93eb..037dfdd42 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -6,7 +6,6 @@ use std::sync::LazyLock; use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -704,6 +703,20 @@ impl RowVisitor for CheckpointVisitor<'_> { } } +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + /// This struct contains indices and configuration options needed to /// extract file actions from action batches in the Delta log. 
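/// As a point of reference, the two visitors in this series wire the indices up as follows
/// (an illustrative sketch; `is_log_batch` stands in for `self.deduplicator.is_log_batch()`):
///
/// ```ignore
/// // CheckpointVisitor: add.path at getter 0, remove.path at 4, add DV columns start at 1,
/// // remove DV columns start at 6, and removes are never skipped.
/// let checkpoint_cfg = FileActionExtractConfig::new(0, 4, 1, 6, false);
///
/// // AddRemoveDedupVisitor: add.path at 0, remove.path at 5, add DV columns start at 2,
/// // remove DV columns start at 6, and removes are skipped for non-log (checkpoint) batches.
/// let scan_cfg = FileActionExtractConfig::new(0, 5, 2, 6, !is_log_batch);
/// ```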
pub(crate) struct FileActionExtractConfig { diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 392b1511c..d3287eb5d 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig, FileActionKey}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -16,20 +16,6 @@ use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructFie use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, From 0160ef151185de1f2c10c2e0a866ebafe3e2eabb Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:23:57 -0700 Subject: [PATCH 15/45] fix whitespace --- kernel/src/actions/visitors.rs | 84 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 828fc4387..4d93d6fd3 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1182,13 +1182,13 @@ mod tests { #[test] fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - // Missing timestamp defaults to 0 - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1217,11 +1217,11 @@ mod tests { #[test] fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - 
r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1252,11 +1252,11 @@ mod tests { // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1285,13 +1285,13 @@ mod tests { #[test] fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1320,10 +1320,10 @@ mod tests { #[test] fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - 
r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - ].into(); + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1354,10 +1354,10 @@ mod tests { #[test] fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ].into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 @@ -1389,16 +1389,16 @@ mod tests { #[test] fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn - r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - // Duplicate metadata - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ] - .into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // 
Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); From aae7046782e1b9c98f26d4dd9d38d05c6be78fb0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:28:27 -0700 Subject: [PATCH 16/45] remove old code --- kernel/src/log_replay.rs | 114 +-------------------------------------- 1 file changed, 1 insertion(+), 113 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index cfd4a10c0..e98dd6f03 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,6 @@ use std::collections::HashSet; -use tracing::debug; -use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::engine_data::{GetData, TypedGetData as _}; -use crate::{DeltaResult, EngineData, Error}; +use crate::{DeltaResult, EngineData}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -36,112 +33,3 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; } - -/// Base trait for visitors that process file actions during log replay -pub trait FileActionVisitor { - /// Get a reference to the set of seen file keys - fn seen_file_keys(&mut self) -> &mut HashSet; - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys().contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - if is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen_file_keys().insert(key); - } - false - } - } - - /// Index in getters array for add.path - fn add_path_index(&self) -> usize; - - /// Index in getters array for remove.path - fn remove_path_index(&self) -> Option; - - /// Starting index for add action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn add_dv_start_index(&self) -> usize; - - /// Starting index for remove action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn remove_dv_start_index(&self) -> Option; - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - is_add: bool, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = if is_add { - self.add_dv_start_index() - } else if let Some(idx) = self.remove_dv_start_index() { - idx - } else { - return Err(Error::GenericError { - source: "DV getters should exist".into(), - }); - }; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extract file action key and determine if it's an add operation - fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? { - let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, true))); - } - - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - let Some(remove_idx) = self.remove_path_index() else { - return Ok(None); - }; - - // Try to extract a remove action path - if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? 
{ - let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, false))); - } - - // No path found, not a file action - Ok(None) - } -} From f5743709a48c2ebf4e2c1086cbb85d486daac31c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 11:25:28 -0700 Subject: [PATCH 17/45] refactor more --- kernel/src/checkpoints/log_replay.rs | 47 ++++++++++++++++-------- kernel/src/log_replay.rs | 55 ++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 11 ++---- kernel/src/scan/mod.rs | 9 +++++ 4 files changed, 99 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 23c1f50d7..dc64b766c 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,9 +4,23 @@ use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{ + apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, +}; use crate::{DeltaResult, EngineData}; +pub struct CheckpointData { + #[allow(unused)] + data: Box, + selection_vector: Vec, +} + +impl HasSelectionVector for CheckpointData { + fn has_selected_rows(&self) -> bool { + self.selection_vector.contains(&true) + } +} + /// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented @@ -36,7 +50,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = (Box, Vec); + type ProcessingResult = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -90,7 +104,10 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { self.seen_protocol = visitor.seen_protocol; self.seen_metadata = visitor.seen_metadata; - Ok((batch, visitor.deduplicator.selection_vector())) + Ok(CheckpointData { + data: batch, + selection_vector: visitor.deduplicator.selection_vector(), + }) } // Get a reference to the set of seen file keys @@ -132,20 +149,14 @@ pub(crate) fn checkpoint_actions_iter( total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, -) -> impl Iterator, Vec)>> + Send + 'static { +) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - // Only yield batches that have at least one selected row - .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] @@ -212,12 +223,18 @@ mod tests { assert_eq!(results.len(), 2); // First batch should have all rows selected - let (_, selection_vector1) = &results[0]; - assert_eq!(selection_vector1, &vec![true, true, true, true]); + let checkpoint_data = &results[0]; + assert_eq!( + checkpoint_data.selection_vector, + vec![true, true, true, true] + ); // Second batch should 
have only new file and transaction selected - let (_, selection_vector2) = &results[1]; - assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + let checkpoint_data = &results[1]; + assert_eq!( + checkpoint_data.selection_vector, + vec![false, false, true, false, true] + ); // Verify counters // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e98dd6f03..852c3fe0d 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -32,4 +32,59 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; + + // Create a selection vector of appropriate length with all elements set to the given value + fn create_selection_vector( + &self, + batch: &Box, + default_value: bool, + ) -> Vec { + let selection_vector = vec![default_value; batch.len()]; + assert_eq!( + selection_vector.len(), + batch.len(), + "Selection vector length does not match actions length" + ); + selection_vector + } + + // Filter an iterator to only include results with at least one selected item + fn filter_non_empty_results(iter: I) -> impl Iterator> + where + I: Iterator>, + T: HasSelectionVector, + { + iter.filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector +pub trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} + +/// Applies the given processor to the given iterator of action results, +/// and filters out batches with no selected rows. +/// +/// This function abstracts the common pattern used by both checkpoint and scan iterators. +pub fn apply_processor_to_iterator

( + mut processor: P, + action_iter: impl Iterator, bool)>>, +) -> impl Iterator> +where + P: LogReplayProcessor, +{ + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 6a0fa929a..42a88af4f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -10,7 +10,7 @@ use crate::actions::get_log_add_schema; use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{apply_processor_to_iterator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -350,15 +350,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = + let log_scanner = ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index a8e5da899..92fd00cea 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,14 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + let (_, sv, _) = self; + sv.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. 
pub struct Scan { From a618833af203866fc356cde62131cf3a1572c61a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:10:24 -0700 Subject: [PATCH 18/45] refactor --- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 176 ++++++++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 132 +++++++++++-------------- 3 files changed, 231 insertions(+), 78 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 2e4698658..bb21bb0f9 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -77,6 +77,7 @@ pub mod actions; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 000000000..7650183a0 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,176 @@ +use crate::{ + actions::deletion_vector::DeletionVectorDescriptor, + engine_data::{GetData, TypedGetData}, + DeltaResult, +}; +use std::collections::HashSet; +use tracing::debug; + +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the incoming CheckpointVisitor +/// and the AddRemoveDedupVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, + /// Index of the getter containing the add.path column + add_path_index: usize, + /// Index of the getter containing the remove.path column + remove_path_index: usize, + /// Starting index for add action deletion vector columns + add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + remove_dv_start_index: usize, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. 
For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract the deletion vector unique ID if it exists. + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + dv_start_index: usize, + ) -> DeltaResult> { + match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => { + let path_or_inline = + getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?; + let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?; + + Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + path_or_inline, + offset, + ))) + } + None => Ok(None), + } + } + + /// Extracts a file action key and determines if it's an add operation. + /// This method examines the data at the given index using the provided getters + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `skip_removes` - Whether to skip remove actions when extracting file actions + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action by the required path column + if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action by the required path column + if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // No file action found + Ok(None) + } + + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Returns whether we are currently processing a log batch. 
+ pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f..a2d65f1b0 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,33 +3,19 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, -} -impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, @@ -45,43 +31,43 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen.insert(key); - } - false + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_PARTITION_VALUES_INDEX: usize = 1; + const ADD_DV_START_INDEX: usize = 2; + const REMOVE_PATH_INDEX: usize = 5; + const REMOVE_DV_START_INDEX: usize = 6; + + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new( + seen, + selection_vector, + is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, + ), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,26 +148,13 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { + // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do + // not try to extract remove actions in that case. + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + else { return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { - return Ok(false); - }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, }; // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory @@ -192,7 +165,8 @@ impl AddRemoveDedupVisitor<'_> { // encounter if the table's schema was replaced after the most recent checkpoint. 
let partition_values = match &self.transform { Some(transform) if is_add => { - let partition_values = getters[1].get(i, "add.partitionValues")?; + let partition_values = + getters[Self::ADD_PARTITION_VALUES_INDEX].get(i, "add.partitionValues")?; let partition_values = self.parse_partition_values(transform, &partition_values)?; if self.is_file_partition_pruned(&partition_values) { return Ok(false); @@ -203,8 +177,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +216,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +226,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +240,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +313,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 7da74b268f38672c54651749c27777d7293cdbc3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:14:15 -0700 Subject: [PATCH 19/45] more docs --- kernel/src/log_replay.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 7650183a0..ef9004ae1 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -25,7 +25,8 @@ impl FileActionKey { /// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication + /// far in the log for deduplication. This is a mutable reference to the set + /// of seen file keys that persists across multiple log batches. 
seen_file_keys: &'seen mut HashSet, /// Selection vector to track which rows should be included selection_vector: Vec, From 220a216a2968531943a0773a87b4e2fc702d08fe Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:05:59 -0700 Subject: [PATCH 20/45] invert is_log_batch logic --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a2d65f1b0..b6bdc1570 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -152,7 +152,7 @@ impl AddRemoveDedupVisitor<'_> { // not try to extract remove actions in that case. let Some((file_key, is_add)) = self.deduplicator - .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; From 9d86911fadb6a6aa6267a82cc4aec9c3949ec0da Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:14:46 -0700 Subject: [PATCH 21/45] docs --- kernel/src/log_replay.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index ef9004ae1..d6b175f28 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -20,9 +20,12 @@ impl FileActionKey { } } -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the incoming CheckpointVisitor -/// and the AddRemoveDedupVisitor. +/// Maintains state and provides functionality for deduplicating file actions during log replay. +/// +/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track +/// which files have been seen across multiple log batches. Since logs are processed +/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path +/// and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From e5b0e32056b8ea12060fd48cb18b2eb63f3e537f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:16:42 -0700 Subject: [PATCH 22/45] docs --- kernel/src/log_replay.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d6b175f28..e400c27d1 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -22,10 +22,9 @@ impl FileActionKey { /// Maintains state and provides functionality for deduplicating file actions during log replay. /// -/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track -/// which files have been seen across multiple log batches. Since logs are processed -/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path -/// and deletion vector ID) is processed only once. +/// This struct is embedded in visitors to track which files have been seen across multiple +/// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each +/// unique file (identified by path and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. 
This is a mutable reference to the set From a5393dcc896d05071b2a704b82e6f47b93f07bcc Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:48:28 -0700 Subject: [PATCH 23/45] docs and imports --- kernel/src/log_replay.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e400c27d1..521b6e81e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,10 @@ -use crate::{ - actions::deletion_vector::DeletionVectorDescriptor, - engine_data::{GetData, TypedGetData}, - DeltaResult, -}; +//! This module provides structures and functionality to faciliate the log replay process. use std::collections::HashSet; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::engine_data::{GetData, TypedGetData}; +use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -24,7 +25,9 @@ impl FileActionKey { /// /// This struct is embedded in visitors to track which files have been seen across multiple /// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each -/// unique file (identified by path and deletion vector ID) is processed only once. +/// unique file (identified by path and deletion vector ID) is processed only once. Performing +/// deduplication at the visitor level avoids having to load all actions into memory at once, +/// significantly reducing memory usage for large Delta tables with extensive history. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From a23c651ad7aa3a8399f836eac5e6113bec2aafde Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 20:04:05 -0700 Subject: [PATCH 24/45] improve mod doc --- kernel/src/log_replay.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 521b6e81e..e5854ca31 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,4 +1,20 @@ -//! This module provides structures and functionality to faciliate the log replay process. +//! This module provides log replay utilities. +//! +//! Log replay is the process of transforming an iterator of action batches (read from Delta +//! transaction logs) into an iterator of filtered/transformed actions for specific use cases. +//! The logs, which record all table changes as JSON entries, are processed batch by batch, +//! typically from newest to oldest. +//! +//! Log replay can be implemented in various ways: +//! - For table scans: Deduplicate file actions to identify the current set of valid files +//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! +//! This module provides structures for efficient batch processing, focusing on file action +//! deduplication with `FileActionDeduplicator` which tracks unique files across log batches +//! to minimize memory usage for tables with extensive history. +//! +//! Future extensions will support additional log replay processors beyond the current use cases. 
+ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; From d712d181204d068e9346cc0e3e6ee582c95a80a7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:43:14 -0700 Subject: [PATCH 25/45] improve doc --- kernel/src/log_replay.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e5854ca31..3b2a84692 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,15 +5,14 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay can be implemented in various ways: -//! - For table scans: Deduplicate file actions to identify the current set of valid files -//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! Log replay is currently implemented for table scans, which filter and apply transofmations +//! to file actions to produce a view of the table state at a specific point in time. +//! Future extensions will support additional log replay processors beyond the current use case. +//! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! //! This module provides structures for efficient batch processing, focusing on file action //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. -//! -//! Future extensions will support additional log replay processors beyond the current use cases. use std::collections::HashSet; From e564ae17ca5f6b17659a6ac05867af7df0681621 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:46:10 -0700 Subject: [PATCH 26/45] docs' --- kernel/src/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 3b2a84692..d9a906525 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,8 +5,8 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay is currently implemented for table scans, which filter and apply transofmations -//! to file actions to produce a view of the table state at a specific point in time. +//! Log replay is currently implemented for table scans, which filter and apply transformations +//! to produce file actions which builds the view of the table state at a specific point in time. //! Future extensions will support additional log replay processors beyond the current use case. //! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! 
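The module documentation refined in the patches above describes the deduplication rule that FileActionDeduplicator enforces: batches are replayed newest to oldest, the first occurrence of a (path, dv_unique_id) key wins, and only commit (log) batches record keys because checkpoint batches are already the oldest actions and never replace anything. As a minimal, self-contained illustration of that rule (FileKey and the batch shape here are simplified stand-ins, not the kernel's EngineData or visitor API):

use std::collections::HashSet;

/// Simplified stand-in for the kernel's FileActionKey: (path, optional dv unique id).
type FileKey = (String, Option<String>);

/// Batches arrive newest first. The first time a key is seen its row is selected; any
/// later (older) occurrence of the same key is dropped. Only commit (log) batches
/// record keys, mirroring the rule that checkpoint batches never replace anything.
fn select_rows(batches: &[(Vec<FileKey>, bool)]) -> Vec<Vec<bool>> {
    let mut seen: HashSet<FileKey> = HashSet::new();
    let mut selections = Vec::new();
    for (batch, is_log_batch) in batches {
        let sv = batch
            .iter()
            .map(|key| {
                if seen.contains(key) {
                    false
                } else {
                    if *is_log_batch {
                        seen.insert(key.clone());
                    }
                    true
                }
            })
            .collect::<Vec<bool>>();
        selections.push(sv);
    }
    selections
}

fn main() {
    let newer: (Vec<FileKey>, bool) = (vec![("a.parquet".to_string(), None)], true);
    let older: (Vec<FileKey>, bool) = (
        vec![
            ("a.parquet".to_string(), None),
            ("b.parquet".to_string(), None),
        ],
        true,
    );
    // "a.parquet" was already seen in the newer batch, so only "b.parquet" survives.
    assert_eq!(select_rows(&[newer, older]), vec![vec![true], vec![false, true]]);
}

The real visitors apply the same per-row rule while also extracting deletion vector ids from the action getters and honoring data skipping and partition pruning, which is why the deduplicator is embedded in the visitors rather than run as a separate pass over all actions.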
From b14ff195c0655cfecc29d6b666af6f72c4bcd29d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 13:42:06 -0700 Subject: [PATCH 27/45] docs --- kernel/src/log_replay.rs | 2 +- kernel/src/scan/log_replay.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d9a906525..0064a701a 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -178,7 +178,7 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn selection_vector(self) -> Vec { + pub(crate) fn into_selection_vector(self) -> Vec { self.selection_vector } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b6bdc1570..77a985125 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -148,6 +148,10 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { + // When processing file actions, we extract path and deletion vector information based on action type: + // - For Add actions: path is at index 0, followed by DV fields at indexes 2-4 + // - For Remove actions (in log batches only): path is at index 5, followed by DV fields at indexes 6-8 + // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. let Some((file_key, is_add)) = @@ -324,7 +328,7 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.deduplicator.selection_vector(); + let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From a52d484be0741e5e9d3e72336a0e65d8b86a3298 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 14:22:33 -0700 Subject: [PATCH 28/45] update --- kernel/src/log_replay.rs | 16 ---------------- kernel/src/scan/log_replay.rs | 14 +++++++++----- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 0064a701a..39aa4ab6e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -48,8 +48,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { /// far in the log for deduplication. This is a mutable reference to the set /// of seen file keys that persists across multiple log batches. 
seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, /// Whether we're processing a log batch (as opposed to a checkpoint) is_log_batch: bool, /// Index of the getter containing the add.path column @@ -65,7 +63,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { impl<'seen> FileActionDeduplicator<'seen> { pub(crate) fn new( seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, is_log_batch: bool, add_path_index: usize, remove_path_index: usize, @@ -74,7 +71,6 @@ impl<'seen> FileActionDeduplicator<'seen> { ) -> Self { Self { seen_file_keys, - selection_vector, is_log_batch, add_path_index, remove_path_index, @@ -178,18 +174,6 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn into_selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - /// Returns whether we are currently processing a log batch. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 77a985125..3c6c2e845 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -36,6 +36,7 @@ struct AddRemoveDedupVisitor<'seen> { transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -57,7 +58,6 @@ impl AddRemoveDedupVisitor<'_> { AddRemoveDedupVisitor { deduplicator: FileActionDeduplicator::new( seen, - selection_vector, is_log_batch, Self::ADD_PATH_INDEX, Self::REMOVE_PATH_INDEX, @@ -68,6 +68,7 @@ impl AddRemoveDedupVisitor<'_> { transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -244,8 +245,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -328,9 +329,12 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
- let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; - Ok((result, selection_vector, visitor.row_transform_exprs)) + Ok(( + result, + visitor.selection_vector, + visitor.row_transform_exprs, + )) } } From a243a989af2d99bfa3ab07ddc78ec236ec0fbb54 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 16:59:37 -0700 Subject: [PATCH 29/45] nits --- kernel/src/log_replay.rs | 31 ++++++++++++++++++++++++------- kernel/src/scan/log_replay.rs | 23 ++++++++++++----------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e..12528d296 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; +use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; - use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,14 +28,31 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, + pub(crate) action_type: FileActionKeyType, } + impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new( + path: impl Into, + dv_unique_id: Option, + action_type: FileActionKeyType, + ) -> Self { let path = path.into(); - Self { path, dv_unique_id } + Self { + path, + dv_unique_id, + action_type, + } } } +// File actions are either add or remove actions. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) enum FileActionKeyType { + Add, + Remove, +} + /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -144,7 +161,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(Some((key))` - When a file action is found, returns the key /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -152,11 +169,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -167,7 +184,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? 
{ let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e845..2042ab090 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,12 +155,15 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some(file_key) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? else { return Ok(false); }; + let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. @@ -231,11 +234,9 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; + require!( getters.len() == expected_getters, Error::InternalError(format!( From 9f06382993af7c30b164cf3b452880141adc4dc1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:05:14 -0700 Subject: [PATCH 30/45] Revert "nits" This reverts commit a243a989af2d99bfa3ab07ddc78ec236ec0fbb54. 
--- kernel/src/log_replay.rs | 31 +++++++------------------------ kernel/src/scan/log_replay.rs | 23 +++++++++++------------ 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 12528d296..39aa4ab6e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,31 +28,14 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, - pub(crate) action_type: FileActionKeyType, } - impl FileActionKey { - pub(crate) fn new( - path: impl Into, - dv_unique_id: Option, - action_type: FileActionKeyType, - ) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); - Self { - path, - dv_unique_id, - action_type, - } + Self { path, dv_unique_id } } } -// File actions are either add or remove actions. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) enum FileActionKeyType { - Add, - Remove, -} - /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -161,7 +144,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key))` - When a file action is found, returns the key + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -169,11 +152,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -184,7 +167,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? 
{ let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 2042ab090..3c6c2e845 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, - selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), - selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -155,15 +155,12 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some(file_key) = self.deduplicator.extract_file_action( - i, - getters, - !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch - )? + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; - let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
@@ -234,9 +231,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let is_log_batch = self.deduplicator.is_log_batch(); - let expected_getters = if is_log_batch { 9 } else { 5 }; - + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 58f38c0345179ad11300fad2197953ac4adc61e0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:07:08 -0700 Subject: [PATCH 31/45] nits --- kernel/src/scan/log_replay.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e845..37e504405 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,9 +155,11 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? 
else { return Ok(false); }; @@ -231,11 +233,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 628546c45bd10fc4b16501bc47bd5693f5c1b9f8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:39:05 -0700 Subject: [PATCH 32/45] refactor --- kernel/src/log_replay.rs | 54 ++++++++++++++++++- kernel/src/scan/log_replay.rs | 97 +++++++++++++++++++---------------- kernel/src/scan/mod.rs | 8 +++ 3 files changed, 115 insertions(+), 44 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e..3b7e87524 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,7 +18,7 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::DeltaResult; +use crate::{DeltaResult, EngineData}; use tracing::debug; @@ -179,3 +179,55 @@ impl<'seen> FileActionDeduplicator<'seen> { self.is_log_batch } } + +/// Trait defining log replay processors which implement custom filtering and transformation +/// logic for processing action batches from transaction logs. They receive batches in reverse +/// chronological order (newest to oldest) and typically: +/// +/// 1. Create or maintain a selection vector to track which actions to include +/// 2. Track already-seen file actions to deduplicate across batches +/// 3. Apply specialized filtering based on processor type (scan, checkpoint, etc.) +/// +pub(crate) trait LogReplayProcessor { + /// The type of results produced by this processor + type Output; + + /// Process a batch of actions and return the filtered result + fn process_actions_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Applies a processor to an action iterator and filters out empty results. + /// + /// This is an associated function rather than an instance method because the + /// returned iterator needs to own the processor. + fn apply_to_iterator( + processor: impl LogReplayProcessor, + action_iter: impl Iterator, bool)>>, + ) -> impl Iterator> + where + Self::Output: HasSelectionVector, + { + let mut processor = processor; + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_actions_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector used in log replay filtering. 
+pub(crate) trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 37e504405..223c668ec 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,21 +9,23 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -291,41 +293,37 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type Output = ScanData; - fn process_scan_batch( + fn process_actions_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( - &mut self.seen, + self.seen_file_keys(), selection_vector, logical_schema, transform, - self.partition_filter.clone(), + partition_filter, is_log_batch, ); - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
let result = add_transform.evaluate(actions)?; @@ -335,6 +333,33 @@ impl LogReplayScanner { visitor.row_transform_exprs, )) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -348,24 +373,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + let log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + + ScanLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0372bfd25..0b419f9a3 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,13 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + self.1.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. 
pub struct Scan { From 88cf9831c9ca486df0363213f00fee45cf47727e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:49:51 -0700 Subject: [PATCH 33/45] move --- kernel/src/scan/log_replay.rs | 46 +++++++++++++++++------------------ kernel/src/scan/mod.rs | 1 - 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 223c668ec..76459e892 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -28,6 +28,29 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } +} + /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -339,29 +362,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { } } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be processed to complete the scan. Non-selected rows _must_ be ignored. 
The boolean flag diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0b419f9a3..7fd1f9ea9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -325,7 +325,6 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); -// Implementation for the scan result type impl HasSelectionVector for ScanData { fn has_selected_rows(&self) -> bool { self.1.contains(&true) From 10bb7b56a65ee7f705ce1dbaa74826fcda0f092a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 23:05:03 -0700 Subject: [PATCH 34/45] fix rebase --- kernel/src/scan/log_replay.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 76459e892..896b18b8b 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -335,6 +335,7 @@ impl LogReplayProcessor for ScanLogReplayProcessor { let logical_schema = self.logical_schema.clone(); let transform = self.transform.clone(); let partition_filter = self.partition_filter.clone(); + // TODO: Teach expression eval to respect the selection vector we just computed so carefully! let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( @@ -347,9 +348,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = add_transform.evaluate(actions)?; Ok(( result, visitor.selection_vector, From abc7e1fe4573d924372502ad26f1b5dcef4e5007 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 28 Mar 2025 16:14:53 -0700 Subject: [PATCH 35/45] merge fixes --- kernel/src/actions/visitors.rs | 228 ++++----------------------- kernel/src/checkpoints/log_replay.rs | 34 ++-- kernel/src/scan/log_replay.rs | 32 +--- 3 files changed, 45 insertions(+), 249 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 4d93d6fd3..aa7f14614 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -4,10 +4,8 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; - use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::FileActionKey; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -505,19 +503,25 @@ impl RowVisitor for SidecarVisitor { pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state pub(crate) deduplicator: FileActionDeduplicator<'seen>, - pub(crate) total_file_actions: usize, - pub(crate) total_add_actions: usize, + pub(crate) selection_vector: Vec, + pub(crate) total_file_actions: u64, + pub(crate) total_add_actions: u64, pub(crate) minimum_file_retention_timestamp: i64, // Non-file actions deduplication state pub(crate) seen_protocol: bool, pub(crate) seen_metadata: bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) total_non_file_actions: usize, + pub(crate) total_non_file_actions: u64, } #[allow(unused)] impl CheckpointVisitor<'_> { + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_DV_START_INDEX: usize = 1; + const REMOVE_PATH_INDEX: usize = 4; + const REMOVE_DV_START_INDEX: usize = 6; /// Create a new CheckpointVisitor pub(crate) fn 
new<'seen>( seen_file_keys: &'seen mut HashSet, @@ -531,9 +535,13 @@ impl CheckpointVisitor<'_> { CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, - selection_vector, is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, ), + selection_vector, total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, @@ -567,17 +575,13 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Extract file action key and determine if it's an add operation - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, - getters, - // Do not skip remove actions (even if we're processing a log batch) - FileActionExtractConfig::new(0, 4, 1, 6, false), - )? + // Never skip remove actions, as they may be unexpired tombstones. + let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? else { return Ok(false); }; + // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -700,191 +704,13 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.deduplicator.selection_vector_mut()[i] = true; + self.selection_vector[i] = true; } } Ok(()) } } -/// This struct contains indices and configuration options needed to -/// extract file actions from action batches in the Delta log. -pub(crate) struct FileActionExtractConfig { - /// Index of the getter containing the add.path column - pub add_path_index: usize, - /// Index of the getter containing the remove.path column - pub remove_path_index: usize, - /// Starting index for add action deletion vector columns - pub add_dv_start_index: usize, - /// Starting index for remove action deletion vector columns - pub remove_dv_start_index: usize, - /// Whether to skip remove actions when extracting file actions - pub skip_removes: bool, -} - -impl FileActionExtractConfig { - pub(crate) fn new( - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, - ) -> Self { - Self { - add_path_index, - remove_path_index, - add_dv_start_index, - remove_dv_start_index, - skip_removes, - } - } -} - -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the CheckpointVisitor -/// and the AddRemoveDedupVisitor. -pub(crate) struct FileActionDeduplicator<'seen> { - /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication - seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, - /// Whether we're processing a log batch (as opposed to a checkpoint) - is_log_batch: bool, -} - -impl<'seen> FileActionDeduplicator<'seen> { - pub(crate) fn new( - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - ) -> Self { - Self { - seen_file_keys, - selection_vector, - is_log_batch, - } - } - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. 
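A standalone sketch of the dedup rule described in the comment above (illustration only, not part of this patch): the decision reduces to a set-membership check keyed on the (path, dv_unique_id) pair, and new keys are only remembered for commit (log) batches, since checkpoint batches are already the oldest actions and can never supersede anything.

use std::collections::HashSet;

/// Hypothetical stand-in for FileActionKey: (path, dv_unique_id).
type FileKey = (String, Option<String>);

/// Returns true if this file action was already seen and should be ignored.
fn check_and_record_seen_sketch(
    seen: &mut HashSet<FileKey>,
    key: FileKey,
    is_log_batch: bool,
) -> bool {
    if seen.contains(&key) {
        return true; // a newer action for this file already won
    }
    if is_log_batch {
        // Only commit batches can be superseded by older batches we have yet to visit.
        seen.insert(key);
    }
    false
}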
- pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false - } - } - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - add_dv_start_index: Option, - remove_dv_start_index: Option, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = add_dv_start_index - .or(remove_dv_start_index) - .ok_or_else(|| Error::GenericError { - source: "starting indices for add/remove DVs should have been passed".into(), - })?; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extracts a file action key and determines if it's an add operation. - /// - /// This method examines the data at the given index using the provided getters and config - /// to identify whether a file action exists and what type it is. - /// - /// # Arguments - /// - /// * `i` - Index position in the data structure to examine - /// * `getters` - Collection of data getter implementations used to access the data - /// * `config` - Configuration specifying where to find add/remove operations - /// - /// # Returns - /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation - /// * `Ok(None)` - When no file action is found - /// * `Err(...)` - On any error during extraction - pub(crate) fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - config: FileActionExtractConfig, - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); - } - - // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. - if config.skip_removes { - return Ok(None); - } - - // Try to extract a remove action path - if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? 
{ - let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); - } - - // No file action found - Ok(None) - } - - pub(crate) fn selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - - /// Returns whether we are currently processing a log batch. - pub(crate) fn is_log_batch(&self) -> bool { - self.is_log_batch - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -1175,7 +1001,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); Ok(()) } @@ -1207,7 +1033,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1240,7 +1066,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1275,7 +1101,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1309,7 +1135,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1341,7 +1167,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1379,7 +1205,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1417,7 +1243,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, 
expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index dc64b766c..0a31bffc1 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,12 +1,10 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{ - apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, -}; +use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; pub struct CheckpointData { @@ -30,10 +28,10 @@ struct CheckpointLogReplayProcessor { seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: Arc, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -50,7 +48,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = CheckpointData; + type Output = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -65,11 +63,11 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. 
Tombstones older than `minimum_file_retention_timestamp` are excluded - fn process_batch( + fn process_actions_batch( &mut self, batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected let selection_vector = vec![false; batch.len()]; assert_eq!( @@ -106,7 +104,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { Ok(CheckpointData { data: batch, - selection_vector: visitor.deduplicator.selection_vector(), + selection_vector: visitor.selection_vector, }) } @@ -119,8 +117,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -146,8 +144,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -156,12 +154,12 @@ pub(crate) fn checkpoint_actions_iter( minimum_file_retention_timestamp, ); - apply_processor_to_iterator(log_scanner, action_iter) + CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -175,8 +173,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicUsize::new(0)); - let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_actions_counter = Arc::new(AtomicU64::new(0)); + let total_add_actions_counter = Arc::new(AtomicU64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 7782b1857..0959d9cf8 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,6 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; @@ -29,29 +28,6 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - 
get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -270,8 +246,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -359,10 +335,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn seen_file_keys(&mut self) -> &mut HashSet { &mut self.seen_file_keys } - - fn seen_file_keys(&mut self) -> &mut HashSet { - &mut self.seen_file_keys - } } impl ScanLogReplayProcessor { From 7fbfe29fb19698694f193212d9415915d6cd8a06 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 28 Mar 2025 20:19:47 -0700 Subject: [PATCH 36/45] mvp --- kernel/src/checkpoints/log_replay.rs | 5 +- kernel/src/checkpoints/mod.rs | 318 +++++++++++++++++++++++++++ kernel/src/checkpoints/tests.rs | 183 +++++++++++++++ kernel/src/path.rs | 78 +++++++ kernel/src/table.rs | 15 ++ kernel/src/table_configuration.rs | 19 ++ 6 files changed, 615 insertions(+), 3 deletions(-) create mode 100644 kernel/src/checkpoints/tests.rs diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 0a31bffc1..cbad63305 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -8,9 +8,8 @@ use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; pub struct CheckpointData { - #[allow(unused)] - data: Box, - selection_vector: Vec, + pub data: Box, + pub selection_vector: Vec, } impl HasSelectionVector for CheckpointData { diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 826ff771f..f5a764779 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -1 +1,319 @@ +//! # Delta Kernel Checkpoint API +//! +//! This module provides functionality for writing single-file checkpoints in Delta tables. +//! +//! 1. Single-file Classic-named V1 Checkpoint - For legacy tables without v2Checkpoints feature +//! 2. Single-file Classic-named V2 Checkpoint - For backwards compatibility with v2Checkpoints feature +//! 3. Single-file UUID-named V2 Checkpoint - Recommended for small to medium tables with v2Checkpoints feature +//! +//! The API is designed with a builder pattern for configuring and creating checkpoint writers. +//! +//! # Example +//! ``` +//! let path = "./tests/data/app-txn-no-checkpoint"; +//! let engine = Arc::new(SyncEngine::new()); +//! let table = Table::try_from_uri(path)?; +//! // Create a checkpoint builder for the table at a specific version +//! let builder = table.checkpoint(&engine, Some(2))?; +//! // Configure the builder (optional) +//! let writer = builder.with_classic_naming(true); +//! // Build the checkpoint writer +//! let writer = builder.build(&engine)?; +//! // Get the checkpoint data and path +//! let checkpoint_data = writer.get_checkpoint_info()?; +//! /* Engine writes data to file path and collects metadata: (path, bytes, timestamp) */ +//! 
/* All checkpoint data must be written before calling .finalize_checkpoint() */ +//! writer.finalize_checkpoint()?; +//! ``` +use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::{ + sync::{atomic::AtomicU64, Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; +use url::Url; + +use crate::{ + actions::{ + Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, + PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + }, + path::ParsedLogPath, + snapshot::Snapshot, + DeltaResult, Engine, EngineData, Error, +}; + +use crate::actions::schemas::GetStructField; +use crate::schema::{SchemaRef, StructType}; pub mod log_replay; +#[cfg(test)] +mod tests; + +/// Read schema definition for collecting checkpoint actions +static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { + StructType::new([ + Option::::get_struct_field(ADD_NAME), + Option::::get_struct_field(REMOVE_NAME), + Option::::get_struct_field(METADATA_NAME), + Option::::get_struct_field(PROTOCOL_NAME), + Option::::get_struct_field(SET_TRANSACTION_NAME), + Option::::get_struct_field(SIDECAR_NAME), + ]) + .into() +}); + +/// Returns the read schema to collect checkpoint actions +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +#[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] +fn get_checkpoint_read_schema() -> &'static SchemaRef { + &CHECKPOINT_READ_SCHEMA +} + +/// Contains the path and data for a single-file checkpoint. +/// +/// This struct holds all the necessary information for writing a checkpoint file, +/// including the destination path and the iterator over checkpoint actions. +pub struct SingleFileCheckpointData { + /// The target URL where the checkpoint file will be written + pub path: Url, + + /// Iterator over checkpoint actions to be written to the file + pub data: Box>>, +} + +/// Writer for creating checkpoint files in Delta tables. +/// +/// The CheckpointWriter orchestrates the process of writing checkpoint data to storage. +/// It manages the one-time consumption of checkpoint data and tracks statistics +/// about the actions included in the checkpoint. +pub struct CheckpointWriter { + /// Using Option to enforce single consumption at compile time + single_file_checkpoint_data: Option, + + /// Counter for the total number of actions in the checkpoint + total_actions_counter: Arc, + + /// Counter for add file actions specifically + total_add_actions_counter: Arc, +} + +impl CheckpointWriter { + /// Creates a new CheckpointWriter with the provided checkpoint data and counters + fn new( + single_file_checkpoint_data: Option, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + ) -> Self { + Self { + single_file_checkpoint_data, + total_actions_counter, + total_add_actions_counter, + } + } + + /// Retrieves the checkpoint data and path information + /// + /// This method takes ownership of the checkpoint data, ensuring it can + /// only be consumed once. It returns an error if the data has already + /// been consumed. + pub fn get_checkpoint_info(&mut self) -> DeltaResult { + self.single_file_checkpoint_data + .take() + .ok_or_else(|| Error::generic("Checkpoint data already consumed")) + } + + /// Finalizes the checkpoint writing process + /// + /// This method should be only called AFTER writing all checkpoint data to + /// ensure proper completion of the checkpoint operation, which includes + /// writing the _last_checkpoint file. 
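A usage sketch of the writer (illustration only, not part of this patch; the parquet write step is elided because it belongs to the engine, and the module's types are assumed to be in scope): the data is fetched exactly once, each selected row is persisted to the returned path, and only then is the checkpoint finalized.

fn write_single_file_checkpoint(mut writer: CheckpointWriter) -> DeltaResult<()> {
    // Take the data and target path; a second call would fail because the Option
    // holding the data has already been consumed.
    let info = writer.get_checkpoint_info()?;
    for batch in info.data {
        let _batch = batch?;
        // The engine writes only the rows flagged in `_batch.selection_vector` to
        // `info.path` using its own parquet writer (elided here).
    }
    // Finalize only once all checkpoint data has been written.
    writer.finalize_checkpoint()
}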
+ pub fn finalize_checkpoint(self) -> DeltaResult<()> { + Ok(()) + } +} + +/// Builder for configuring and creating CheckpointWriter instances +/// +/// The CheckpointBuilder provides an interface for configuring checkpoint +/// generation. It handles table feature detection and enforces compatibility +/// between configuration options and table features. +pub struct CheckpointBuilder { + /// The table snapshot from which to create the checkpoint + snapshot: Snapshot, + + /// Whether to use classic naming for the checkpoint file + with_classic_naming: bool, +} + +impl CheckpointBuilder { + /// Creates a new CheckpointBuilder with the given snapshot + pub(crate) fn new(snapshot: Snapshot) -> Self { + Self { + snapshot, + with_classic_naming: false, + } + } + + /// Configures the builder to use classic naming scheme + /// + /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. + /// For V2 checkpoints, the default is UUID naming unless this method is called. + pub fn with_classic_naming(mut self, with_classic_naming: bool) -> Self { + self.with_classic_naming = with_classic_naming; + self + } + + /// Builds a CheckpointWriter based on the configuration + /// + /// This method validates the configuration against table features and creates + /// a CheckpointWriter for the appropriate checkpoint type. It performs protocol + /// table feature checks to determine if v2Checkpoints are supported. + /// + /// # Arguments + /// * `engine` - The engine implementation for data operations + /// + /// # Returns + /// * `DeltaResult` - A configured checkpoint writer on success, + /// or an error if the configuration is incompatible with table features + pub fn build(self, engine: &dyn Engine) -> DeltaResult { + let v2_checkpoints_supported = self + .snapshot + .table_configuration() + .is_v2_checkpoint_supported(); + + let deleted_file_retention_timestamp = self.deleted_file_retention_timestamp()?; + + // Create counters for tracking actions + let total_actions_counter = Arc::new(AtomicU64::new(0)); + let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + + // Create iterator over actions for checkpoint data + let checkpoint_data = checkpoint_actions_iter( + self.replay_for_checkpoint_data(engine)?, + total_actions_counter.clone(), + total_add_actions_counter.clone(), + deleted_file_retention_timestamp, + ); + + // Generate checkpoint path based on builder configuration + // Classic naming is required for V1 checkpoints and optional for V2 checkpoints + let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { + ParsedLogPath::new_classic_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + )? + } else { + ParsedLogPath::new_uuid_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + )? + }; + + let data = SingleFileCheckpointData { + data: Box::new(checkpoint_data), + path: checkpoint_path.location, + }; + + Ok(CheckpointWriter::new( + Some(data), + total_actions_counter, + total_add_actions_counter, + )) + } + + /// Prepares the iterator over actions for checkpoint creation + /// + /// This method is factored out to facilitate testing and returns an iterator + /// over all actions to be included in the checkpoint. 
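For reference, a small sketch of the file names produced by the naming branch in `build()` above (illustration only, not part of this patch); the UUID shown is a placeholder, since `new_uuid_parquet_checkpoint` generates a fresh v4 UUID on every call.

// classic naming: _delta_log/00000000000000000002.checkpoint.parquet
// UUID naming:    _delta_log/00000000000000000002.checkpoint.<some-uuid-v4>.parquet

/// Minimal reproduction of the zero-padded classic checkpoint file name.
fn classic_checkpoint_filename(version: u64) -> String {
    format!("{:020}.checkpoint.parquet", version)
}

#[test]
fn classic_checkpoint_filename_is_zero_padded() {
    assert_eq!(
        classic_checkpoint_filename(2),
        "00000000000000000002.checkpoint.parquet"
    );
}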
+ fn replay_for_checkpoint_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult, bool)>> + Send> { + let read_schema = get_checkpoint_read_schema(); + + self.snapshot.log_segment().read_actions( + engine, + read_schema.clone(), + read_schema.clone(), + None, + ) + } + + /// Calculates the cutoff timestamp for deleted file cleanup. + /// + /// This function determines the minimum timestamp before which deleted files + /// will be permanently removed during VACUUM operations, based on the table's + /// deleted_file_retention_duration property. + /// + /// Returns the cutoff timestamp in milliseconds since epoch, matching + /// the remove action's deletion_timestamp format for comparison. + /// + /// The default retention period is 7 days, matching delta-spark's behavior. + pub(crate) fn deleted_file_retention_timestamp(&self) -> DeltaResult { + let retention_duration = self + .snapshot + .table_properties() + .deleted_file_retention_duration; + + deleted_file_retention_timestamp_with_time( + retention_duration, + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, + ) + } +} + +/// Internal implementation with injectable time parameter for testing +fn deleted_file_retention_timestamp_with_time( + retention_duration: Option, + now_duration: Duration, +) -> DeltaResult { + // Use provided retention duration or default (7 days) + let retention_duration = + retention_duration.unwrap_or_else(|| Duration::from_secs(60 * 60 * 24 * 7)); + + // Convert to milliseconds for remove action deletion_timestamp comparison + let now_ms: i64 = now_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Current timestamp exceeds i64 millisecond range"))?; + + let retention_ms: i64 = retention_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Retention duration exceeds i64 millisecond range"))?; + + // Simple subtraction - will produce negative values if retention > now + Ok(now_ms - retention_ms) +} + +#[cfg(test)] +mod unit_tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { + let now = Duration::from_secs(1000).as_millis() as i64; + + // Test cases + let test_cases = [ + // Default case (7 days) + (None, now - (7 * 24 * 60 * 60 * 1000)), + // Zero retention + (Some(Duration::from_secs(0)), now), + // Custom retention (2000 seconds) + // This results in a negative timestamp which is valid - as it just means that + // the retention window extends to before UNIX epoch. + (Some(Duration::from_secs(2000)), now - (2000 * 1000)), + ]; + + for (retention, expected) in test_cases { + let result = + deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; + assert_eq!(result, expected); + } + + Ok(()) + } +} diff --git a/kernel/src/checkpoints/tests.rs b/kernel/src/checkpoints/tests.rs new file mode 100644 index 000000000..6975e73bc --- /dev/null +++ b/kernel/src/checkpoints/tests.rs @@ -0,0 +1,183 @@ +use std::sync::Arc; + +use object_store::{memory::InMemory, path::Path, ObjectStore}; +use test_utils::delta_path_for_version; +use url::Url; + +use crate::{ + actions::{Add, Metadata, Protocol, Remove}, + engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + utils::test_utils::Action, + DeltaResult, Table, +}; + +// Create an in-memory store and return the store and the URL for the store's _delta_log directory. 
+fn new_in_memory_store() -> (Arc, Url) { + ( + Arc::new(InMemory::new()), + Url::parse("memory:///") + .unwrap() + .join("_delta_log/") + .unwrap(), + ) +} + +/// Writes all actions to a _delta_log json commit file in the store. +/// This function formats the provided filename into the _delta_log directory. +fn write_commit_to_store( + store: &Arc, + actions: Vec, + version: u64, +) -> DeltaResult<()> { + let json_lines: Vec = actions + .into_iter() + .map(|action| serde_json::to_string(&action).expect("action to string")) + .collect(); + let content = json_lines.join("\n"); + + let commit_path = format!("_delta_log/{}", delta_path_for_version(version, "json")); + + tokio::runtime::Runtime::new() + .expect("create tokio runtime") + .block_on(async { store.put(&Path::from(commit_path), content.into()).await })?; + + Ok(()) +} + +#[test] +fn test_checkpoint_latest_version_by_default() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit: adds `fake_path_1` + write_commit_to_store( + &store, + vec![Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + })], + 0, + )?; + + // 2nd commit: adds `fake_path_2` & removes `fake_path_1` + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "fake_path_2".into(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + }), + ], + 1, + )?; + + // 3rd commit: metadata & protocol actions + write_commit_to_store( + &store, + vec![ + Action::Metadata(Metadata { + id: "fake_path_1".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + ], + 2, + )?; + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?; + let checkpoint_data = checkpointer.get_checkpoint_info()?; + let mut data_iter = checkpoint_data.data; + assert_eq!( + checkpoint_data.path, + Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? + ); + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The second batch should be the add action as the remove action is expired. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, false]); + + // The third batch should not be included as the selection vector does not + // contain any true values, as the add action is removed in a following commit. 
+ assert!(data_iter.next().is_none()); + + Ok(()) +} + +/// Test that `checkpoint` works with a specific version parameter +#[test] +fn test_checkpoint_specific_version() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // Create test actions + // 1st commit (version 0) - metadata and protocol actions + write_commit_to_store( + &store, + vec![ + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + Action::Metadata(Metadata { + id: "test-table-v0".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + ], + 0, + )?; + + // 2nd commit (version 1) - add and remove actions + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "file1.parquet".into(), + data_change: true, + ..Default::default() + }), + Action::Add(Add { + path: "file2.parquet".into(), + data_change: true, + ..Default::default() + }), + ], + 1, + )?; + + // Initialize the table + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + + // Create the V1CheckpointFileIterator for version 1 specifically + let mut checkpointer = table.checkpoint(&engine, Some(0))?.build(&engine)?; + + // Get the file data iterator + let checkpoint_data = checkpointer.get_checkpoint_info()?; + + // Verify checkpoint file path is for version 0 + let expected_path = Url::parse("memory:///_delta_log/00000000000000000000.checkpoint.parquet")?; + assert_eq!(checkpoint_data.path, expected_path); + + let mut data_iter = checkpoint_data.data; + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // No more data should exist because we only requested version 0 + assert!(data_iter.next().is_none()); + + Ok(()) +} diff --git a/kernel/src/path.rs b/kernel/src/path.rs index df372f08e..1cdab91a9 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -2,6 +2,7 @@ use std::str::FromStr; use url::Url; +use uuid::Uuid; use crate::{DeltaResult, Error, FileMeta, Version}; @@ -196,6 +197,42 @@ impl ParsedLogPath { } Ok(path) } + + /// Create a new ParsedCommitPath for a classic-named parquet checkpoint file at the specified version + pub(crate) fn new_classic_parquet_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + let filename = format!("{:020}.checkpoint.parquet", version); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? + .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_classic_parquet_checkpoint created a non-checkpoint path", + )); + } + Ok(path) + } + + /// Create a new ParsedCommitPath for a uuid-named parquet checkpoint file at the specified version + pub(crate) fn new_uuid_parquet_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + // Generate a random UUID v4 + let uuid = Uuid::new_v4().to_string(); + let filename = format!("{:020}.checkpoint.{}.parquet", version, uuid); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? 
+ .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_uuid_parquet_checkpoint created a non-checkpoint path", + )); + } + Ok(path) + } } #[cfg(test)] @@ -566,4 +603,45 @@ mod tests { assert!(matches!(log_path.file_type, LogPathFileType::Commit)); assert_eq!(log_path.filename, "00000000000000000010.json"); } + #[test] + fn test_new_uuid_parquet_checkpoint() { + let table_log_dir = table_log_dir_url(); + let log_path = ParsedLogPath::new_uuid_parquet_checkpoint(&table_log_dir, 10).unwrap(); + + // Basic properties + assert_eq!(log_path.version, 10); + assert!(log_path.is_checkpoint()); + assert_eq!(log_path.extension, "parquet"); + assert!(matches!( + log_path.file_type, + LogPathFileType::UuidCheckpoint(_) + )); + + // Filename structure + let parts: Vec<&str> = log_path.filename.split('.').collect(); + assert_eq!(parts.len(), 4); + assert_eq!(parts[0], "00000000000000000010"); + assert_eq!(parts[1], "checkpoint"); + assert_eq!(parts[3], "parquet"); + + // Validate UUID + assert!(!parts[2].is_empty()); + assert!(Uuid::parse_str(parts[2]).is_ok()); + } + + #[test] + fn test_new_classic_parquet_checkpoint() { + let table_log_dir = table_log_dir_url(); + let log_path = ParsedLogPath::new_classic_parquet_checkpoint(&table_log_dir, 10).unwrap(); + + // Basic properties + assert_eq!(log_path.version, 10); + assert!(log_path.is_checkpoint()); + assert_eq!(log_path.extension, "parquet"); + assert!(matches!( + log_path.file_type, + LogPathFileType::SinglePartCheckpoint + )); + assert_eq!(log_path.filename, "00000000000000000010.checkpoint.parquet"); + } } diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 97e1596d7..36bdc4743 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -7,6 +7,7 @@ use std::path::PathBuf; use url::Url; +use crate::checkpoints::CheckpointBuilder; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -98,6 +99,20 @@ impl Table { ) } + /// Creates a [`CheckpointBuilder`] for generating table checkpoints. + /// + /// Checkpoints are compact representations of the table state that improve reading performance. + /// Supports three checkpoint types: Classic V1 (legacy tables), Classic V2 (backwards + /// compatibility), and UUID V2 (recommended for small/medium tables with v2Checkpoints feature). + pub fn checkpoint( + &self, + engine: &dyn Engine, + version: Option, + ) -> DeltaResult { + let snapshot = self.snapshot(engine, version)?; + Ok(CheckpointBuilder::new(snapshot)) + } + /// Create a new write transaction for this table. pub fn new_transaction(&self, engine: &dyn Engine) -> DeltaResult { Transaction::try_new(self.snapshot(engine, None)?) diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index e2d287b60..3b659615d 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -238,6 +238,25 @@ impl TableConfiguration { version => (2..=6).contains(&version), } } + + /// Returns `true` if V2 checkpoint is supported on this table. To support V2 checkpoint, + /// a table must support reader version 3, writer version 7, and the v2Checkpoint feature in + /// both the protocol's readerFeatures and writerFeatures. 
+ /// + /// See: + #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] + #[allow(unused)] // needed to compile w/o default features + pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { + let read_supported = self + .protocol() + .has_reader_feature(&ReaderFeatures::V2Checkpoint) + && self.protocol.min_reader_version() == 3; + let write_supported = self + .protocol() + .has_writer_feature(&WriterFeatures::V2Checkpoint) + && self.protocol.min_writer_version() == 7; + read_supported && write_supported + } } #[cfg(test)] From 5abba3daf12a4a934a35b8cb42e91e07a7bdfcca Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:42:46 -0700 Subject: [PATCH 37/45] usize & ulong type conversions --- ffi/src/expressions/kernel.rs | 15 ++++ ffi/src/schema.rs | 20 +++++ kernel/src/actions/schemas.rs | 2 + kernel/src/engine/arrow_conversion.rs | 14 +++ kernel/src/engine/arrow_expression/mod.rs | 29 ++++++ .../src/engine/parquet_row_group_skipping.rs | 89 +++++++++++++++++++ kernel/src/expressions/scalars.rs | 26 ++++++ kernel/src/schema/mod.rs | 10 +++ 8 files changed, 205 insertions(+) diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs index c8ce1b2d4..5abe69f77 100644 --- a/ffi/src/expressions/kernel.rs +++ b/ffi/src/expressions/kernel.rs @@ -65,6 +65,10 @@ pub struct EngineExpressionVisitor { pub visit_literal_int: VisitLiteralFn, /// Visit a 64bit `long` belonging to the list identified by `sibling_list_id`. pub visit_literal_long: VisitLiteralFn, + /// Visit a 64bit unsigned `long` belonging to the list identified by `sibling_list_id`. + pub visit_literal_ulong: VisitLiteralFn, + /// Visit a 32bit unsigned `integer` int belonging to the list identified by `sibling_list_id`. + pub visit_literal_uint: VisitLiteralFn, /// Visit a 16bit `short` belonging to the list identified by `sibling_list_id`. pub visit_literal_short: VisitLiteralFn, /// Visit an 8bit `byte` belonging to the list identified by `sibling_list_id`. @@ -292,6 +296,17 @@ fn visit_expression_internal( match scalar { Scalar::Integer(val) => call!(visitor, visit_literal_int, sibling_list_id, *val), Scalar::Long(val) => call!(visitor, visit_literal_long, sibling_list_id, *val), + Scalar::ULong(val) => call!(visitor, visit_literal_ulong, sibling_list_id, *val), // TODO: Fix typecast + Scalar::USize(val) => { + #[cfg(target_pointer_width = "32")] + { + call!(visitor, visit_literal_uint, sibling_list_id, *val as u64) + } + #[cfg(target_pointer_width = "64")] + { + call!(visitor, visit_literal_ulong, sibling_list_id, *val as u64) + } + } Scalar::Short(val) => call!(visitor, visit_literal_short, sibling_list_id, *val), Scalar::Byte(val) => call!(visitor, visit_literal_byte, sibling_list_id, *val), Scalar::Float(val) => call!(visitor, visit_literal_float, sibling_list_id, *val), diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index a474c80c3..0cd1ed423 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -102,6 +102,24 @@ pub struct EngineSchemaVisitor { metadata: &CStringMap, ), + /// Visit a `ulong` belonging to the list identified by `sibling_list_id`. + pub visit_ulong: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + name: KernelStringSlice, + is_nullable: bool, + metadata: &CStringMap, + ), + + /// Visit a `usize` belonging to the list identified by `sibling_list_id`. 
+ pub visit_usize: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + name: KernelStringSlice, + is_nullable: bool, + metadata: &CStringMap, + ), + /// Visit an `integer` belonging to the list identified by `sibling_list_id`. pub visit_integer: extern "C" fn( data: *mut c_void, @@ -308,6 +326,8 @@ fn visit_schema_impl(schema: &StructType, visitor: &mut EngineSchemaVisitor) -> } &DataType::STRING => call!(visit_string), &DataType::LONG => call!(visit_long), + &DataType::ULONG => call!(visit_ulong), + &DataType::USIZE => call!(visit_usize), &DataType::INTEGER => call!(visit_integer), &DataType::SHORT => call!(visit_short), &DataType::BYTE => call!(visit_byte), diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index aa3b3e47b..dfccbd028 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -36,6 +36,8 @@ macro_rules! impl_to_data_type { impl_to_data_type!( (String, DataType::STRING), + (u64, DataType::ULONG), + (usize, DataType::USIZE), (i64, DataType::LONG), (i32, DataType::INTEGER), (i16, DataType::SHORT), diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index a425cd143..6242d27bd 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -100,6 +100,20 @@ impl TryFrom<&DataType> for ArrowDataType { match p { PrimitiveType::String => Ok(ArrowDataType::Utf8), PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::ULong => Ok(ArrowDataType::UInt64), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct arrow type to use. + PrimitiveType::USize => { + #[cfg(target_pointer_width = "32")] + { + Ok(ArrowDataType::UInt32) + } + + #[cfg(target_pointer_width = "64")] + { + Ok(ArrowDataType::UInt64) + } + } PrimitiveType::Integer => Ok(ArrowDataType::Int32), PrimitiveType::Short => Ok(ArrowDataType::Int16), PrimitiveType::Byte => Ok(ArrowDataType::Int8), diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 6e15d10bd..621ff7755 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -19,6 +19,7 @@ use crate::expressions::{Expression, Scalar}; use crate::schema::{DataType, PrimitiveType, SchemaRef}; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; +use arrow_53::array::UInt64Array; use itertools::Itertools; use tracing::debug; @@ -40,6 +41,20 @@ impl Scalar { let arr: ArrayRef = match self { Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), + ULong(val) => Arc::new(UInt64Array::from_value(*val, num_rows)), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct array type to use. 
+ USize(val) => { + #[cfg(target_pointer_width = "32")] + { + Arc::new(UInt32Array::from_value(*val as u32, num_rows)) + } + + #[cfg(target_pointer_width = "64")] + { + Arc::new(UInt64Array::from_value(*val as u64, num_rows)) + } + } Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), @@ -88,6 +103,20 @@ impl Scalar { Null(DataType::SHORT) => Arc::new(Int16Array::new_null(num_rows)), Null(DataType::INTEGER) => Arc::new(Int32Array::new_null(num_rows)), Null(DataType::LONG) => Arc::new(Int64Array::new_null(num_rows)), + Null(DataType::ULONG) => Arc::new(UInt64Array::new_null(num_rows)), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct array type to use. + Null(DataType::USIZE) => { + #[cfg(target_pointer_width = "32")] + { + Arc::new(UInt32Array::new_null(num_rows)) + } + + #[cfg(target_pointer_width = "64")] + { + Arc::new(UInt64Array::new_null(num_rows)) + } + } Null(DataType::FLOAT) => Arc::new(Float32Array::new_null(num_rows)), Null(DataType::DOUBLE) => Arc::new(Float64Array::new_null(num_rows)), Null(DataType::STRING) => Arc::new(StringArray::new_null(num_rows)), diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index fbce2f913..c9d78fbdd 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -105,6 +105,50 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.min_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), (Long, _) => return None, + (ULong, Statistics::Int64(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + // Handling ULong type with Int32 statistics + (ULong, Statistics::Int32(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + (ULong, _) => return None, + // Handling USize type on 64-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + // Handling USize type on 64-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int32(s)) => + // Attempt to convert value to usize, converting from u64 if needed + { + usize::try_from(*s.min_opt()? as u64).ok()?.into() + } + // Handling USize type on 32-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, ensuring it's cast to u32 first + { + usize::try_from(*s.min_opt()? as u32).ok()?.into() + } + + // Handling USize type on 32-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int32(s)) => + // Attempt to convert vvalue to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + (USize, _) => return None, (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.min_opt()? 
as i16).into(), @@ -147,6 +191,51 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.max_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), (Long, _) => return None, + (ULong, Statistics::Int64(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + // Handling ULong type with Int32 statistics + (ULong, Statistics::Int32(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + (ULong, _) => return None, + // Handling USize type on 64-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + // Handling USize type on 64-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int32(s)) => + // Attempt to convert value to usize, converting from u64 if needed + { + usize::try_from(*s.min_opt()? as u64).ok()?.into() + } + // Handling USize type on 32-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, ensuring it's cast to u32 first + { + usize::try_from(*s.min_opt()? as u32).ok()?.into() + } + + // Handling USize type on 32-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int32(s)) => + // Attempt to convert vvalue to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + (USize, _) => return None, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.max_opt()? 
as i16).into(), diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 90f5358a6..f5e887fad 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -96,6 +96,10 @@ pub enum Scalar { Integer(i32), /// 64bit integer Long(i64), + // unsigned 64bit integer + ULong(u64), + // usize + USize(usize), /// 16bit integer Short(i16), /// 8bit integer @@ -131,6 +135,8 @@ impl Scalar { match self { Self::Integer(_) => DataType::INTEGER, Self::Long(_) => DataType::LONG, + Self::ULong(_) => DataType::ULONG, + Self::USize(_) => DataType::USIZE, Self::Short(_) => DataType::SHORT, Self::Byte(_) => DataType::BYTE, Self::Float(_) => DataType::FLOAT, @@ -169,6 +175,8 @@ impl Display for Scalar { match self { Self::Integer(i) => write!(f, "{}", i), Self::Long(i) => write!(f, "{}", i), + Self::ULong(i) => write!(f, "{}", i), + Self::USize(i) => write!(f, "{}", i), Self::Short(i) => write!(f, "{}", i), Self::Byte(i) => write!(f, "{}", i), Self::Float(fl) => write!(f, "{}", fl), @@ -241,6 +249,10 @@ impl PartialOrd for Scalar { (Integer(_), _) => None, (Long(a), Long(b)) => a.partial_cmp(b), (Long(_), _) => None, + (ULong(a), ULong(b)) => a.partial_cmp(b), + (ULong(_), _) => None, + (USize(a), USize(b)) => a.partial_cmp(b), + (USize(_), _) => None, (Short(a), Short(b)) => a.partial_cmp(b), (Short(_), _) => None, (Byte(a), Byte(b)) => a.partial_cmp(b), @@ -338,6 +350,18 @@ impl From<&[u8]> for Scalar { } } +impl From for Scalar { + fn from(u: u64) -> Self { + Self::ULong(u.into()) + } +} + +impl From for Scalar { + fn from(u: usize) -> Self { + Self::USize(u.into()) + } +} + // TODO: add more From impls impl PrimitiveType { @@ -378,6 +402,8 @@ impl PrimitiveType { Short => self.parse_str_as_scalar(raw, Scalar::Short), Integer => self.parse_str_as_scalar(raw, Scalar::Integer), Long => self.parse_str_as_scalar(raw, Scalar::Long), + ULong => self.parse_str_as_scalar(raw, Scalar::ULong), + USize => self.parse_str_as_scalar(raw, Scalar::USize), Float => self.parse_str_as_scalar(raw, Scalar::Float), Double => self.parse_str_as_scalar(raw, Scalar::Double), Boolean => { diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs index 3a5648b57..ede497305 100644 --- a/kernel/src/schema/mod.rs +++ b/kernel/src/schema/mod.rs @@ -493,6 +493,12 @@ pub enum PrimitiveType { String, /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 Long, + /// u64: 8-byte unsigned integer. Range: 0 to 18446744073709551615 + ULong, + /// usize: Platform-dependent unsigned integer. Typically used for indexing and memory sizes. + /// - 64-bit platforms: Range 0 to 18_446_744_073_709_551_615 + /// - 32-bit platforms: Range 0 to 4_294_967_295 + USize, /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 Integer, /// i16: 2-byte signed integer numbers. 
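The new variants also thread through the rest of the `Scalar` API shown above (the `From` impls, `data_type`, `Display`, and `PartialOrd`). A usage sketch, assuming the crate's public `expressions`/`schema` paths and the state of the code as of this patch (these additions are reverted again in patch 44 of the series):

```rust
use delta_kernel::expressions::Scalar;
use delta_kernel::schema::DataType;

fn main() {
    // From<u64> and From<usize> map onto the new variants.
    let a: Scalar = 7u64.into();
    let b: Scalar = 7usize.into();
    assert_eq!(a.data_type(), DataType::ULONG);
    assert_eq!(b.data_type(), DataType::USIZE);

    // PartialOrd is only defined within a variant; mixed comparisons yield None.
    assert!(a.partial_cmp(&b).is_none());

    // Display prints the raw numeric value.
    assert_eq!(format!("{a}"), "7");
}
```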
Range: -32768 to 32767 @@ -559,6 +565,8 @@ impl Display for PrimitiveType { match self { PrimitiveType::String => write!(f, "string"), PrimitiveType::Long => write!(f, "long"), + PrimitiveType::ULong => write!(f, "ulong"), + PrimitiveType::USize => write!(f, "usize"), PrimitiveType::Integer => write!(f, "integer"), PrimitiveType::Short => write!(f, "short"), PrimitiveType::Byte => write!(f, "byte"), @@ -624,6 +632,8 @@ impl From for DataType { impl DataType { pub const STRING: Self = DataType::Primitive(PrimitiveType::String); pub const LONG: Self = DataType::Primitive(PrimitiveType::Long); + pub const ULONG: Self = DataType::Primitive(PrimitiveType::ULong); + pub const USIZE: Self = DataType::Primitive(PrimitiveType::USize); pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer); pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short); pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte); From 80fc9360ee690fa6d6210eeb08cdab4b27065fa2 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:50:34 -0700 Subject: [PATCH 38/45] finalize_checkpoint API --- kernel/src/actions/visitors.rs | 6 +- kernel/src/checkpoints/log_replay.rs | 20 +- kernel/src/checkpoints/mod.rs | 336 ++++++++++++++++++++++++++- kernel/src/transaction.rs | 1 + 4 files changed, 338 insertions(+), 25 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 76c83b1c3..d46aa8b0a 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -504,15 +504,15 @@ pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state pub(crate) deduplicator: FileActionDeduplicator<'seen>, pub(crate) selection_vector: Vec, - pub(crate) total_file_actions: u64, - pub(crate) total_add_actions: u64, + pub(crate) total_file_actions: i64, + pub(crate) total_add_actions: i64, pub(crate) minimum_file_retention_timestamp: i64, // Non-file actions deduplication state pub(crate) seen_protocol: bool, pub(crate) seen_metadata: bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) total_non_file_actions: u64, + pub(crate) total_non_file_actions: i64, } #[allow(unused)] diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index cbad63305..cab607324 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; @@ -27,10 +27,10 @@ struct CheckpointLogReplayProcessor { seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: Arc, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. 
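Stepping back from the field list: the action and add-file counters threaded through this processor are plain `Arc<AtomicI64>` handles shared with the eventual checkpoint writer, signed presumably so they can be dropped straight into the `long`-typed `_last_checkpoint` fields. A minimal sketch of that sharing pattern (the increment site is not shown in this hunk, so the `fetch_add` below is only illustrative; `SeqCst` matches the load used later during finalize):

```rust
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;

fn main() {
    // One handle stays with the log-replay side, the other with the writer
    // that later builds the _last_checkpoint metadata.
    let total_actions = Arc::new(AtomicI64::new(0));
    let replay_side = Arc::clone(&total_actions);

    // Each processed batch bumps the counter by the number of selected actions.
    for selected_in_batch in [2_i64, 5, 3] {
        replay_side.fetch_add(selected_in_batch, Ordering::SeqCst);
    }

    assert_eq!(total_actions.load(Ordering::SeqCst), 10);
}
```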
seen_protocol: bool, @@ -116,8 +116,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -143,8 +143,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -158,7 +158,7 @@ pub(crate) fn checkpoint_actions_iter( #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -172,8 +172,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicU64::new(0)); - let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index f5a764779..72311b259 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -27,11 +27,17 @@ //! 
``` use log_replay::{checkpoint_actions_iter, CheckpointData}; use std::{ - sync::{atomic::AtomicU64, Arc, LazyLock}, + sync::{ + atomic::{AtomicI64, Ordering}, + Arc, LazyLock, + }, time::{Duration, SystemTime, UNIX_EPOCH}, }; use url::Url; +use crate::actions::schemas::GetStructField; +use crate::expressions::column_expr; +use crate::schema::{SchemaRef, StructType}; use crate::{ actions::{ Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, @@ -39,15 +45,30 @@ use crate::{ }, path::ParsedLogPath, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, + DeltaResult, Engine, EngineData, Error, Expression, Version, }; - -use crate::actions::schemas::GetStructField; -use crate::schema::{SchemaRef, StructType}; pub mod log_replay; #[cfg(test)] mod tests; +/// Schema definition for the _last_checkpoint file +pub(crate) static CHECKPOINT_METADATA_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new(vec![ + ::get_struct_field("version"), + ::get_struct_field("size"), + Option::::get_struct_field("parts"), + Option::::get_struct_field("sizeInBytes"), + Option::::get_struct_field("numOfAddFiles"), + // Option::::get_struct_field("checkpoint_schema"), TODO: Schema + // Option::::get_struct_field("checksum"), TODO: Checksum + ])) +}); + +/// Get the expected schema for the _last_checkpoint file +pub fn get_checkpoint_metadata_schema() -> &'static SchemaRef { + &CHECKPOINT_METADATA_SCHEMA +} + /// Read schema definition for collecting checkpoint actions static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -90,23 +111,38 @@ pub struct CheckpointWriter { single_file_checkpoint_data: Option, /// Counter for the total number of actions in the checkpoint - total_actions_counter: Arc, + total_actions_counter: Arc, /// Counter for add file actions specifically - total_add_actions_counter: Arc, + total_add_actions_counter: Arc, + + /// Version of the checkpoint + version: Version, + + /// Number of parts of the checkpoint + parts: usize, + + /// Path to table's log + log_root: Url, } impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters fn new( single_file_checkpoint_data: Option, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + version: Version, + parts: usize, + log_root: Url, ) -> Self { Self { single_file_checkpoint_data, total_actions_counter, total_add_actions_counter, + version, + parts, + log_root, } } @@ -126,9 +162,75 @@ impl CheckpointWriter { /// This method should be only called AFTER writing all checkpoint data to /// ensure proper completion of the checkpoint operation, which includes /// writing the _last_checkpoint file. - pub fn finalize_checkpoint(self) -> DeltaResult<()> { + /// + /// Metadata is a single-row EngineData batch with {size_in_bytes: i64} + /// Given the engine collected checkpoint metadata we want to extend + /// the EngineData batch with the remaining fields for the `_last_checkpoint` + /// file. 
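Ahead of the `finalize_checkpoint` signature below, a hedged sketch of the engine-side call: the writer type and the method come from this patch, while the module path, the earlier step that writes the checkpoint file itself, and the one-row `sizeInBytes` batch are assumptions made only for illustration.

```rust
use delta_kernel::{DeltaResult, Engine, EngineData};
// Assumed module path for the new checkpoints module.
use delta_kernel::checkpoints::CheckpointWriter;

/// Engine-side sketch: by the time this runs, the engine has already written the
/// checkpoint file and packed its size into `metadata`, a single-row batch whose
/// only column is `sizeInBytes`.
fn finish_checkpoint(
    engine: &dyn Engine,
    writer: CheckpointWriter,
    metadata: Box<dyn EngineData>,
) -> DeltaResult<()> {
    // Consumes the writer and emits _last_checkpoint.json under the log root.
    writer.finalize_checkpoint(engine, &*metadata)
}
```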
+ + pub fn finalize_checkpoint( + self, + engine: &dyn Engine, + metadata: &dyn EngineData, + ) -> DeltaResult<()> { + // Prepare the checkpoint metadata + let checkpoint_metadata = self.prepare_last_checkpoint_metadata(engine, metadata)?; + + // Write the metadata to _last_checkpoint.json + let last_checkpoint_path = self.log_root.join("_last_checkpoint.json")?; + + engine.get_json_handler().write_json_file( + &last_checkpoint_path, + Box::new(std::iter::once(Ok(checkpoint_metadata))), + true, // overwrite the last checkpoint file + )?; + Ok(()) } + + /// Prepares the _last_checkpoint metadata batch + /// + /// This method validates and transforms the engine-provided metadata into + /// the complete checkpoint metadata including counters and versioning information. + /// + /// Refactored into a separate method to facilitate testing. + fn prepare_last_checkpoint_metadata( + &self, + engine: &dyn Engine, + metadata: &dyn EngineData, + ) -> DeltaResult> { + // Validate metadata has exactly one row + if metadata.len() != 1 { + return Err(Error::Generic(format!( + "Engine checkpoint metadata should have exactly one row, found {}", + metadata.len() + ))); + } + + // Create expression for transforming the metadata + let last_checkpoint_exprs = [ + Expression::literal(self.version), + Expression::literal(self.total_actions_counter.load(Ordering::SeqCst)), + Expression::literal(self.parts), + column_expr!("sizeInBytes"), + Expression::literal(self.total_add_actions_counter.load(Ordering::SeqCst)), + ]; + let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); + + // Get schemas for transformation + let last_checkpoint_schema = get_checkpoint_metadata_schema(); + let engine_metadata_schema = last_checkpoint_schema.project_as_struct(&["sizeInBytes"])?; + + // Create the evaluator for the transformation + let last_checkpoint_metadata_evaluator = engine.get_expression_handler().get_evaluator( + engine_metadata_schema.into(), + last_checkpoint_expr, + last_checkpoint_schema.clone().into(), + ); + + // Transform the metadata + Ok(last_checkpoint_metadata_evaluator.evaluate(metadata)?) 
+ } } /// Builder for configuring and creating CheckpointWriter instances @@ -183,8 +285,8 @@ impl CheckpointBuilder { let deleted_file_retention_timestamp = self.deleted_file_retention_timestamp()?; // Create counters for tracking actions - let total_actions_counter = Arc::new(AtomicU64::new(0)); - let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create iterator over actions for checkpoint data let checkpoint_data = checkpoint_actions_iter( @@ -217,6 +319,9 @@ impl CheckpointBuilder { Some(data), total_actions_counter, total_add_actions_counter, + self.snapshot.version(), + 1, + self.snapshot.log_segment().log_root.clone(), )) } @@ -290,7 +395,71 @@ fn deleted_file_retention_timestamp_with_time( #[cfg(test)] mod unit_tests { use super::*; + use crate::arrow::array::Int64Array; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; + use crate::arrow::record_batch::RecordBatch; + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::arrow_expression::ArrowExpressionHandler; + use crate::{ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler}; + use arrow_53::json::LineDelimitedWriter; + use std::sync::{atomic::AtomicI64, Arc}; use std::time::Duration; + use url::Url; + + // Helper to serialize and extract the _last_checkpoint JSON for verification + fn as_json(data: Box) -> serde_json::Value { + let record_batch: RecordBatch = data + .into_any() + .downcast::() + .unwrap() + .into(); + + let buf = Vec::new(); + let mut writer = LineDelimitedWriter::new(buf); + writer.write_batches(&[&record_batch]).unwrap(); + writer.finish().unwrap(); + let buf = writer.into_inner(); + + serde_json::from_slice(&buf).unwrap() + } + + // TODO(seb): Merge with other definitions and move to a common test module + pub(crate) struct ExprEngine(Arc); + + impl ExprEngine { + pub(crate) fn new() -> Self { + ExprEngine(Arc::new(ArrowExpressionHandler)) + } + } + + impl Engine for ExprEngine { + fn get_expression_handler(&self) -> Arc { + self.0.clone() + } + + fn get_json_handler(&self) -> Arc { + unimplemented!() + } + + fn get_parquet_handler(&self) -> Arc { + unimplemented!() + } + + fn get_file_system_client(&self) -> Arc { + unimplemented!() + } + } + + /// Creates a mock engine metadata batch with size_in_bytes field + fn create_engine_metadata(size_in_bytes: i64) -> Box { + // Create Arrow schema with size_in_bytes field + let schema = ArrowSchema::new(vec![Field::new("sizeInBytes", ArrowDataType::Int64, false)]); + + let size_array = Int64Array::from(vec![size_in_bytes]); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)]) + .expect("Failed to create record batch"); + Box::new(ArrowEngineData::new(record_batch)) + } #[test] fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { @@ -316,4 +485,147 @@ mod unit_tests { Ok(()) } + + #[test] + fn test_prepare_last_checkpoint_metadata() -> DeltaResult<()> { + // Setup test data + let size_in_bytes: i64 = 1024 * 1024; // 1MB + let version: Version = 10; + let parts: usize = 3; + let total_actions_counter = Arc::new(AtomicI64::new(100)); + let total_add_actions_counter = Arc::new(AtomicI64::new(75)); + + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + let engine = ExprEngine::new(); + + // Create engine metadata with size_in_bytes + let metadata = create_engine_metadata(size_in_bytes); + + // Create 
checkpoint writer + let writer = CheckpointWriter::new( + None, // We don't need checkpoint data for this test + total_actions_counter.clone(), + total_add_actions_counter.clone(), + version, + parts, + log_root, + ); + + // Call the method under test + let last_checkpoint_batch = writer.prepare_last_checkpoint_metadata(&engine, &*metadata)?; + + // Convert to JSON for easier verification + let json = as_json(last_checkpoint_batch); + + // Verify the values match our expectations + assert_eq!(json["version"], version); + assert_eq!(json["size"], total_actions_counter.load(Ordering::Relaxed)); + assert_eq!(json["parts"], parts as i64); + assert_eq!(json["sizeInBytes"], size_in_bytes); + assert_eq!( + json["numOfAddFiles"], + total_add_actions_counter.load(Ordering::Relaxed) + ); + + Ok(()) + } + + #[test] + fn test_prepare_last_checkpoint_metadata_with_empty_batch() { + // Setup test data + let version: Version = 10; + let parts: usize = 3; + let total_actions_counter = Arc::new(AtomicI64::new(100)); + let total_add_actions_counter = Arc::new(AtomicI64::new(75)); + + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + let engine = ExprEngine::new(); + + // Create empty metadata (no rows) + let empty_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "sizeInBytes", + ArrowDataType::Int64, + false, + )])); + let empty_batch = RecordBatch::try_new( + empty_schema, + vec![Arc::new(Int64Array::from(Vec::::new()))], + ) + .expect("Failed to create empty batch"); + let empty_metadata = ArrowEngineData::new(empty_batch); + + // Create checkpoint writer + let writer = CheckpointWriter::new( + None, + total_actions_counter, + total_add_actions_counter, + version, + parts, + log_root, + ); + + // Call the method under test - should fail with InvalidCommitInfo + let result = writer.prepare_last_checkpoint_metadata(&engine, &empty_metadata); + assert!(result.is_err()); + + match result { + Err(Error::Generic(e)) => { + assert_eq!( + e, + "Engine checkpoint metadata should have exactly one row, found 0" + ); + } + _ => panic!("Should have failed with error"), + } + } + + #[test] + fn test_prepare_last_checkpoint_metadata_with_multiple_rows() { + // Setup test data + let version: Version = 10; + let parts: usize = 1; + let total_actions_counter = Arc::new(AtomicI64::new(50)); + let total_add_actions_counter = Arc::new(AtomicI64::new(30)); + + // Create a log root URL + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + + // Create engine + let engine = ExprEngine::new(); + + // Create metadata with multiple rows + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "sizeInBytes", + ArrowDataType::Int64, + false, + )])); + let multi_row_batch = + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1024, 2048]))]) + .expect("Failed to create multi-row batch"); + let multi_row_metadata = ArrowEngineData::new(multi_row_batch); + + // Create checkpoint writer + let writer = CheckpointWriter::new( + None, + total_actions_counter, + total_add_actions_counter, + version, + parts, + log_root, + ); + + // Call the method under test - should fail with InvalidCommitInfo + let result = writer.prepare_last_checkpoint_metadata(&engine, &multi_row_metadata); + assert!(result.is_err()); + + match result { + Err(Error::Generic(e)) => { + assert_eq!( + e, + "Engine checkpoint metadata should have exactly one row, found 2" + ); + } + _ => panic!("Should have failed with error"), + } + } } diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs index 
138d4fdef..94abba4be 100644 --- a/kernel/src/transaction.rs +++ b/kernel/src/transaction.rs @@ -344,6 +344,7 @@ mod tests { use crate::arrow::json::writer::LineDelimitedWriter; use crate::arrow::record_batch::RecordBatch; + // TODO(seb): Merge with other definitions and move to a common test module struct ExprEngine(Arc); impl ExprEngine { From 5e4df58d91920692349c9b9ef4462b8cde57da5e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:55:39 -0700 Subject: [PATCH 39/45] nits --- kernel/src/checkpoints/mod.rs | 3 +-- kernel/src/engine/arrow_expression/mod.rs | 6 +++++- kernel/src/expressions/scalars.rs | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 72311b259..1b8ebf1b1 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -167,7 +167,6 @@ impl CheckpointWriter { /// Given the engine collected checkpoint metadata we want to extend /// the EngineData batch with the remaining fields for the `_last_checkpoint` /// file. - pub fn finalize_checkpoint( self, engine: &dyn Engine, @@ -229,7 +228,7 @@ impl CheckpointWriter { ); // Transform the metadata - Ok(last_checkpoint_metadata_evaluator.evaluate(metadata)?) + last_checkpoint_metadata_evaluator.evaluate(metadata) } } diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 621ff7755..1ff1834b7 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -1,6 +1,11 @@ //! Expression handling based on arrow-rs compute kernels. use std::sync::Arc; +#[cfg(target_pointer_width = "32")] +use crate::arrow::array::UInt32Array; +#[cfg(target_pointer_width = "64")] +use crate::arrow::array::UInt64Array; + use crate::arrow::array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, RecordBatch, @@ -19,7 +24,6 @@ use crate::expressions::{Expression, Scalar}; use crate::schema::{DataType, PrimitiveType, SchemaRef}; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; -use arrow_53::array::UInt64Array; use itertools::Itertools; use tracing::debug; diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index f5e887fad..55b22c982 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -352,13 +352,13 @@ impl From<&[u8]> for Scalar { impl From for Scalar { fn from(u: u64) -> Self { - Self::ULong(u.into()) + Self::ULong(u) } } impl From for Scalar { fn from(u: usize) -> Self { - Self::USize(u.into()) + Self::USize(u) } } From c8bcc2e466f8cf009f0ff972c589e5f3e38aa2a1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 14:01:49 -0700 Subject: [PATCH 40/45] ignore doc test --- kernel/src/checkpoints/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 1b8ebf1b1..813eefc24 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -9,7 +9,7 @@ //! The API is designed with a builder pattern for configuring and creating checkpoint writers. //! //! # Example -//! ``` +//! ```ignore //! let path = "./tests/data/app-txn-no-checkpoint"; //! let engine = Arc::new(SyncEngine::new()); //! 
let table = Table::try_from_uri(path)?; From c4ba531fef129aaa508362925d0d8a3849d37c98 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:06:31 -0700 Subject: [PATCH 41/45] rename and update struct types --- acceptance/tests/other.rs | 4 +-- kernel/src/actions/schemas.rs | 9 ++++++ kernel/src/log_segment.rs | 55 +++++++++++++++++++++++++-------- kernel/src/log_segment/tests.rs | 22 ++++++------- kernel/src/snapshot.rs | 15 ++++----- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/acceptance/tests/other.rs b/acceptance/tests/other.rs index 5a89f23de..daea7ac77 100644 --- a/acceptance/tests/other.rs +++ b/acceptance/tests/other.rs @@ -3,7 +3,7 @@ /// Since each new `.rs` file in this directory results in increased build and link time, it is /// important to only add new files if absolutely necessary for code readability or test /// performance. -use delta_kernel::snapshot::CheckpointMetadata; +use delta_kernel::snapshot::LastCheckpointHint; #[test] fn test_checkpoint_serde() { @@ -11,7 +11,7 @@ fn test_checkpoint_serde() { "./tests/dat/out/reader_tests/generated/with_checkpoint/delta/_delta_log/_last_checkpoint", ) .unwrap(); - let cp: CheckpointMetadata = serde_json::from_reader(file).unwrap(); + let cp: LastCheckpointHint = serde_json::from_reader(file).unwrap(); assert_eq!(cp.version, 2) } diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index aa3b3e47b..4ecb1d3fb 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -8,6 +8,15 @@ pub(crate) trait ToSchema { fn to_schema() -> StructType; } +/// Implement ToSchema for StructType to enable its use within Option fields +/// in schema-derived structs. This follows the system pattern where schema types +/// implement ToSchema rather than directly implementing ToDataType. +impl ToSchema for StructType { + fn to_schema() -> StructType { + StructType::new(vec![]) + } +} + pub(crate) trait ToDataType { fn to_data_type() -> DataType; } diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index c146f9eca..cbbd632f4 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -8,7 +8,7 @@ use crate::actions::{ }; use crate::path::{LogPathFileType, ParsedLogPath}; use crate::schema::SchemaRef; -use crate::snapshot::CheckpointMetadata; +use crate::snapshot::LastCheckpointHint; use crate::utils::require; use crate::{ DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileSystemClient, @@ -109,7 +109,7 @@ impl LogSegment { /// parts. All these parts will have the same checkpoint version. /// /// The options for constructing a LogSegment for Snapshot are as follows: - /// - `checkpoint_hint`: a `CheckpointMetadata` to start the log segment from (e.g. from reading the `last_checkpoint` file). + /// - `checkpoint_hint`: a `LastCheckpointHint` to start the log segment from (e.g. from reading the `last_checkpoint` file). /// - `time_travel_version`: The version of the log that the Snapshot will be at. /// /// [`Snapshot`]: crate::snapshot::Snapshot @@ -117,7 +117,7 @@ impl LogSegment { pub(crate) fn for_snapshot( fs_client: &dyn FileSystemClient, log_root: Url, - checkpoint_hint: impl Into>, + checkpoint_hint: impl Into>, time_travel_version: impl Into>, ) -> DeltaResult { let time_travel_version = time_travel_version.into(); @@ -127,7 +127,12 @@ impl LogSegment { (Some(cp), None) => { list_log_files_with_checkpoint(&cp, fs_client, &log_root, None)? 
} - (Some(cp), Some(end_version)) if cp.version <= end_version => { + // If type conversion fails, we skip the checkpoint hint and list all log files. + // Else, we check if the checkpoint hint's version is less than or equal to the + // time travel version. + (Some(cp), Some(end_version)) + if i64::try_from(end_version).map_or(false, |v| cp.version <= v) => + { list_log_files_with_checkpoint(&cp, fs_client, &log_root, Some(end_version))? } _ => list_log_files_with_version(fs_client, &log_root, None, time_travel_version)?, @@ -535,15 +540,26 @@ fn group_checkpoint_parts(parts: Vec) -> HashMap, ) -> DeltaResult<(Vec, Vec)> { + // Safely convert checkpoint_metadata.version (i64) to u64 for comparisons + let checkpoint_metadata_version = match u64::try_from(checkpoint_metadata.version) { + Ok(version) => version, + Err(e) => { + return Err(Error::InvalidCheckpoint(format!( + "Invalid checkpoint version (negative value): {}", + e + ))); + } + }; + let (commit_files, checkpoint_parts) = list_log_files_with_version( fs_client, log_root, - Some(checkpoint_metadata.version), + Some(checkpoint_metadata_version), end_version, )?; @@ -553,18 +569,31 @@ fn list_log_files_with_checkpoint( "Had a _last_checkpoint hint but didn't find any checkpoints", )); }; - if latest_checkpoint.version != checkpoint_metadata.version { + if latest_checkpoint.version != checkpoint_metadata_version { warn!( "_last_checkpoint hint is out of date. _last_checkpoint version: {}. Using actual most recent: {}", checkpoint_metadata.version, latest_checkpoint.version ); - } else if checkpoint_parts.len() != checkpoint_metadata.parts.unwrap_or(1) { - return Err(Error::InvalidCheckpoint(format!( - "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}", - checkpoint_metadata.parts.unwrap_or(1), - checkpoint_parts.len() - ))); + } else { + // Convert checkpoint_metadata.parts(i64) to usize for comparisons + let expected_parts = match usize::try_from(checkpoint_metadata.parts.unwrap_or(1)) { + Ok(parts) => parts, + Err(e) => { + return Err(Error::InvalidCheckpoint(format!( + "Invalid number of checkpoint parts (negative or too large): {}", + e + ))); + } + }; + + if checkpoint_parts.len() != expected_parts { + return Err(Error::InvalidCheckpoint(format!( + "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}", + expected_parts, + checkpoint_parts.len() + ))); + } } Ok((commit_files, checkpoint_parts)) } diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index d00ad235c..4d3e055e1 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -22,7 +22,7 @@ use crate::path::ParsedLogPath; use crate::scan::test_utils::{ add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, }; -use crate::snapshot::CheckpointMetadata; +use crate::snapshot::LastCheckpointHint; use crate::utils::test_utils::{assert_batch_matches, Action}; use crate::{ DeltaResult, Engine, EngineData, Expression, ExpressionRef, FileMeta, FileSystemClient, @@ -81,10 +81,10 @@ fn delta_path_for_multipart_checkpoint(version: u64, part_num: u32, num_parts: u } // Utility method to build a log using a list of log paths and an optional checkpoint hint. The -// CheckpointMetadata is written to `_delta_log/_last_checkpoint`. +// LastCheckpointHint is written to `_delta_log/_last_checkpoint`. 
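The guards above lean on std's fallible integer conversions, so malformed (for example negative) values read from `_last_checkpoint` surface as `Error::InvalidCheckpoint`, or make the hint comparison fall back to a full listing, instead of silently wrapping. The underlying behavior, as a self-contained check:

```rust
fn main() {
    // A negative version or part count from a corrupt hint fails the conversion...
    assert!(u64::try_from(-1_i64).is_err());
    assert!(usize::try_from(-3_i64).is_err());
    // ...while well-formed values pass through unchanged.
    assert_eq!(u64::try_from(5_i64).unwrap(), 5);
    // And an end_version that does not fit in i64 fails the is_ok_and guard,
    // so the hint is skipped and all log files are listed.
    assert!(i64::try_from(u64::MAX).is_err());
}
```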
fn build_log_with_paths_and_checkpoint( paths: &[Path], - checkpoint_metadata: Option<&CheckpointMetadata>, + checkpoint_metadata: Option<&LastCheckpointHint>, ) -> (Box, Url) { let store = Arc::new(InMemory::new()); @@ -269,7 +269,7 @@ fn build_snapshot_with_uuid_checkpoint_json() { #[test] fn build_snapshot_with_correct_last_uuid_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(1), @@ -347,7 +347,7 @@ fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { #[test] fn build_snapshot_with_out_of_date_last_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, @@ -384,7 +384,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { } #[test] fn build_snapshot_with_correct_last_multipart_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(3), @@ -427,7 +427,7 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { #[test] fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(3), @@ -462,7 +462,7 @@ fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { } #[test] fn build_snapshot_with_bad_checkpoint_hint_fails() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(1), @@ -536,7 +536,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo // When the _last_checkpoint is out of date and the most recent checkpoint is incomplete, the // Snapshot should be made of the most recent complete checkpoint and the commit files that // follow it. - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, @@ -625,7 +625,7 @@ fn build_snapshot_without_checkpoints() { #[test] fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: None, @@ -665,7 +665,7 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { #[test] fn build_snapshot_with_start_checkpoint_and_time_travel_version() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 1d91efeeb..b27693877 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -1,6 +1,7 @@ //! In-memory representation of snapshots of tables (snapshot is a table at given point in time, it //! has schema etc.) +use delta_kernel_derive::Schema; use serde::{Deserialize, Serialize}; use std::sync::Arc; use tracing::{debug, warn}; @@ -142,22 +143,22 @@ impl Snapshot { } } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, Schema)] #[serde(rename_all = "camelCase")] #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] -struct CheckpointMetadata { +struct LastCheckpointHint { /// The version of the table when the last checkpoint was made. 
#[allow(unreachable_pub)] // used by acceptance tests (TODO make an fn accessor?) - pub version: Version, + pub version: i64, // TODO: use Version type instead of i64 /// The number of actions that are stored in the checkpoint. pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. - pub(crate) parts: Option, + pub(crate) parts: Option, // TODO: use u64 instead /// The number of bytes of the checkpoint. - pub(crate) size_in_bytes: Option, + pub(crate) size_in_bytes: Option, // TODO: use u64 instead /// The number of AddFile actions in the checkpoint. - pub(crate) num_of_add_files: Option, + pub(crate) num_of_add_files: Option, // TODO: use u64 instead /// The schema of the checkpoint file. pub(crate) checkpoint_schema: Option, /// The checksum of the last checkpoint JSON. @@ -175,7 +176,7 @@ struct CheckpointMetadata { fn read_last_checkpoint( fs_client: &dyn FileSystemClient, log_root: &Url, -) -> DeltaResult> { +) -> DeltaResult> { let file_path = log_root.join(LAST_CHECKPOINT_FILE_NAME)?; match fs_client .read_files(vec![(file_path, None)]) From f935ad7e29936b73a3dc86892c182ed42e00e4c4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:24:35 -0700 Subject: [PATCH 42/45] doc update --- kernel/src/snapshot.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index b27693877..daa54a5ae 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -154,7 +154,7 @@ struct LastCheckpointHint { /// The number of actions that are stored in the checkpoint. pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. - pub(crate) parts: Option, // TODO: use u64 instead + pub(crate) parts: Option, // TODO: use usize instead /// The number of bytes of the checkpoint. pub(crate) size_in_bytes: Option, // TODO: use u64 instead /// The number of AddFile actions in the checkpoint. From 2a78848f06fa526893a8b914dc7f08bbda20399c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:31:16 -0700 Subject: [PATCH 43/45] fix build and docs --- kernel/src/log_segment.rs | 2 +- kernel/src/snapshot.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index cbbd632f4..389ff9cf8 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -131,7 +131,7 @@ impl LogSegment { // Else, we check if the checkpoint hint's version is less than or equal to the // time travel version. (Some(cp), Some(end_version)) - if i64::try_from(end_version).map_or(false, |v| cp.version <= v) => + if i64::try_from(end_version).is_ok_and(|v| cp.version <= v) => { list_log_files_with_checkpoint(&cp, fs_client, &log_root, Some(end_version))? } diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index daa54a5ae..6d87901be 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -98,7 +98,7 @@ impl Snapshot { self.table_configuration().version() } - /// Table [`Schema`] at this `Snapshot`s version. + /// Table [`type@Schema`] at this `Snapshot`s version. 
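Returning to `LastCheckpointHint` above: it keeps its serde derives alongside the new `Schema` derive, so `_last_checkpoint` still deserializes straight into it. A test-style sketch of that round trip, meant to sit inside the kernel crate (serde_json as a dev-dependency is assumed; absent optional fields deserialize to `None`):

```rust
use crate::snapshot::LastCheckpointHint;

#[test]
fn last_checkpoint_hint_from_json() {
    // Field names arrive camelCased, matching the rename_all attribute above.
    let raw = r#"{"version":2,"size":10,"parts":1,"sizeInBytes":100,"numOfAddFiles":5}"#;
    let hint: LastCheckpointHint = serde_json::from_str(raw).unwrap();
    assert_eq!(hint.version, 2);
    assert_eq!(hint.parts, Some(1));
    assert_eq!(hint.size_in_bytes, Some(100));
    assert_eq!(hint.num_of_add_files, Some(5));
}
```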
pub fn schema(&self) -> SchemaRef { self.table_configuration.schema() } From 6231d8419eacc24c484889ad11bbf71d0e21eeb8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 16:34:44 -0700 Subject: [PATCH 44/45] revert primtiive additions --- ffi/src/expressions/kernel.rs | 13 --- ffi/src/schema.rs | 20 ----- kernel/src/actions/schemas.rs | 2 - kernel/src/checkpoints/mod.rs | 55 +++++------- kernel/src/engine/arrow_conversion.rs | 14 --- kernel/src/engine/arrow_expression/mod.rs | 28 ------ .../src/engine/parquet_row_group_skipping.rs | 89 ------------------- kernel/src/expressions/scalars.rs | 26 ------ kernel/src/schema/mod.rs | 10 --- 9 files changed, 21 insertions(+), 236 deletions(-) diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs index 5abe69f77..9914758d5 100644 --- a/ffi/src/expressions/kernel.rs +++ b/ffi/src/expressions/kernel.rs @@ -66,8 +66,6 @@ pub struct EngineExpressionVisitor { /// Visit a 64bit `long` belonging to the list identified by `sibling_list_id`. pub visit_literal_long: VisitLiteralFn, /// Visit a 64bit unsigned `long` belonging to the list identified by `sibling_list_id`. - pub visit_literal_ulong: VisitLiteralFn, - /// Visit a 32bit unsigned `integer` int belonging to the list identified by `sibling_list_id`. pub visit_literal_uint: VisitLiteralFn, /// Visit a 16bit `short` belonging to the list identified by `sibling_list_id`. pub visit_literal_short: VisitLiteralFn, @@ -296,17 +294,6 @@ fn visit_expression_internal( match scalar { Scalar::Integer(val) => call!(visitor, visit_literal_int, sibling_list_id, *val), Scalar::Long(val) => call!(visitor, visit_literal_long, sibling_list_id, *val), - Scalar::ULong(val) => call!(visitor, visit_literal_ulong, sibling_list_id, *val), // TODO: Fix typecast - Scalar::USize(val) => { - #[cfg(target_pointer_width = "32")] - { - call!(visitor, visit_literal_uint, sibling_list_id, *val as u64) - } - #[cfg(target_pointer_width = "64")] - { - call!(visitor, visit_literal_ulong, sibling_list_id, *val as u64) - } - } Scalar::Short(val) => call!(visitor, visit_literal_short, sibling_list_id, *val), Scalar::Byte(val) => call!(visitor, visit_literal_byte, sibling_list_id, *val), Scalar::Float(val) => call!(visitor, visit_literal_float, sibling_list_id, *val), diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index 0cd1ed423..a474c80c3 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -102,24 +102,6 @@ pub struct EngineSchemaVisitor { metadata: &CStringMap, ), - /// Visit a `ulong` belonging to the list identified by `sibling_list_id`. - pub visit_ulong: extern "C" fn( - data: *mut c_void, - sibling_list_id: usize, - name: KernelStringSlice, - is_nullable: bool, - metadata: &CStringMap, - ), - - /// Visit a `usize` belonging to the list identified by `sibling_list_id`. - pub visit_usize: extern "C" fn( - data: *mut c_void, - sibling_list_id: usize, - name: KernelStringSlice, - is_nullable: bool, - metadata: &CStringMap, - ), - /// Visit an `integer` belonging to the list identified by `sibling_list_id`. 
pub visit_integer: extern "C" fn( data: *mut c_void, @@ -326,8 +308,6 @@ fn visit_schema_impl(schema: &StructType, visitor: &mut EngineSchemaVisitor) -> } &DataType::STRING => call!(visit_string), &DataType::LONG => call!(visit_long), - &DataType::ULONG => call!(visit_ulong), - &DataType::USIZE => call!(visit_usize), &DataType::INTEGER => call!(visit_integer), &DataType::SHORT => call!(visit_short), &DataType::BYTE => call!(visit_byte), diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index a0ec4b5be..4ecb1d3fb 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -45,8 +45,6 @@ macro_rules! impl_to_data_type { impl_to_data_type!( (String, DataType::STRING), - (u64, DataType::ULONG), - (usize, DataType::USIZE), (i64, DataType::LONG), (i32, DataType::INTEGER), (i16, DataType::SHORT), diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 813eefc24..6429df3fa 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -35,9 +35,12 @@ use std::{ }; use url::Url; -use crate::actions::schemas::GetStructField; use crate::expressions::column_expr; use crate::schema::{SchemaRef, StructType}; +use crate::{ + actions::schemas::{GetStructField, ToSchema}, + snapshot::LastCheckpointHint, +}; use crate::{ actions::{ Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, @@ -45,30 +48,12 @@ use crate::{ }, path::ParsedLogPath, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, Expression, Version, + DeltaResult, Engine, EngineData, Error, Expression, }; pub mod log_replay; #[cfg(test)] mod tests; -/// Schema definition for the _last_checkpoint file -pub(crate) static CHECKPOINT_METADATA_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new(StructType::new(vec![ - ::get_struct_field("version"), - ::get_struct_field("size"), - Option::::get_struct_field("parts"), - Option::::get_struct_field("sizeInBytes"), - Option::::get_struct_field("numOfAddFiles"), - // Option::::get_struct_field("checkpoint_schema"), TODO: Schema - // Option::::get_struct_field("checksum"), TODO: Checksum - ])) -}); - -/// Get the expected schema for the _last_checkpoint file -pub fn get_checkpoint_metadata_schema() -> &'static SchemaRef { - &CHECKPOINT_METADATA_SCHEMA -} - /// Read schema definition for collecting checkpoint actions static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -117,10 +102,10 @@ pub struct CheckpointWriter { total_add_actions_counter: Arc, /// Version of the checkpoint - version: Version, + version: i64, /// Number of parts of the checkpoint - parts: usize, + parts: i64, /// Path to table's log log_root: Url, @@ -132,8 +117,8 @@ impl CheckpointWriter { single_file_checkpoint_data: Option, total_actions_counter: Arc, total_add_actions_counter: Arc, - version: Version, - parts: usize, + version: i64, + parts: i64, log_root: Url, ) -> Self { Self { @@ -205,7 +190,6 @@ impl CheckpointWriter { metadata.len() ))); } - // Create expression for transforming the metadata let last_checkpoint_exprs = [ Expression::literal(self.version), @@ -214,11 +198,14 @@ impl CheckpointWriter { column_expr!("sizeInBytes"), Expression::literal(self.total_add_actions_counter.load(Ordering::SeqCst)), ]; + let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); // Get schemas for transformation - let last_checkpoint_schema = get_checkpoint_metadata_schema(); + let last_checkpoint_schema = LastCheckpointHint::to_schema(); + 
println!("last_checkpoint_schema: {:?}", last_checkpoint_schema); let engine_metadata_schema = last_checkpoint_schema.project_as_struct(&["sizeInBytes"])?; + println!("engine_metadata_schema: {:?}", engine_metadata_schema); // Create the evaluator for the transformation let last_checkpoint_metadata_evaluator = engine.get_expression_handler().get_evaluator( @@ -318,7 +305,7 @@ impl CheckpointBuilder { Some(data), total_actions_counter, total_add_actions_counter, - self.snapshot.version(), + self.snapshot.version() as i64, 1, self.snapshot.log_segment().log_root.clone(), )) @@ -489,8 +476,8 @@ mod unit_tests { fn test_prepare_last_checkpoint_metadata() -> DeltaResult<()> { // Setup test data let size_in_bytes: i64 = 1024 * 1024; // 1MB - let version: Version = 10; - let parts: usize = 3; + let version = 10; + let parts = 3; let total_actions_counter = Arc::new(AtomicI64::new(100)); let total_add_actions_counter = Arc::new(AtomicI64::new(75)); @@ -519,7 +506,7 @@ mod unit_tests { // Verify the values match our expectations assert_eq!(json["version"], version); assert_eq!(json["size"], total_actions_counter.load(Ordering::Relaxed)); - assert_eq!(json["parts"], parts as i64); + assert_eq!(json["parts"], parts); assert_eq!(json["sizeInBytes"], size_in_bytes); assert_eq!( json["numOfAddFiles"], @@ -532,8 +519,8 @@ mod unit_tests { #[test] fn test_prepare_last_checkpoint_metadata_with_empty_batch() { // Setup test data - let version: Version = 10; - let parts: usize = 3; + let version = 10; + let parts = 3; let total_actions_counter = Arc::new(AtomicI64::new(100)); let total_add_actions_counter = Arc::new(AtomicI64::new(75)); @@ -581,8 +568,8 @@ mod unit_tests { #[test] fn test_prepare_last_checkpoint_metadata_with_multiple_rows() { // Setup test data - let version: Version = 10; - let parts: usize = 1; + let version = 10; + let parts = 1; let total_actions_counter = Arc::new(AtomicI64::new(50)); let total_add_actions_counter = Arc::new(AtomicI64::new(30)); diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index 6242d27bd..a425cd143 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -100,20 +100,6 @@ impl TryFrom<&DataType> for ArrowDataType { match p { PrimitiveType::String => Ok(ArrowDataType::Utf8), PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type - PrimitiveType::ULong => Ok(ArrowDataType::UInt64), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct arrow type to use. - PrimitiveType::USize => { - #[cfg(target_pointer_width = "32")] - { - Ok(ArrowDataType::UInt32) - } - - #[cfg(target_pointer_width = "64")] - { - Ok(ArrowDataType::UInt64) - } - } PrimitiveType::Integer => Ok(ArrowDataType::Int32), PrimitiveType::Short => Ok(ArrowDataType::Int16), PrimitiveType::Byte => Ok(ArrowDataType::Int8), diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 1ff1834b7..432844d21 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -45,20 +45,6 @@ impl Scalar { let arr: ArrayRef = match self { Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), - ULong(val) => Arc::new(UInt64Array::from_value(*val, num_rows)), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct array type to use. 
- USize(val) => { - #[cfg(target_pointer_width = "32")] - { - Arc::new(UInt32Array::from_value(*val as u32, num_rows)) - } - - #[cfg(target_pointer_width = "64")] - { - Arc::new(UInt64Array::from_value(*val as u64, num_rows)) - } - } Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), @@ -107,20 +93,6 @@ impl Scalar { Null(DataType::SHORT) => Arc::new(Int16Array::new_null(num_rows)), Null(DataType::INTEGER) => Arc::new(Int32Array::new_null(num_rows)), Null(DataType::LONG) => Arc::new(Int64Array::new_null(num_rows)), - Null(DataType::ULONG) => Arc::new(UInt64Array::new_null(num_rows)), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct array type to use. - Null(DataType::USIZE) => { - #[cfg(target_pointer_width = "32")] - { - Arc::new(UInt32Array::new_null(num_rows)) - } - - #[cfg(target_pointer_width = "64")] - { - Arc::new(UInt64Array::new_null(num_rows)) - } - } Null(DataType::FLOAT) => Arc::new(Float32Array::new_null(num_rows)), Null(DataType::DOUBLE) => Arc::new(Float64Array::new_null(num_rows)), Null(DataType::STRING) => Arc::new(StringArray::new_null(num_rows)), diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index c9d78fbdd..fbce2f913 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -105,50 +105,6 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.min_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), (Long, _) => return None, - (ULong, Statistics::Int64(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - // Handling ULong type with Int32 statistics - (ULong, Statistics::Int32(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - (ULong, _) => return None, - // Handling USize type on 64-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - // Handling USize type on 64-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int32(s)) => - // Attempt to convert value to usize, converting from u64 if needed - { - usize::try_from(*s.min_opt()? as u64).ok()?.into() - } - // Handling USize type on 32-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, ensuring it's cast to u32 first - { - usize::try_from(*s.min_opt()? as u32).ok()?.into() - } - - // Handling USize type on 32-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int32(s)) => - // Attempt to convert vvalue to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - (USize, _) => return None, (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.min_opt()? 
as i16).into(), @@ -191,51 +147,6 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.max_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), (Long, _) => return None, - (ULong, Statistics::Int64(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - // Handling ULong type with Int32 statistics - (ULong, Statistics::Int32(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - (ULong, _) => return None, - // Handling USize type on 64-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - // Handling USize type on 64-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int32(s)) => - // Attempt to convert value to usize, converting from u64 if needed - { - usize::try_from(*s.min_opt()? as u64).ok()?.into() - } - // Handling USize type on 32-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, ensuring it's cast to u32 first - { - usize::try_from(*s.min_opt()? as u32).ok()?.into() - } - - // Handling USize type on 32-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int32(s)) => - // Attempt to convert vvalue to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - (USize, _) => return None, - (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.max_opt()? 
diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs
index 55b22c982..90f5358a6 100644
--- a/kernel/src/expressions/scalars.rs
+++ b/kernel/src/expressions/scalars.rs
@@ -96,10 +96,6 @@ pub enum Scalar {
     Integer(i32),
     /// 64bit integer
     Long(i64),
-    // unsigned 64bit integer
-    ULong(u64),
-    // usize
-    USize(usize),
     /// 16bit integer
     Short(i16),
     /// 8bit integer
@@ -135,8 +131,6 @@ impl Scalar {
         match self {
             Self::Integer(_) => DataType::INTEGER,
             Self::Long(_) => DataType::LONG,
-            Self::ULong(_) => DataType::ULONG,
-            Self::USize(_) => DataType::USIZE,
             Self::Short(_) => DataType::SHORT,
             Self::Byte(_) => DataType::BYTE,
             Self::Float(_) => DataType::FLOAT,
@@ -175,8 +169,6 @@ impl Display for Scalar {
         match self {
             Self::Integer(i) => write!(f, "{}", i),
             Self::Long(i) => write!(f, "{}", i),
-            Self::ULong(i) => write!(f, "{}", i),
-            Self::USize(i) => write!(f, "{}", i),
             Self::Short(i) => write!(f, "{}", i),
             Self::Byte(i) => write!(f, "{}", i),
             Self::Float(fl) => write!(f, "{}", fl),
@@ -249,10 +241,6 @@ impl PartialOrd for Scalar {
             (Integer(_), _) => None,
             (Long(a), Long(b)) => a.partial_cmp(b),
             (Long(_), _) => None,
-            (ULong(a), ULong(b)) => a.partial_cmp(b),
-            (ULong(_), _) => None,
-            (USize(a), USize(b)) => a.partial_cmp(b),
-            (USize(_), _) => None,
             (Short(a), Short(b)) => a.partial_cmp(b),
             (Short(_), _) => None,
             (Byte(a), Byte(b)) => a.partial_cmp(b),
@@ -350,18 +338,6 @@ impl From<&[u8]> for Scalar {
     }
 }
 
-impl From<u64> for Scalar {
-    fn from(u: u64) -> Self {
-        Self::ULong(u)
-    }
-}
-
-impl From<usize> for Scalar {
-    fn from(u: usize) -> Self {
-        Self::USize(u)
-    }
-}
-
 // TODO: add more From impls
 
 impl PrimitiveType {
@@ -402,8 +378,6 @@ impl PrimitiveType {
             Short => self.parse_str_as_scalar(raw, Scalar::Short),
             Integer => self.parse_str_as_scalar(raw, Scalar::Integer),
             Long => self.parse_str_as_scalar(raw, Scalar::Long),
-            ULong => self.parse_str_as_scalar(raw, Scalar::ULong),
-            USize => self.parse_str_as_scalar(raw, Scalar::USize),
             Float => self.parse_str_as_scalar(raw, Scalar::Float),
             Double => self.parse_str_as_scalar(raw, Scalar::Double),
             Boolean => {
diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs
index ede497305..3a5648b57 100644
--- a/kernel/src/schema/mod.rs
+++ b/kernel/src/schema/mod.rs
@@ -493,12 +493,6 @@ pub enum PrimitiveType {
     String,
     /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807
     Long,
-    /// u64: 8-byte unsigned integer. Range: 0 to 18446744073709551615
-    ULong,
-    /// usize: Platform-dependent unsigned integer. Typically used for indexing and memory sizes.
-    /// - 64-bit platforms: Range 0 to 18_446_744_073_709_551_615
-    /// - 32-bit platforms: Range 0 to 4_294_967_295
-    USize,
     /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647
     Integer,
     /// i16: 2-byte signed integer numbers. Range: -32768 to 32767
@@ -565,8 +559,6 @@ impl Display for PrimitiveType {
         match self {
             PrimitiveType::String => write!(f, "string"),
             PrimitiveType::Long => write!(f, "long"),
-            PrimitiveType::ULong => write!(f, "ulong"),
-            PrimitiveType::USize => write!(f, "usize"),
             PrimitiveType::Integer => write!(f, "integer"),
             PrimitiveType::Short => write!(f, "short"),
             PrimitiveType::Byte => write!(f, "byte"),
@@ -632,8 +624,6 @@ impl From for DataType {
 impl DataType {
     pub const STRING: Self = DataType::Primitive(PrimitiveType::String);
     pub const LONG: Self = DataType::Primitive(PrimitiveType::Long);
-    pub const ULONG: Self = DataType::Primitive(PrimitiveType::ULong);
-    pub const USIZE: Self = DataType::Primitive(PrimitiveType::USize);
     pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer);
     pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short);
     pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte);

From 3f2c0f9ac5ea7a2d5d145b4e13a50fef3ccb1e11 Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Mon, 31 Mar 2025 20:04:43 -0700
Subject: [PATCH 45/45] add the CheckpointMetadata action batch for v2 checkpoints

---
 kernel/Cargo.toml               |   4 +-
 kernel/src/actions/mod.rs       |  13 +++
 kernel/src/checkpoints/mod.rs   |  84 +++++++++++++++++--
 kernel/src/checkpoints/tests.rs | 139 ++++++++++++++++++++++++++++++++
 kernel/src/lib.rs               |   2 +-
 5 files changed, 230 insertions(+), 12 deletions(-)

diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml
index 5bc607c2a..6aa1052df 100644
--- a/kernel/Cargo.toml
+++ b/kernel/Cargo.toml
@@ -49,7 +49,7 @@ thiserror = "1"
 # only for structured logging
 tracing = { version = "0.1", features = ["log"] }
 url = "2"
-uuid = "1.10.0"
+uuid = { version = "1.10.0", features = ["v4", "fast-rng"] }
 z85 = "3.0.5"
 
 # bring in our derive macros
@@ -118,8 +118,6 @@ default-engine-base = [
     "need_arrow",
     "object_store",
     "tokio",
-    "uuid/v4",
-    "uuid/fast-rng",
 ]
 
 # the default-engine use the reqwest crate with default features which uses native-tls. if you want
diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs
index e30d3033b..a0f9fdecd 100644
--- a/kernel/src/actions/mod.rs
+++ b/kernel/src/actions/mod.rs
@@ -544,6 +544,19 @@ pub(crate) struct SetTransaction {
     pub(crate) last_updated: Option<i64>,
 }
 
+/// The CheckpointMetadata action describes details about a checkpoint following the V2 specification.
+///
+/// [More info]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-metadata
+#[derive(Schema, Debug, PartialEq)]
+#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
+pub(crate) struct CheckpointMetadata {
+    /// The version of the V2 spec checkpoint.
+    pub(crate) version: i64,
+
+    /// Map containing any additional metadata about the V2 spec checkpoint.
+    pub(crate) tags: Option<HashMap<String, String>>,
+}
+
 /// The sidecar action references a sidecar file which provides some of the checkpoint's
 /// file actions. This action is only allowed in checkpoints following the V2 spec.
 ///
diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs
index 6429df3fa..66b2aced3 100644
--- a/kernel/src/checkpoints/mod.rs
+++ b/kernel/src/checkpoints/mod.rs
@@ -35,12 +35,12 @@ use std::{
 };
 use url::Url;
 
-use crate::expressions::column_expr;
-use crate::schema::{SchemaRef, StructType};
 use crate::{
     actions::schemas::{GetStructField, ToSchema},
+    expressions::Scalar,
     snapshot::LastCheckpointHint,
 };
+use crate::{actions::CheckpointMetadata, expressions::column_expr};
 use crate::{
     actions::{
         Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME,
@@ -50,6 +50,10 @@ use crate::{
     snapshot::Snapshot,
     DeltaResult, Engine, EngineData, Error, Expression,
 };
+use crate::{
+    schema::{SchemaRef, StructType},
+    ExpressionHandlerExtension,
+};
 pub mod log_replay;
 #[cfg(test)]
 mod tests;
@@ -282,6 +286,13 @@ impl CheckpointBuilder {
             deleted_file_retention_timestamp,
         );
 
+        // Chain the result of create_checkpoint_metadata_batch to the checkpoint data
+        let chained = checkpoint_data.chain(create_checkpoint_metadata_batch(
+            self.snapshot.version() as i64,
+            engine,
+            v2_checkpoints_supported,
+        )?);
+
         // Generate checkpoint path based on builder configuration
         // Classic naming is required for V1 checkpoints and optional for V2 checkpoints
         let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported {
@@ -296,13 +307,11 @@ impl CheckpointBuilder {
             )?
         };
 
-        let data = SingleFileCheckpointData {
-            data: Box::new(checkpoint_data),
-            path: checkpoint_path.location,
-        };
-
         Ok(CheckpointWriter::new(
-            Some(data),
+            Some(SingleFileCheckpointData {
+                data: Box::new(chained),
+                path: checkpoint_path.location,
+            }),
             total_actions_counter,
             total_add_actions_counter,
             self.snapshot.version() as i64,
@@ -378,6 +387,37 @@ fn deleted_file_retention_timestamp_with_time(
     Ok(now_ms - retention_ms)
 }
 
+/// Create a batch with a single row containing the [`CheckpointMetadata`] action
+/// for the V2 spec checkpoint.
+///
+/// This method calls the create_one method on the expression handler to create
+/// a single-row batch with the checkpoint metadata action. The method returns:
+/// - None if the checkpoint is not a V2 checkpoint
+/// - Some(Ok(batch)) if the batch was successfully created
+fn create_checkpoint_metadata_batch(
+    version: i64,
+    engine: &dyn Engine,
+    is_v2_checkpoint: bool,
+) -> DeltaResult<Option<DeltaResult<CheckpointData>>> {
+    if is_v2_checkpoint {
+        let values: &[Scalar] = &[version.into()];
+        let checkpoint_metadata_batch = engine.get_expression_handler().create_one(
+            // TODO: Include checkpointMetadata.tags when maps are supported
+            Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?),
+            &values,
+        )?;
+
+        let result = CheckpointData {
+            data: checkpoint_metadata_batch,
+            selection_vector: vec![true],
+        };
+
+        Ok(Some(Ok(result)))
+    } else {
+        Ok(None)
+    }
+}
+
 #[cfg(test)]
 mod unit_tests {
     use super::*;
@@ -614,4 +654,32 @@ mod unit_tests {
             _ => panic!("Should have failed with error"),
         }
     }
+
+    #[test]
+    fn test_create_checkpoint_metadata() -> DeltaResult<()> {
+        let engine = ExprEngine::new();
+        let version = 10;
+        let is_v2_checkpoint = true;
+
+        // Call the method under test
+        let result = create_checkpoint_metadata_batch(version, &engine, is_v2_checkpoint)?;
+
+        assert!(result.is_some());
+        let checkpoint_data = result.unwrap()?;
+        assert!(checkpoint_data.selection_vector == vec![true]);
+
+        // Extract the batch and verify the version field
+        let arrow_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?;
+        assert!(arrow_data.len() == 1);
+
+        // Verify the version field
+        let version_field = arrow_data
+            .record_batch()
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .expect("Failed to downcast to Int64Array");
+        assert_eq!(version_field.value(0), version);
+        Ok(())
+    }
 }
diff --git a/kernel/src/checkpoints/tests.rs b/kernel/src/checkpoints/tests.rs
index 6975e73bc..8d5e87092 100644
--- a/kernel/src/checkpoints/tests.rs
+++ b/kernel/src/checkpoints/tests.rs
@@ -117,6 +117,145 @@ fn test_checkpoint_latest_version_by_default() -> DeltaResult<()> {
     Ok(())
 }
 
+#[test]
+fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> {
+    let (store, _) = new_in_memory_store();
+    let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new()));
+
+    // 1st commit: adds `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![Action::Add(Add {
+            path: "fake_path_1".into(),
+            data_change: true,
+            ..Default::default()
+        })],
+        0,
+    )?;
+
+    // 2nd commit: adds `fake_path_2` & removes `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Add(Add {
+                path: "fake_path_2".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+            Action::Remove(Remove {
+                path: "fake_path_1".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+        ],
+        1,
+    )?;
+
+    // 3rd commit: metadata & protocol actions
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Metadata(Metadata {
+                id: "fake_path_1".into(),
+                schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(),
+                ..Default::default()
+            }),
+            Action::Protocol(Protocol::try_new(3, 7, Vec::<String>::new().into(), Vec::<String>::new().into())?),
+        ],
+        2,
+    )?;
+    let table_root = Url::parse("memory:///")?;
+    let table = Table::new(table_root);
+    let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?;
+    let checkpoint_data = checkpointer.get_checkpoint_info()?;
+    let mut data_iter = checkpoint_data.data;
+    assert_eq!(
+        checkpoint_data.path,
+        Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")?
+    );
+
+    // The first batch should be the metadata and protocol actions.
+    let checkpoint_data = data_iter.next().unwrap()?;
+
+    assert_eq!(checkpoint_data.selection_vector, [true, true]);
+
+    // The second batch should be the add action as the remove action is expired.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true, false]);
+
+    // The third batch should not be included as the selection vector does not
+    // contain any true values, as the add action is removed in a following commit.
+    assert!(data_iter.next().is_none());
+
+    Ok(())
+}
+
+#[test]
+fn test_uuid_v2_checkpoint() -> DeltaResult<()> {
+    let (store, _) = new_in_memory_store();
+    let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new()));
+
+    // 1st commit: adds `fake_path_2` & removes `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Add(Add {
+                path: "fake_path_2".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+            Action::Remove(Remove {
+                path: "fake_path_1".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+        ],
+        1,
+    )?;
+
+    // 2nd commit: metadata & protocol actions
+    // Protocol action includes the v2Checkpoint reader/writer feature.
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Metadata(Metadata {
+                id: "fake_path_1".into(),
+                schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(),
+                ..Default::default()
+            }),
+            Action::Protocol(Protocol::try_new(3, 7, vec!["v2Checkpoint"].into(), vec!["v2Checkpoint"].into())?),
+        ],
+        2,
+    )?;
+    let table_root = Url::parse("memory:///")?;
+    let table = Table::new(table_root);
+    let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?;
+    let checkpoint_data = checkpointer.get_checkpoint_info()?;
+    let mut data_iter = checkpoint_data.data;
+
+    // TODO: Assert that the checkpoint file path is UUID-based
+    let path = checkpoint_data.path;
+    let parts = path.as_str().split(".");
+    assert_eq!(parts.clone().count(), 4);
+
+    // The first batch should be the metadata and protocol actions.
+    let checkpoint_data = data_iter.next().unwrap()?;
+
+    assert_eq!(checkpoint_data.selection_vector, [true, true]);
+
+    // The second batch should be the add action as the remove action is expired.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true, false]);
+
+    // The third batch should be the CheckpointMetadata action.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true]);
+
+    assert!(data_iter.next().is_none());
+
+    Ok(())
+}
+
 /// Test that `checkpoint` works with a specific version parameter
 #[test]
 fn test_checkpoint_specific_version() -> DeltaResult<()> {
diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs
index dd99fda99..91979d41f 100644
--- a/kernel/src/lib.rs
+++ b/kernel/src/lib.rs
@@ -382,7 +382,7 @@ trait ExpressionHandlerExtension: ExpressionHandler {
 }
 
 // Auto-implement the extension trait for all ExpressionHandlers
-impl ExpressionHandlerExtension for T {}
+impl ExpressionHandlerExtension for T {}
 
 /// Provides file system related functionalities to Delta Kernel.
 ///
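
A note on the chaining pattern used in CheckpointBuilder::build in patch 45: because Option<T> implements IntoIterator, Iterator::chain can append either zero batches (V1 checkpoint) or exactly one trailing CheckpointMetadata batch (V2 checkpoint) to the stream of file-action batches. The sketch below only illustrates that pattern under simplified assumptions; Batch and make_metadata_batch are hypothetical stand-ins, not the kernel's CheckpointData or create_checkpoint_metadata_batch.

// Minimal sketch of chaining an optional trailing batch onto an iterator.
// `Batch` and `make_metadata_batch` are illustrative stand-ins only.
#[derive(Debug)]
struct Batch {
    selection_vector: Vec<bool>,
}

fn make_metadata_batch(version: i64, is_v2_checkpoint: bool) -> Option<Batch> {
    // For a V2 checkpoint, emit exactly one single-row batch (selection vector [true]);
    // for a V1 checkpoint, emit nothing. The real code writes `version` into the batch.
    is_v2_checkpoint.then(|| {
        let _ = version;
        Batch { selection_vector: vec![true] }
    })
}

fn main() {
    let file_action_batches = vec![
        Batch { selection_vector: vec![true, true] },
        Batch { selection_vector: vec![true, false] },
    ];

    // `Option<T>: IntoIterator` yields zero or one item, so `chain` conditionally
    // appends the checkpoint metadata batch after all file-action batches.
    let all_batches: Vec<Batch> = file_action_batches
        .into_iter()
        .chain(make_metadata_batch(2, true))
        .collect();

    assert_eq!(all_batches.len(), 3);
    assert_eq!(all_batches.last().unwrap().selection_vector, vec![true]);
    println!("{all_batches:?}");
}

Swapping the final argument of make_metadata_batch to false drops the trailing batch, which mirrors how a classic-named V1 checkpoint in the patch contains only the file-action and protocol/metadata batches.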