From 435302e42e2f2776435b915698af1e6cd0ca7339 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:03:21 -0700 Subject: [PATCH 001/176] introduce visitors --- kernel/src/actions/visitors.rs | 524 +++++++++++++++++++++++++++++++-- kernel/src/scan/log_replay.rs | 10 +- 2 files changed, 510 insertions(+), 24 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 36a2c7faf7..9eef22ed5d 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,10 +1,12 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -483,6 +485,270 @@ impl RowVisitor for SidecarVisitor { } } +/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and +/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once +/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) +/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove +/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater +/// than the minimum file retention timestamp). +struct CheckpointFileActionsVisitor<'seen> { + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + total_actions: usize, + total_add_actions: usize, + minimum_file_retention_timestamp: i64, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + /// + /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// A remove action includes a timestamp indicating when the deletion occurred. Physical files + /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to + /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until + /// it expires, which happens when the current time exceeds the removal timestamp plus the + /// expiration threshold. + fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { + // Ideally this should never be zero, but we are following the same behavior as Delta + // Spark and the Java Kernel. + let mut deletion_timestamp: i64 = 0; + if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { + deletion_timestamp = ts; + } + + Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) + } + + /// Returns true if the row contains a valid file action to be included in the checkpoint. + fn is_valid_file_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Add will have a path at index 0 if it is valid; otherwise we may + // have a remove with a path at index 4. In either case, extract the three dv getters at + // indexes that immediately follow a valid path index. + let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { + (path, &getters[1..4], true) + } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { + (path, &getters[6..9], false) + } else { + return Ok(false); + }; + + let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, + dv_getters[2].get_opt(i, "deletionVector.offset")?, + )), + None => None, + }; + + // Check both adds and removes (skipping already-seen) + let file_key = FileActionKey::new(path, dv_unique_id); + if self.check_and_record_seen(file_key) { + return Ok(false); + } + + // Ignore expired tombstones. + if !is_add && self.is_expired_tombstone(i, getters[5])? { + return Ok(false); + } + + if is_add { + self.total_add_actions += 1; + } + + Ok(true) + } +} + +impl RowVisitor for CheckpointFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. ADD + // 2. REMOVE + static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 9, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_file_action(i, getters)?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + +/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions +/// in newest-first order, we only keep the first occurrence of: +/// - a protocol action, +/// - a metadata action, +/// - a transaction (txn) action for a given app ID. +/// +/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions +/// have been seen and includes only the first occurrence of each in the selection vector. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { + // Non-file actions state + pub(crate) seen_protocol: bool, + pub(crate) seen_metadata: bool, + pub(crate) seen_txns: &'seen mut HashSet, + pub(crate) selection_vector: Vec, + pub(crate) total_actions: usize, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointNonFileActionsVisitor<'_> { + /// Returns true if the row contains a protocol action, and we haven’t seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven’t seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + Ok(self.seen_txns.insert(app_id.to_string())) + } +} + +impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. METADATA + // 2. PROTOCOL + // 3. TXN + static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 3, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_metadata_action(i, getters[0])? + || self.is_valid_protocol_action(i, getters[1])? + || self.is_valid_txn_action(i, getters[2])?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -537,11 +803,13 @@ mod tests { let handler = SyncJsonHandler {}; let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); let output_schema = get_log_schema().clone(); @@ -551,6 +819,18 @@ mod tests { ArrowEngineData::try_from_engine_data(parsed).unwrap() } + fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json( + string_array_to_engine_data(json_strings.into()), + output_schema, + ) + .unwrap() + } + #[test] fn test_parse_protocol() -> DeltaResult<()> { let data = action_batch(); @@ -639,8 +919,6 @@ mod tests { #[test] fn test_parse_add_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -650,10 +928,7 @@ mod tests { r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut add_visitor = AddVisitor::default(); add_visitor.visit_rows_of(batch.as_ref()).unwrap(); let add1 = Add { @@ -697,18 +972,13 @@ mod tests { #[test] fn test_parse_remove_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut remove_visitor = RemoveVisitor::default(); remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); let expected_remove = Remove { @@ -736,8 +1006,6 @@ mod tests { #[test] fn test_parse_txn() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -747,10 +1015,7 @@ mod tests { r#"{"txn":{"appId":"myApp2","version": 4, "lastUpdated": 1670892998177}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut txn_visitor = SetTransactionVisitor::default(); txn_visitor.visit_rows_of(batch.as_ref()).unwrap(); let mut actual = txn_visitor.set_transactions; @@ -771,4 +1036,225 @@ mod tests { }) ); } + + #[test] + fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 8], // 8 rows in the action batch + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, // No tombstones are expired + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![true, true, false, false, false, false, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 4], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 100, // Threshold set to 100 + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> + { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: true, // Log batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // First one should be included, second one skipped as a duplicate + let expected = vec![true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: false, // Checkpoint batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // Both should be included since we don't track duplicates in checkpoint batches + let expected = vec![true, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 3], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, false]; // Third one is a duplicate + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 8], + total_actions: 0, + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![false, false, false, true, true, false, false, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_metadata, true); + assert_eq!(visitor.seen_protocol, true); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { + let json_strings: StringArray = + vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + let batch = parse_json_batch(json_strings); + + // Pre-populate with app1 + let mut seen_txns = HashSet::new(); + seen_txns.insert("app1".to_string()); + + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut seen_txns, + selection_vector: vec![false; 1], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false]; // Transaction should be skipped as it's already seen + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction + assert_eq!(visitor.total_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + // Set protocol and metadata as already seen + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 2], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false]; // Both should be skipped + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_actions, 0); + Ok(()) + } } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f7..b0d3ea8f0f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -19,12 +19,12 @@ use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication /// of adds and removes during log replay. #[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, } impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); Self { path, dv_unique_id } } @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From e500a107abe4c818bd1c451a70bf965124857f05 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:04:35 -0700 Subject: [PATCH 002/176] remove pub --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b0d3ea8f0f..dbcd056dfa 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From 19733cd003eb7a72d962f9cf1d1556e26d2f7f77 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:28:59 -0700 Subject: [PATCH 003/176] assert! instead of assert_eq with bool --- kernel/src/actions/visitors.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9eef22ed5d..3ade3d9143 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -824,10 +824,7 @@ mod tests { let json_handler = engine.get_json_handler(); let output_schema = get_log_schema().clone(); json_handler - .parse_json( - string_array_to_engine_data(json_strings.into()), - output_schema, - ) + .parse_json(string_array_to_engine_data(json_strings), output_schema) .unwrap() } @@ -1197,8 +1194,8 @@ mod tests { let expected = vec![false, false, false, true, true, false, false, true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_metadata, true); - assert_eq!(visitor.seen_protocol, true); + assert!(visitor.seen_metadata); + assert!(visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) From 87c9f31f97a0d7a22e07c337b6c92ee9945c19df Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 13:22:09 -0700 Subject: [PATCH 004/176] log replay for checkpoints --- kernel/src/actions/visitors.rs | 116 +++++--------- kernel/src/checkpoints/log_replay.rs | 229 +++++++++++++++++++++++++++ kernel/src/checkpoints/mod.rs | 1 + kernel/src/lib.rs | 1 + kernel/src/path.rs | 17 ++ kernel/src/utils.rs | 25 ++- 6 files changed, 315 insertions(+), 74 deletions(-) create mode 100644 kernel/src/checkpoints/log_replay.rs create mode 100644 kernel/src/checkpoints/mod.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d9143..e0e622b05d 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -491,13 +491,13 @@ impl RowVisitor for SidecarVisitor { /// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). -struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - total_actions: usize, - total_add_actions: usize, - minimum_file_retention_timestamp: i64, +pub(crate) struct CheckpointFileActionsVisitor<'seen> { + pub(crate) seen_file_keys: &'seen mut HashSet, + pub(crate) selection_vector: &'seen mut Vec, + pub(crate) is_log_batch: bool, + pub(crate) total_actions: usize, + pub(crate) total_add_actions: usize, + pub(crate) minimum_file_retention_timestamp: i64, } #[allow(unused)] // TODO: Remove flag once used for checkpoint writing @@ -653,10 +653,10 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, + pub(crate) seen_protocol: &'seen mut bool, + pub(crate) seen_metadata: &'seen mut bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, + pub(crate) selection_vector: &'seen mut Vec, pub(crate) total_actions: usize, } @@ -668,8 +668,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !*self.seen_protocol { + *self.seen_protocol = true; Ok(true) } else { Ok(false) @@ -682,8 +682,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; + if getter.get_str(i, "metaData.id")?.is_some() && !*self.seen_metadata { + *self.seen_metadata = true; Ok(true) } else { Ok(false) @@ -777,30 +777,13 @@ pub(crate) fn visit_deletion_vector_at<'a>( #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::parse_json_batch; + use crate::EngineData; use super::*; - use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, - }; - - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,20 +795,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - fn parse_json_batch(json_strings: StringArray) -> Box { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); - let output_schema = get_log_schema().clone(); - json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap() + parse_json_batch(json_strings) } #[test] @@ -1039,7 +1009,7 @@ mod tests { let data = action_batch(); let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: &mut vec![false; 8], // 8 rows in the action batch is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1049,7 +1019,7 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); @@ -1070,7 +1040,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], + selection_vector: &mut vec![false; 4], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1080,7 +1050,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); @@ -1099,7 +1069,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: true, // Log batch total_actions: 0, total_add_actions: 0, @@ -1110,7 +1080,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 1); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); @@ -1130,7 +1100,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: false, // Checkpoint batch total_actions: 0, total_add_actions: 0, @@ -1141,7 +1111,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); @@ -1162,7 +1132,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], + selection_vector: &mut vec![false; 3], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1172,7 +1142,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); @@ -1183,19 +1153,19 @@ mod tests { fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { let data = action_batch(); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], + selection_vector: &mut vec![false; 8], total_actions: 0, }; visitor.visit_rows_of(data.as_ref())?; let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); - assert!(visitor.seen_protocol); + assert_eq!(*visitor.selection_vector, expected); + assert!(*visitor.seen_metadata); + assert!(*visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) @@ -1212,17 +1182,17 @@ mod tests { seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: &mut vec![false; 1], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false]; // Transaction should be skipped as it's already seen - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); Ok(()) @@ -1240,17 +1210,17 @@ mod tests { // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen + seen_protocol: &mut true, // Already seen + seen_metadata: &mut true, // Already seen seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false]; // Both should be skipped - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.total_actions, 0); Ok(()) } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs new file mode 100644 index 0000000000..a632fd3363 --- /dev/null +++ b/kernel/src/checkpoints/log_replay.rs @@ -0,0 +1,229 @@ +use std::collections::HashSet; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; +use crate::engine_data::RowVisitor; +use crate::scan::log_replay::FileActionKey; +use crate::{DeltaResult, EngineData}; + +/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// replay to include only those that should be included in a V1 checkpoint. +struct LogReplayForCheckpoints { + /// Tracks file actions that have been seen during log replay to avoid duplicates. + /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. + seen_file_keys: HashSet, + + /// Counter for the total number of actions processed during log replay. + total_actions: Arc, + + /// Counter for the total number of add actions processed during log replay. + total_add_actions: Arc, + + /// Indicates whether a protocol action has been seen in the log. + seen_protocol: bool, + + /// Indicates whether a metadata action has been seen in the log. + seen_metadata: bool, + + /// Set of transaction app IDs that have been processed to avoid duplicates. + seen_txns: HashSet, + + /// Minimum timestamp for file retention, used for filtering expired tombstones. + minimum_file_retention_timestamp: i64, +} + +impl LogReplayForCheckpoints { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } + } + + /// Iterates over actions and filters them for inclusion in a V1 checkpoint. + /// + /// This function processes batches of actions in reverse chronological order + /// (from most recent to least recent) and performs the necessary filtering + /// to ensure the checkpoint contains only the actions needed to reconstruct + /// the complete state of the table. + /// + /// # Filtering Rules + /// + /// The following rules apply when filtering actions: + /// + /// 1. Only the most recent protocol and metadata actions are included + /// 2. For each app ID, only the most recent transaction action is included + /// 3. File actions are deduplicated based on path and unique ID + /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded + pub(super) fn process_v1_checkpoint_batch( + &mut self, + actions: Box, + is_log_batch: bool, + ) -> DeltaResult<(Box, Vec)> { + // Initialize selection vector with all rows un-selected + let mut selection_vector = vec![false; actions.len()]; + assert_eq!( + selection_vector.len(), + actions.len(), + "Initial selection vector length does not match actions length" + ); + + // Create the non file actions visitor to process non file actions and update selection vector + let mut non_file_actions_visitor = CheckpointNonFileActionsVisitor { + seen_protocol: &mut self.seen_protocol, + seen_metadata: &mut self.seen_metadata, + seen_txns: &mut self.seen_txns, + selection_vector: &mut selection_vector, + total_actions: 0, + }; + + // Process actions and let visitor update selection vector + non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with non-file action counts from this batch + self.total_actions + .fetch_add(non_file_actions_visitor.total_actions, Ordering::Relaxed); + + // Create the file actions visitor to process file actions and update selection vector + let mut file_actions_visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut self.seen_file_keys, + is_log_batch, + selection_vector: &mut selection_vector, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: self.minimum_file_retention_timestamp, + }; + + // Process actions and let visitor update selection vector + file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with file action counts from this batch + self.total_actions + .fetch_add(file_actions_visitor.total_actions, Ordering::Relaxed); + self.total_add_actions + .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); + + Ok((actions, selection_vector)) + } +} + +/// Given an iterator of (engine_data, bool) tuples, returns an iterator of +/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ +/// be written to the V1 checkpoint file in order to capture the table version's complete state. +/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch +/// is a log or checkpoint batch. +/// +/// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in +/// the log from most recent to least recent. +pub(crate) fn v1_checkpoint_actions_iter( + action_iter: impl Iterator, bool)>> + Send + 'static, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, +) -> impl Iterator, Vec)>> + Send + 'static { + let mut log_scanner = LogReplayForCheckpoints::new( + total_actions_counter, + total_add_actions_counter, + minimum_file_retention_timestamp, + ); + + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + }) + // Only yield batches that have at least one selected row + .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use crate::arrow::array::StringArray; + use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::utils::test_utils::parse_json_batch; + use crate::DeltaResult; + + /// Tests the end-to-end processing of multiple batches with various action types + /// This tests the integration of the visitors with the main iterator function. + /// More granular testing is performed in the individual visitor tests. + #[test] + fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { + // Setup counters + let total_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + + // Create first batch with protocol, metadata, and some files + let json_strings1: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + // Create second batch with some duplicates and new files + let json_strings2: StringArray = vec![ + // Protocol and metadata should be skipped as duplicates + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + // New files + r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, + // Duplicate file should be skipped + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# + ].into(); + + // Create third batch with all duplicate actions (should be filtered out completely) + let json_strings3: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + let input_batches = vec![ + Ok((parse_json_batch(json_strings1), true)), + Ok((parse_json_batch(json_strings2), true)), + Ok((parse_json_batch(json_strings3), true)), + ]; + + // Run the iterator + let results: Vec<_> = v1_checkpoint_actions_iter( + input_batches.into_iter(), + total_actions_counter.clone(), + total_add_actions_counter.clone(), + 0, + ) + .collect::, _>>()?; + + // Expect two batches in results (third batch should be filtered)" + assert_eq!(results.len(), 2); + + // First batch should have all rows selected + let (_, selection_vector1) = &results[0]; + assert_eq!(selection_vector1, &vec![true, true, true, true]); + + // Second batch should have only new file and transaction selected + let (_, selection_vector2) = &results[1]; + assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + + // Verify counters + // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); + + // 3 add actions (2 from batch1 + 1 from batch2) + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + + Ok(()) + } +} diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs new file mode 100644 index 0000000000..826ff771fb --- /dev/null +++ b/kernel/src/checkpoints/mod.rs @@ -0,0 +1 @@ +pub mod log_replay; diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 65a0a6ab54..bf24769212 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,6 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; +pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; diff --git a/kernel/src/path.rs b/kernel/src/path.rs index df372f08ec..f9988cc8a6 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,6 +196,23 @@ impl ParsedLogPath { } Ok(path) } + + /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version + pub(crate) fn new_v1_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + let filename = format!("{:020}.checkpoint.parquet", version); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? + .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_commit created a non-checkpoint path", + )); + } + Ok(path) + } } #[cfg(test)] diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index fd2db25013..7713e042a5 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -22,11 +22,15 @@ pub(crate) mod test_utils { use tempfile::TempDir; use test_utils::delta_path_for_version; + use crate::actions::get_log_schema; + use crate::arrow::array::StringArray; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::engine::sync::SyncEngine; use crate::{ actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, engine::arrow_data::ArrowEngineData, - EngineData, }; + use crate::{Engine, EngineData}; #[derive(Serialize)] pub(crate) enum Action { @@ -97,4 +101,23 @@ pub(crate) mod test_utils { pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { assert_eq!(into_record_batch(actual), into_record_batch(expected)); } + + /// Converts a `StringArray` to an `EngineData` object + pub(crate) fn string_array_to_engine_data(string_array: StringArray) -> Box { + let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); + let schema = Arc::new(ArrowSchema::new(vec![string_field])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) + .expect("Can't convert to record batch"); + Box::new(ArrowEngineData::new(batch)) + } + + /// Parses a batch of JSON strings into an `EngineData` object + pub(crate) fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + } } From db5ccd05ba8be030fd8941d4b0025fcbe1372d49 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:29:15 -0700 Subject: [PATCH 005/176] rename & some clean up --- kernel/src/checkpoints/log_replay.rs | 17 ++++++++++------- kernel/src/engine/arrow_data.rs | 19 +++---------------- kernel/src/engine/default/json.rs | 9 +-------- kernel/src/scan/mod.rs | 15 +++------------ 4 files changed, 17 insertions(+), 43 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index a632fd3363..4bd6c3448e 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -7,9 +7,10 @@ use crate::engine_data::RowVisitor; use crate::scan::log_replay::FileActionKey; use crate::{DeltaResult, EngineData}; -/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. -struct LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +struct V1CheckpointLogReplayScanner { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, @@ -33,7 +34,8 @@ struct LogReplayForCheckpoints { minimum_file_retention_timestamp: i64, } -impl LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl V1CheckpointLogReplayScanner { pub(super) fn new( total_actions_counter: Arc, total_add_actions_counter: Arc, @@ -65,7 +67,7 @@ impl LogReplayForCheckpoints { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn process_v1_checkpoint_batch( + pub(super) fn filter_v1_checkpoint_actions( &mut self, actions: Box, is_log_batch: bool, @@ -125,13 +127,14 @@ impl LogReplayForCheckpoints { /// /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// the log from most recent to least recent. +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn v1_checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = LogReplayForCheckpoints::new( + let mut log_scanner = V1CheckpointLogReplayScanner::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -140,7 +143,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -156,7 +159,7 @@ mod tests { use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; - /// Tests the end-to-end processing of multiple batches with various action types + /// Tests the end-to-end processing of multiple batches with various action types. /// This tests the integration of the visitors with the main iterator function. /// More granular testing is performed in the individual visitor tests. #[test] diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 9883809013..b09b27ff94 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -294,27 +294,14 @@ impl ArrowEngineData { #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use crate::{ actions::{get_log_schema, Metadata, Protocol}, engine::sync::SyncEngine, - DeltaResult, Engine, EngineData, + DeltaResult, Engine, }; - use super::ArrowEngineData; - - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_md_extract() -> DeltaResult<()> { let engine = SyncEngine::new(); diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 98a9b0dc74..8b401a3d49 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -257,6 +257,7 @@ mod tests { use crate::engine::default::executor::tokio::{ TokioBackgroundExecutor, TokioMultiThreadExecutor, }; + use crate::utils::test_utils::string_array_to_engine_data; use futures::future; use itertools::Itertools; use object_store::local::LocalFileSystem; @@ -471,14 +472,6 @@ mod tests { } } - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_parse_json() { let store = Arc::new(LocalFileSystem::new()); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index ccdff3d663..689a6eab38 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -663,8 +663,8 @@ pub fn selection_vector( // some utils that are used in file_stream.rs and state.rs tests #[cfg(test)] pub(crate) mod test_utils { - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use itertools::Itertools; use std::sync::Arc; @@ -676,20 +676,11 @@ pub(crate) mod test_utils { }, scan::log_replay::scan_action_iter, schema::SchemaRef, - EngineData, JsonHandler, + JsonHandler, }; use super::{state::ScanCallback, Transform}; - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - // Generates a batch of sidecar actions with the given paths. // The schema is provided as null columns affect equality checks. pub(crate) fn sidecar_batch_with_given_paths( From 42c08c1f439a5d20adcba1f56df74b3e65b469ec Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:43:25 -0700 Subject: [PATCH 006/176] remove new path for now --- kernel/src/path.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/src/path.rs b/kernel/src/path.rs index f9988cc8a6..df372f08ec 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,23 +196,6 @@ impl ParsedLogPath { } Ok(path) } - - /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version - pub(crate) fn new_v1_checkpoint( - table_root: &Url, - version: Version, - ) -> DeltaResult> { - let filename = format!("{:020}.checkpoint.parquet", version); - let location = table_root.join("_delta_log/")?.join(&filename)?; - let path = Self::try_from(location)? - .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; - if !path.is_checkpoint() { - return Err(Error::internal_error( - "ParsedLogPath::new_commit created a non-checkpoint path", - )); - } - Ok(path) - } } #[cfg(test)] From f91baebe5af22c4c01a7529fdf9967ffa04c510f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 21 Mar 2025 19:33:35 -0700 Subject: [PATCH 007/176] merge non file action visitor tests --- kernel/src/actions/visitors.rs | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d9143..150beffe68 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -784,10 +784,8 @@ mod tests { use super::*; use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, + actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, + Engine, EngineData, }; // TODO(nick): Merge all copies of this into one "test utils" thing @@ -799,8 +797,7 @@ mod tests { Box::new(ArrowEngineData::new(batch)) } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,11 +809,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() + parse_json_batch(json_strings) } fn parse_json_batch(json_strings: StringArray) -> Box { @@ -1202,26 +1195,30 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { - let json_strings: StringArray = - vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); - // Pre-populate with app1 + // Pre-populate with txn app1 let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: vec![false; 3], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false]; // Transaction should be skipped as it's already seen + // All actions should be skipped as they have already been seen + let expected = vec![false; 3]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); @@ -1229,29 +1226,32 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( - ) -> DeltaResult<()> { + fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), // Empty set + selection_vector: vec![false; 6], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false]; // Both should be skipped + let expected = vec![true, false, true, false, true, false]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_actions, 0); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); Ok(()) } } From 9fdfba70f63371a72bc624f7228e6f92f7760ab6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:22:56 -0700 Subject: [PATCH 008/176] mvp for refactor --- kernel/src/actions/visitors.rs | 80 ++++-------- kernel/src/checkpoints/log_replay.rs | 78 ++++++----- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 154 ++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 188 ++++++++++++--------------- 5 files changed, 311 insertions(+), 190 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index e0e622b05d..c348c92e27 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -3,10 +3,9 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, FileActionVisitor}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -500,40 +499,30 @@ pub(crate) struct CheckpointFileActionsVisitor<'seen> { pub(crate) minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false - } +impl FileActionVisitor for CheckpointFileActionsVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys } + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + Some(4) + } + + fn add_dv_start_index(&self) -> usize { + 1 + } + + fn remove_dv_start_index(&self) -> Option { + Some(6) + } +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { /// A remove action includes a timestamp indicating when the deletion occurred. Physical files /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until @@ -556,29 +545,14 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.check_and_record_seen(file_key, self.is_log_batch) { return Ok(false); } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 4bd6c3448e..98600a821a 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,13 +4,13 @@ use std::sync::Arc; use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; use crate::engine_data::RowVisitor; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; -/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log +/// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -struct V1CheckpointLogReplayScanner { +struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, @@ -34,26 +34,10 @@ struct V1CheckpointLogReplayScanner { minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -impl V1CheckpointLogReplayScanner { - pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, - minimum_file_retention_timestamp: i64, - ) -> Self { - Self { - seen_file_keys: Default::default(), - total_actions: total_actions_counter, - total_add_actions: total_add_actions_counter, - seen_protocol: false, - seen_metadata: false, - seen_txns: Default::default(), - minimum_file_retention_timestamp, - } - } +impl LogReplayProcessor for CheckpointLogReplayProcessor { + // Define the processing result type as a tuple of the data and selection vector + type ProcessingResult = (Box, Vec); - /// Iterates over actions and filters them for inclusion in a V1 checkpoint. - /// /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering /// to ensure the checkpoint contains only the actions needed to reconstruct @@ -67,16 +51,16 @@ impl V1CheckpointLogReplayScanner { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn filter_v1_checkpoint_actions( + fn process_batch( &mut self, - actions: Box, + batch: Box, is_log_batch: bool, - ) -> DeltaResult<(Box, Vec)> { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected - let mut selection_vector = vec![false; actions.len()]; + let mut selection_vector = vec![false; batch.len()]; assert_eq!( selection_vector.len(), - actions.len(), + batch.len(), "Initial selection vector length does not match actions length" ); @@ -90,7 +74,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + non_file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with non-file action counts from this batch self.total_actions @@ -107,7 +91,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - file_actions_visitor.visit_rows_of(actions.as_ref())?; + file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with file action counts from this batch self.total_actions @@ -115,7 +99,31 @@ impl V1CheckpointLogReplayScanner { self.total_add_actions .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); - Ok((actions, selection_vector)) + Ok((batch, selection_vector)) + } + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl CheckpointLogReplayProcessor { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } } } @@ -128,13 +136,13 @@ impl V1CheckpointLogReplayScanner { /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -pub(crate) fn v1_checkpoint_actions_iter( +pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = V1CheckpointLogReplayScanner::new( + let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -143,7 +151,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) + log_scanner.process_batch(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -155,7 +163,7 @@ mod tests { use std::sync::Arc; use crate::arrow::array::StringArray; - use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::checkpoints::log_replay::checkpoint_actions_iter; use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; @@ -201,7 +209,7 @@ mod tests { ]; // Run the iterator - let results: Vec<_> = v1_checkpoint_actions_iter( + let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), total_actions_counter.clone(), total_add_actions_counter.clone(), diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index bf24769212..787d2a4823 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -78,6 +78,7 @@ pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 0000000000..e545f24083 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,154 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, LazyLock, +}; + +use itertools::Itertools; +use tracing::debug; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::actions::get_log_add_schema; +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; + +#[derive(Debug, Hash, Eq, PartialEq)] +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +pub struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} + +impl FileActionKey { + pub fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Trait defining the interface for log replay processors that process and filter +/// Delta Lake log actions based on different strategies. +pub trait LogReplayProcessor { + /// The type of results produced by this processor + type ProcessingResult; + + /// Process a batch of actions and return the filtered result + fn process_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; +} + +/// Base trait for visitors that process file actions during log replay +pub trait FileActionVisitor { + /// Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys().contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + if is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys().insert(key); + } + false + } + } + + /// Index in getters array for add.path + fn add_path_index(&self) -> usize; + + /// Index in getters array for remove.path + fn remove_path_index(&self) -> Option; + + /// Starting index for add action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn add_dv_start_index(&self) -> usize; + + /// Starting index for remove action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn remove_dv_start_index(&self) -> Option; + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + is_add: bool, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = if is_add { + self.add_dv_start_index() + } else if let Some(idx) = self.remove_dv_start_index() { + idx + } else { + return Err(Error::GenericError { + source: "DV getters should exist".into(), + }); + }; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + let Some(remove_idx) = self.remove_path_index() else { + return Ok(None); + }; + + // Try to extract a remove action path + if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, false))); + } + + // No path found, not a file action + Ok(None) + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056dfa..b2c56c026c 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,41 +3,29 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -45,7 +33,7 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, + seen_file_keys: &'seen mut HashSet, selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, @@ -54,37 +42,37 @@ struct AddRemoveDedupVisitor<'seen> { is_log_batch: bool, } -impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true +impl FileActionVisitor for AddRemoveDedupVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys + } + + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + if self.is_log_batch { + Some(5) } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + None // No remove action getters when not a log batch } } + fn add_dv_start_index(&self) -> usize { + 2 + } + + fn remove_dv_start_index(&self) -> Option { + if self.is_log_batch { + Some(6) + } else { + None // No remove action getters when not a log batch + } + } +} + +impl AddRemoveDedupVisitor<'_> { fn parse_partition_value( &self, field_idx: usize, @@ -162,28 +150,12 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. // @@ -203,8 +175,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.check_and_record_seen(file_key, self.is_log_batch) || !is_add { return Ok(false); } let transform = self @@ -310,48 +281,70 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type ProcessingResult = ScanData; - fn process_scan_batch( + fn process_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + seen_file_keys: &mut self.seen_file_keys(), selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), + partition_filter, row_transform_exprs: Vec::new(), is_log_batch, }; - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! let selection_vector = visitor.selection_vector; - let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -365,22 +358,13 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); + let mut log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) + log_scanner.process_batch(batch, is_log_batch) }) .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) } From d420fd1fd2ad5e3d172052b99698b4929178d1e8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:31:31 -0700 Subject: [PATCH 009/176] these github action checks clog my screen --- kernel/src/log_replay.rs | 13 +++---------- kernel/src/scan/log_replay.rs | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e545f24083..cfd4a10c08 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,16 +1,9 @@ -use std::collections::{HashMap, HashSet}; -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, LazyLock, -}; - -use itertools::Itertools; +use std::collections::HashSet; use tracing::debug; use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::actions::get_log_add_schema; -use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; +use crate::engine_data::{GetData, TypedGetData as _}; +use crate::{DeltaResult, EngineData, Error}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b2c56c026c..8dce0ed6fe 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -11,7 +11,7 @@ use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; From 9e0e0483a88f995fd55ac6755caf4bf473325a82 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 14:34:44 -0700 Subject: [PATCH 010/176] base file actions struct --- kernel/src/actions/visitors.rs | 272 ++++++++++++++++++++++++--------- kernel/src/scan/log_replay.rs | 104 +++++-------- 2 files changed, 240 insertions(+), 136 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 150beffe68..9a04411e11 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -492,9 +492,7 @@ impl RowVisitor for SidecarVisitor { /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, + deduplicator: FileActionDeduplicator<'seen>, total_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -502,35 +500,22 @@ struct CheckpointFileActionsVisitor<'seen> { #[allow(unused)] // TODO: Remove flag once used for checkpoint writing impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false + /// Create a new CheckpointFileActionsVisitor + fn new( + seen_file_keys: &mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + minimum_file_retention_timestamp: i64, + ) -> CheckpointFileActionsVisitor<'_> { + CheckpointFileActionsVisitor { + deduplicator: FileActionDeduplicator::new( + seen_file_keys, + selection_vector, + is_log_batch, + ), + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp, } } @@ -556,29 +541,17 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, getters, 0, // add_path_index + 4, // remove_path_index + 1, // add_dv_start_index + 6, // remove_dv_start_index + false, // Never skip remove actions (even if we're processing a log batch) + )? + else { return Ok(false); }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -634,7 +607,7 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { let should_select = self.is_valid_file_action(i, getters)?; if should_select { - self.selection_vector[i] = true; + self.deduplicator.selection_vector[i] = true; self.total_actions += 1; } } @@ -642,6 +615,145 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { } } +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the CheckpointVisitor +/// and the ScanDataVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_dv_start_index: Option, + remove_dv_start_index: Option, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = add_dv_start_index + .or(remove_dv_start_index) + .ok_or_else(|| Error::GenericError { + source: "starting indices for add/remove DVs should have been passed".into(), + })?; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + // In this case, we should skip remove actions. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action path + if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // If we didn't find an add or remove action, return None + return Ok(None); + } + + /// Get the selection vector + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + /// Get reference to the selection vector + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + /// Get mutable reference to the selection vector + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Get whether we are processing a log batch + pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} + /// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions /// in newest-first order, we only keep the first occurrence of: /// - a protocol action, @@ -1030,10 +1142,13 @@ mod tests { #[test] fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { let data = action_batch(); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: vec![false; 8], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, // No tombstones are expired @@ -1042,8 +1157,8 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1061,10 +1176,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 4], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 100, // Threshold set to 100 @@ -1073,8 +1191,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); Ok(()) @@ -1090,10 +1208,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: true, // Log batch + is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1103,8 +1224,8 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1121,10 +1242,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: false, // Checkpoint batch + is_log_batch: false, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1134,8 +1258,8 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) @@ -1152,11 +1276,13 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 3], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1165,8 +1291,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056dfa..59e3e52c15 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,15 +3,15 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; +use crate::actions::visitors::FileActionDeduplicator; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; @@ -45,43 +45,28 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new(seen, selection_vector, is_log_batch), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,28 +147,19 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + 0, // add_path_index + 5, // remove_path_index + 2, // add_dv_start_index + 6, // remove_dv_start_index + !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + )? + else { return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. // @@ -203,8 +179,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +218,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +228,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +242,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +315,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 303444b5df466f697722bc85c4f23dd340d6faff Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 15:34:58 -0700 Subject: [PATCH 011/176] combine visitors --- kernel/src/actions/visitors.rs | 457 ++++++++++++++++++++------------- 1 file changed, 281 insertions(+), 176 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9a04411e11..73eb25d939 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -484,38 +484,61 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } - -/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and -/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once -/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) -/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove -/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater -/// than the minimum file retention timestamp). -struct CheckpointFileActionsVisitor<'seen> { - deduplicator: FileActionDeduplicator<'seen>, - total_actions: usize, +/// A visitor that filters actions for inclusion in a checkpoint file. +/// +/// This visitor processes actions in newest-to-oldest order (as they appear in log +/// replay) and applies deduplication logic for both file and non-file actions. +/// +/// # File Action Filtering +/// - Keeps only the first occurrence of each unique (path, dvId) pair +/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// +/// # Non-File Action Filtering +/// - Keeps only the first protocol action +/// - Keeps only the first metadata action +/// - Keeps only the first transaction action for each unique app ID +/// +/// This filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointVisitor<'seen> { + // File actions deduplication state + file_deduplicator: FileActionDeduplicator<'seen>, + total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, + + // Non-file actions deduplication state + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, + total_non_file_actions: usize, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Create a new CheckpointFileActionsVisitor - fn new( - seen_file_keys: &mut HashSet, +#[allow(unused)] +impl CheckpointVisitor<'_> { + /// Create a new CheckpointVisitor + fn new<'seen>( + seen_file_keys: &'seen mut HashSet, + seen_txns: &'seen mut HashSet, selection_vector: Vec, is_log_batch: bool, minimum_file_retention_timestamp: i64, - ) -> CheckpointFileActionsVisitor<'_> { - CheckpointFileActionsVisitor { - deduplicator: FileActionDeduplicator::new( + ) -> CheckpointVisitor<'seen> { + CheckpointVisitor { + file_deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, ), - total_actions: 0, + total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, + + seen_protocol: false, + seen_metadata: false, + seen_txns, + total_non_file_actions: 0, } } @@ -541,8 +564,8 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, getters, 0, // add_path_index + let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( + i, &getters, 0, // add_path_index 4, // remove_path_index 1, // add_dv_start_index 6, // remove_dv_start_index @@ -551,11 +574,12 @@ impl CheckpointFileActionsVisitor<'_> { else { return Ok(false); }; - if self.deduplicator.check_and_record_seen(file_key) { + + if self.file_deduplicator.check_and_record_seen(file_key) { return Ok(false); } - // Ignore expired tombstones. + // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. if !is_add && self.is_expired_tombstone(i, getters[5])? { return Ok(false); } @@ -564,39 +588,98 @@ impl CheckpointFileActionsVisitor<'_> { self.total_add_actions += 1; } + self.total_file_actions += 1; Ok(true) } + + /// Returns true if the row contains a protocol action, and we haven't seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven't seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven't seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + // Attempting to insert the app_id into the set. If it's already present, the insert will + // return false, indicating that we've already seen this app_id. + if self.seen_txns.insert(app_id.to_string()) { + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } } -impl RowVisitor for CheckpointFileActionsVisitor<'_> { +impl RowVisitor for CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD // 2. REMOVE - static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (DataType::LONG, column_name!("remove.deletionTimestamp")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + // 3. METADATA + // 4. PROTOCOL + // 5. TXN + static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + // File action columns + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + // Non-file action columns + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + NAMES_AND_TYPES.as_ref() } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 9, + getters.len() == 12, Error::InternalError(format!( "Wrong number of visitor getters: {}", getters.len() @@ -604,11 +687,17 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { ); for i in 0..row_count { - let should_select = self.is_valid_file_action(i, getters)?; + // Check for non-file actions (metadata, protocol, txn) + let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? + || self.is_valid_protocol_action(i, getters[10])? + || self.is_valid_txn_action(i, getters[11])?; - if should_select { - self.deduplicator.selection_vector[i] = true; - self.total_actions += 1; + // Check for file actions (add, remove) + let is_file_action = self.is_valid_file_action(i, getters)?; + + // Mark the row for selection if it's either a valid non-file or file action + if is_non_file_action || is_file_action { + self.file_deduplicator.selection_vector_mut()[i] = true; } } Ok(()) @@ -1140,100 +1229,105 @@ mod tests { } #[test] - fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, // No tombstones are expired - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 8], + true, + 0, // minimum_file_retention_timestamp (no expired tombstones) + ); visitor.visit_rows_of(data.as_ref())?; - let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + // Combined results from both file and non-file actions + // Row 0 is an add action + // Row 1 is a remove action + // Row 3 is a protocol action + // Row 4 is a metadata action + // Row 7 is a txn action + let expected = vec![true, true, false, true, true, false, false, true]; + + // Verify file action results + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); + + // Verify non-file action results + assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_non_file_actions, 3); + + assert_eq!(visitor.file_deduplicator.selection_vector, expected); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 100, // Threshold set to 100 - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 4], + true, + 100, // minimum_file_retention_timestamp (threshold set to 100) + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + // Only "one_above_threshold" should be kept + let expected = vec![false, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> - { + fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path - ] + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 2], true, 0); visitor.visit_rows_of(batch.as_ref())?; // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! + // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -1242,31 +1336,29 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: false, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 2], + false, // is_log_batch = false (checkpoint batch) + 0, + ); visitor.visit_rows_of(batch.as_ref())?; // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV @@ -1276,52 +1368,52 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) } #[test] - fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { - let data = action_batch(); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], - total_actions: 0, - }; + fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); + let batch = parse_json_batch(json_strings); - visitor.visit_rows_of(data.as_ref())?; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); - let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, true]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + assert_eq!(visitor.total_non_file_actions, 3); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -1330,54 +1422,67 @@ mod tests { let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 + let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut seen_txns, - selection_vector: vec![false; 3], - total_actions: 0, - }; + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, // Pre-populated transaction + vec![false; 3], + true, + 0, + ); + + // Mark these as already seen + visitor.seen_protocol = true; + visitor.seen_metadata = true; visitor.visit_rows_of(batch.as_ref())?; // All actions should be skipped as they have already been seen - let expected = vec![false; 3]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction - assert_eq!(visitor.total_actions, 0); + let expected = vec![false, false, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_non_file_actions, 0); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), // Empty set - selection_vector: vec![false; 6], - total_actions: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 7], + true, // is_log_batch + 0, // minimum_file_retention_timestamp + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, false, true, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + // First occurrence of each type should be included + let expected = vec![true, false, true, true, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs + assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } } From 5dbc924b65eeb2d3a5b34f03059d1d03a9b80f6d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:11:15 -0700 Subject: [PATCH 012/176] fmt --- kernel/src/actions/visitors.rs | 115 ++++++++++++++++++++++----------- kernel/src/scan/log_replay.rs | 8 +-- 2 files changed, 79 insertions(+), 44 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 73eb25d939..a93aa71ec4 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -503,7 +503,7 @@ impl RowVisitor for SidecarVisitor { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state - file_deduplicator: FileActionDeduplicator<'seen>, + deduplicator: FileActionDeduplicator<'seen>, total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -526,7 +526,7 @@ impl CheckpointVisitor<'_> { minimum_file_retention_timestamp: i64, ) -> CheckpointVisitor<'seen> { CheckpointVisitor { - file_deduplicator: FileActionDeduplicator::new( + deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, @@ -564,18 +564,18 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( - i, &getters, 0, // add_path_index - 4, // remove_path_index - 1, // add_dv_start_index - 6, // remove_dv_start_index - false, // Never skip remove actions (even if we're processing a log batch) + // Extract file action key and determine if it's an add operation + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + // Do not skip remove actions (even if we're processing a log batch) + FileActionExtractConfig::new(0, 4, 1, 6, false), )? else { return Ok(false); }; - if self.file_deduplicator.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -697,13 +697,46 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.file_deduplicator.selection_vector_mut()[i] = true; + self.deduplicator.selection_vector_mut()[i] = true; } } Ok(()) } } +/// This struct contains indices and configuration options needed to +/// extract file actions from action batches in the Delta log. +pub(crate) struct FileActionExtractConfig { + /// Index of the getter containing the add.path column + pub add_path_index: usize, + /// Index of the getter containing the remove.path column + pub remove_path_index: usize, + /// Starting index for add action deletion vector columns + pub add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + pub remove_dv_start_index: usize, + /// Whether to skip remove actions when extracting file actions + pub skip_removes: bool, +} + +impl FileActionExtractConfig { + pub(crate) fn new( + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> Self { + Self { + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + skip_removes, + } + } +} + /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor /// and the ScanDataVisitor. @@ -786,58 +819,64 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extract file action key and determine if it's an add operation + /// Extracts a file action key and determines if it's an add operation. + /// + /// This method examines the data at the given index using the provided getters and config + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `config` - Configuration specifying where to find add/remove operations + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( &self, i: usize, getters: &[&'a dyn GetData<'a>], - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, + config: FileActionExtractConfig, ) -> DeltaResult> { // Try to extract an add action path - if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - // In this case, we should skip remove actions. - if skip_removes { + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. + if config.skip_removes { return Ok(None); } // Try to extract a remove action path - if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } - // If we didn't find an add or remove action, return None - return Ok(None); + // No file action found + Ok(None) } - /// Get the selection vector pub(crate) fn selection_vector(self) -> Vec { self.selection_vector } - /// Get reference to the selection vector pub(crate) fn selection_vector_ref(&self) -> &Vec { &self.selection_vector } - /// Get mutable reference to the selection vector pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { &mut self.selection_vector } - /// Get whether we are processing a log batch + /// Returns whether we are currently processing a log batch. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch } @@ -1261,7 +1300,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); Ok(()) } @@ -1291,7 +1330,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1317,7 +1356,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1350,7 +1389,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1377,7 +1416,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1402,7 +1441,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1442,7 +1481,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1478,7 +1517,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 59e3e52c15..392b1511ce 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::FileActionDeduplicator; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -150,11 +150,7 @@ impl AddRemoveDedupVisitor<'_> { let Some((file_key, is_add)) = self.deduplicator.extract_file_action( i, getters, - 0, // add_path_index - 5, // remove_path_index - 2, // add_dv_start_index - 6, // remove_dv_start_index - !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + FileActionExtractConfig::new(0, 5, 2, 6, !self.deduplicator.is_log_batch()), )? else { return Ok(false); From b7939610ebf92dbbc9825437bac45b99a3b221d1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:39:26 -0700 Subject: [PATCH 013/176] remove old code --- kernel/src/actions/visitors.rs | 109 +-------------------------------- 1 file changed, 1 insertion(+), 108 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index a93aa71ec4..c0feb93eb0 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -739,7 +739,7 @@ impl FileActionExtractConfig { /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor -/// and the ScanDataVisitor. +/// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication @@ -882,113 +882,6 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions -/// in newest-first order, we only keep the first occurrence of: -/// - a protocol action, -/// - a metadata action, -/// - a transaction (txn) action for a given app ID. -/// -/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions -/// have been seen and includes only the first occurrence of each in the selection vector. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { - // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, - pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, - pub(crate) total_actions: usize, -} - -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointNonFileActionsVisitor<'_> { - /// Returns true if the row contains a protocol action, and we haven’t seen one yet. - fn is_valid_protocol_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a metadata action, and we haven’t seen one yet. - fn is_valid_metadata_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. - fn is_valid_txn_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), - }; - - Ok(self.seen_txns.insert(app_id.to_string())) - } -} - -impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: - // 1. METADATA - // 2. PROTOCOL - // 3. TXN - static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("metaData.id")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (STRING, column_name!("txn.appId")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - require!( - getters.len() == 3, - Error::InternalError(format!( - "Wrong number of visitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - let should_select = self.is_valid_metadata_action(i, getters[0])? - || self.is_valid_protocol_action(i, getters[1])? - || self.is_valid_txn_action(i, getters[2])?; - - if should_select { - self.selection_vector[i] = true; - self.total_actions += 1; - } - } - Ok(()) - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( From 508976ff35a8e10da28222c4a33030eba468965e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 18:10:05 -0700 Subject: [PATCH 014/176] move FileActionKey --- kernel/src/actions/visitors.rs | 15 ++++++++++++++- kernel/src/scan/log_replay.rs | 16 +--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index c0feb93eb0..037dfdd427 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -6,7 +6,6 @@ use std::sync::LazyLock; use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -704,6 +703,20 @@ impl RowVisitor for CheckpointVisitor<'_> { } } +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + /// This struct contains indices and configuration options needed to /// extract file actions from action batches in the Delta log. pub(crate) struct FileActionExtractConfig { diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 392b1511ce..d3287eb5d6 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig, FileActionKey}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -16,20 +16,6 @@ use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructFie use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, From 0160ef151185de1f2c10c2e0a866ebafe3e2eabb Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:23:57 -0700 Subject: [PATCH 015/176] fix whitespace --- kernel/src/actions/visitors.rs | 84 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 828fc43878..4d93d6fd3e 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1182,13 +1182,13 @@ mod tests { #[test] fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - // Missing timestamp defaults to 0 - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1217,11 +1217,11 @@ mod tests { #[test] fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1252,11 +1252,11 @@ mod tests { // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1285,13 +1285,13 @@ mod tests { #[test] fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1320,10 +1320,10 @@ mod tests { #[test] fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - ].into(); + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1354,10 +1354,10 @@ mod tests { #[test] fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ].into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 @@ -1389,16 +1389,16 @@ mod tests { #[test] fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn - r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - // Duplicate metadata - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ] - .into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); From aae7046782e1b9c98f26d4dd9d38d05c6be78fb0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:28:27 -0700 Subject: [PATCH 016/176] remove old code --- kernel/src/log_replay.rs | 114 +-------------------------------------- 1 file changed, 1 insertion(+), 113 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index cfd4a10c08..e98dd6f03d 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,6 @@ use std::collections::HashSet; -use tracing::debug; -use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::engine_data::{GetData, TypedGetData as _}; -use crate::{DeltaResult, EngineData, Error}; +use crate::{DeltaResult, EngineData}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -36,112 +33,3 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; } - -/// Base trait for visitors that process file actions during log replay -pub trait FileActionVisitor { - /// Get a reference to the set of seen file keys - fn seen_file_keys(&mut self) -> &mut HashSet; - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys().contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - if is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys().insert(key); - } - false - } - } - - /// Index in getters array for add.path - fn add_path_index(&self) -> usize; - - /// Index in getters array for remove.path - fn remove_path_index(&self) -> Option; - - /// Starting index for add action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn add_dv_start_index(&self) -> usize; - - /// Starting index for remove action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn remove_dv_start_index(&self) -> Option; - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - is_add: bool, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = if is_add { - self.add_dv_start_index() - } else if let Some(idx) = self.remove_dv_start_index() { - idx - } else { - return Err(Error::GenericError { - source: "DV getters should exist".into(), - }); - }; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extract file action key and determine if it's an add operation - fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? { - let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, true))); - } - - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - let Some(remove_idx) = self.remove_path_index() else { - return Ok(None); - }; - - // Try to extract a remove action path - if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? { - let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, false))); - } - - // No path found, not a file action - Ok(None) - } -} From f5743709a48c2ebf4e2c1086cbb85d486daac31c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 11:25:28 -0700 Subject: [PATCH 017/176] refactor more --- kernel/src/checkpoints/log_replay.rs | 47 ++++++++++++++++-------- kernel/src/log_replay.rs | 55 ++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 11 ++---- kernel/src/scan/mod.rs | 9 +++++ 4 files changed, 99 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 23c1f50d74..dc64b766c5 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,9 +4,23 @@ use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{ + apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, +}; use crate::{DeltaResult, EngineData}; +pub struct CheckpointData { + #[allow(unused)] + data: Box, + selection_vector: Vec, +} + +impl HasSelectionVector for CheckpointData { + fn has_selected_rows(&self) -> bool { + self.selection_vector.contains(&true) + } +} + /// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented @@ -36,7 +50,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = (Box, Vec); + type ProcessingResult = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -90,7 +104,10 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { self.seen_protocol = visitor.seen_protocol; self.seen_metadata = visitor.seen_metadata; - Ok((batch, visitor.deduplicator.selection_vector())) + Ok(CheckpointData { + data: batch, + selection_vector: visitor.deduplicator.selection_vector(), + }) } // Get a reference to the set of seen file keys @@ -132,20 +149,14 @@ pub(crate) fn checkpoint_actions_iter( total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, -) -> impl Iterator, Vec)>> + Send + 'static { +) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - // Only yield batches that have at least one selected row - .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] @@ -212,12 +223,18 @@ mod tests { assert_eq!(results.len(), 2); // First batch should have all rows selected - let (_, selection_vector1) = &results[0]; - assert_eq!(selection_vector1, &vec![true, true, true, true]); + let checkpoint_data = &results[0]; + assert_eq!( + checkpoint_data.selection_vector, + vec![true, true, true, true] + ); // Second batch should have only new file and transaction selected - let (_, selection_vector2) = &results[1]; - assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + let checkpoint_data = &results[1]; + assert_eq!( + checkpoint_data.selection_vector, + vec![false, false, true, false, true] + ); // Verify counters // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e98dd6f03d..852c3fe0d0 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -32,4 +32,59 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; + + // Create a selection vector of appropriate length with all elements set to the given value + fn create_selection_vector( + &self, + batch: &Box, + default_value: bool, + ) -> Vec { + let selection_vector = vec![default_value; batch.len()]; + assert_eq!( + selection_vector.len(), + batch.len(), + "Selection vector length does not match actions length" + ); + selection_vector + } + + // Filter an iterator to only include results with at least one selected item + fn filter_non_empty_results(iter: I) -> impl Iterator> + where + I: Iterator>, + T: HasSelectionVector, + { + iter.filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector +pub trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} + +/// Applies the given processor to the given iterator of action results, +/// and filters out batches with no selected rows. +/// +/// This function abstracts the common pattern used by both checkpoint and scan iterators. +pub fn apply_processor_to_iterator

( + mut processor: P, + action_iter: impl Iterator, bool)>>, +) -> impl Iterator> +where + P: LogReplayProcessor, +{ + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 6a0fa929af..42a88af4ff 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -10,7 +10,7 @@ use crate::actions::get_log_add_schema; use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{apply_processor_to_iterator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -350,15 +350,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = + let log_scanner = ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index a8e5da899f..92fd00cead 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,14 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + let (_, sv, _) = self; + sv.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. pub struct Scan { From a618833af203866fc356cde62131cf3a1572c61a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:10:24 -0700 Subject: [PATCH 018/176] refactor --- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 176 ++++++++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 132 +++++++++++-------------- 3 files changed, 231 insertions(+), 78 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 2e46986582..bb21bb0f96 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -77,6 +77,7 @@ pub mod actions; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 0000000000..7650183a04 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,176 @@ +use crate::{ + actions::deletion_vector::DeletionVectorDescriptor, + engine_data::{GetData, TypedGetData}, + DeltaResult, +}; +use std::collections::HashSet; +use tracing::debug; + +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the incoming CheckpointVisitor +/// and the AddRemoveDedupVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, + /// Index of the getter containing the add.path column + add_path_index: usize, + /// Index of the getter containing the remove.path column + remove_path_index: usize, + /// Starting index for add action deletion vector columns + add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + remove_dv_start_index: usize, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract the deletion vector unique ID if it exists. + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + dv_start_index: usize, + ) -> DeltaResult> { + match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => { + let path_or_inline = + getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?; + let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?; + + Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + path_or_inline, + offset, + ))) + } + None => Ok(None), + } + } + + /// Extracts a file action key and determines if it's an add operation. + /// This method examines the data at the given index using the provided getters + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `skip_removes` - Whether to skip remove actions when extracting file actions + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action by the required path column + if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action by the required path column + if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // No file action found + Ok(None) + } + + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Returns whether we are currently processing a log batch. + pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f7..a2d65f1b01 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,33 +3,19 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, -} -impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, @@ -45,43 +31,43 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_PARTITION_VALUES_INDEX: usize = 1; + const ADD_DV_START_INDEX: usize = 2; + const REMOVE_PATH_INDEX: usize = 5; + const REMOVE_DV_START_INDEX: usize = 6; + + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new( + seen, + selection_vector, + is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, + ), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,26 +148,13 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { + // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do + // not try to extract remove actions in that case. + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + else { return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { - return Ok(false); - }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, }; // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory @@ -192,7 +165,8 @@ impl AddRemoveDedupVisitor<'_> { // encounter if the table's schema was replaced after the most recent checkpoint. let partition_values = match &self.transform { Some(transform) if is_add => { - let partition_values = getters[1].get(i, "add.partitionValues")?; + let partition_values = + getters[Self::ADD_PARTITION_VALUES_INDEX].get(i, "add.partitionValues")?; let partition_values = self.parse_partition_values(transform, &partition_values)?; if self.is_file_partition_pruned(&partition_values) { return Ok(false); @@ -203,8 +177,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +216,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +226,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +240,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +313,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 7da74b268f38672c54651749c27777d7293cdbc3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:14:15 -0700 Subject: [PATCH 019/176] more docs --- kernel/src/log_replay.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 7650183a04..ef9004ae1b 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -25,7 +25,8 @@ impl FileActionKey { /// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication + /// far in the log for deduplication. This is a mutable reference to the set + /// of seen file keys that persists across multiple log batches. seen_file_keys: &'seen mut HashSet, /// Selection vector to track which rows should be included selection_vector: Vec, From 220a216a2968531943a0773a87b4e2fc702d08fe Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:05:59 -0700 Subject: [PATCH 020/176] invert is_log_batch logic --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a2d65f1b01..b6bdc1570c 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -152,7 +152,7 @@ impl AddRemoveDedupVisitor<'_> { // not try to extract remove actions in that case. let Some((file_key, is_add)) = self.deduplicator - .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; From 9d86911fadb6a6aa6267a82cc4aec9c3949ec0da Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:14:46 -0700 Subject: [PATCH 021/176] docs --- kernel/src/log_replay.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index ef9004ae1b..d6b175f282 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -20,9 +20,12 @@ impl FileActionKey { } } -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the incoming CheckpointVisitor -/// and the AddRemoveDedupVisitor. +/// Maintains state and provides functionality for deduplicating file actions during log replay. +/// +/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track +/// which files have been seen across multiple log batches. Since logs are processed +/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path +/// and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From e5b0e32056b8ea12060fd48cb18b2eb63f3e537f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:16:42 -0700 Subject: [PATCH 022/176] docs --- kernel/src/log_replay.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d6b175f282..e400c27d13 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -22,10 +22,9 @@ impl FileActionKey { /// Maintains state and provides functionality for deduplicating file actions during log replay. /// -/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track -/// which files have been seen across multiple log batches. Since logs are processed -/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path -/// and deletion vector ID) is processed only once. +/// This struct is embedded in visitors to track which files have been seen across multiple +/// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each +/// unique file (identified by path and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From a5393dcc896d05071b2a704b82e6f47b93f07bcc Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:48:28 -0700 Subject: [PATCH 023/176] docs and imports --- kernel/src/log_replay.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e400c27d13..521b6e81eb 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,10 @@ -use crate::{ - actions::deletion_vector::DeletionVectorDescriptor, - engine_data::{GetData, TypedGetData}, - DeltaResult, -}; +//! This module provides structures and functionality to faciliate the log replay process. use std::collections::HashSet; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::engine_data::{GetData, TypedGetData}; +use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -24,7 +25,9 @@ impl FileActionKey { /// /// This struct is embedded in visitors to track which files have been seen across multiple /// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each -/// unique file (identified by path and deletion vector ID) is processed only once. +/// unique file (identified by path and deletion vector ID) is processed only once. Performing +/// deduplication at the visitor level avoids having to load all actions into memory at once, +/// significantly reducing memory usage for large Delta tables with extensive history. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From a23c651ad7aa3a8399f836eac5e6113bec2aafde Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 20:04:05 -0700 Subject: [PATCH 024/176] improve mod doc --- kernel/src/log_replay.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 521b6e81eb..e5854ca314 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,4 +1,20 @@ -//! This module provides structures and functionality to faciliate the log replay process. +//! This module provides log replay utilities. +//! +//! Log replay is the process of transforming an iterator of action batches (read from Delta +//! transaction logs) into an iterator of filtered/transformed actions for specific use cases. +//! The logs, which record all table changes as JSON entries, are processed batch by batch, +//! typically from newest to oldest. +//! +//! Log replay can be implemented in various ways: +//! - For table scans: Deduplicate file actions to identify the current set of valid files +//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! +//! This module provides structures for efficient batch processing, focusing on file action +//! deduplication with `FileActionDeduplicator` which tracks unique files across log batches +//! to minimize memory usage for tables with extensive history. +//! +//! Future extensions will support additional log replay processors beyond the current use cases. + use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; From d712d181204d068e9346cc0e3e6ee582c95a80a7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:43:14 -0700 Subject: [PATCH 025/176] improve doc --- kernel/src/log_replay.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e5854ca314..3b2a84692f 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,15 +5,14 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay can be implemented in various ways: -//! - For table scans: Deduplicate file actions to identify the current set of valid files -//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! Log replay is currently implemented for table scans, which filter and apply transofmations +//! to file actions to produce a view of the table state at a specific point in time. +//! Future extensions will support additional log replay processors beyond the current use case. +//! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! //! This module provides structures for efficient batch processing, focusing on file action //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. -//! -//! Future extensions will support additional log replay processors beyond the current use cases. use std::collections::HashSet; From e564ae17ca5f6b17659a6ac05867af7df0681621 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:46:10 -0700 Subject: [PATCH 026/176] docs' --- kernel/src/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 3b2a84692f..d9a906525a 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,8 +5,8 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay is currently implemented for table scans, which filter and apply transofmations -//! to file actions to produce a view of the table state at a specific point in time. +//! Log replay is currently implemented for table scans, which filter and apply transformations +//! to produce file actions which builds the view of the table state at a specific point in time. //! Future extensions will support additional log replay processors beyond the current use case. //! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! From b14ff195c0655cfecc29d6b666af6f72c4bcd29d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 13:42:06 -0700 Subject: [PATCH 027/176] docs --- kernel/src/log_replay.rs | 2 +- kernel/src/scan/log_replay.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d9a906525a..0064a701af 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -178,7 +178,7 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn selection_vector(self) -> Vec { + pub(crate) fn into_selection_vector(self) -> Vec { self.selection_vector } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b6bdc1570c..77a9851257 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -148,6 +148,10 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { + // When processing file actions, we extract path and deletion vector information based on action type: + // - For Add actions: path is at index 0, followed by DV fields at indexes 2-4 + // - For Remove actions (in log batches only): path is at index 5, followed by DV fields at indexes 6-8 + // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. let Some((file_key, is_add)) = @@ -324,7 +328,7 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.deduplicator.selection_vector(); + let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From a52d484be0741e5e9d3e72336a0e65d8b86a3298 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 14:22:33 -0700 Subject: [PATCH 028/176] update --- kernel/src/log_replay.rs | 16 ---------------- kernel/src/scan/log_replay.rs | 14 +++++++++----- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 0064a701af..39aa4ab6e3 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -48,8 +48,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { /// far in the log for deduplication. This is a mutable reference to the set /// of seen file keys that persists across multiple log batches. seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, /// Whether we're processing a log batch (as opposed to a checkpoint) is_log_batch: bool, /// Index of the getter containing the add.path column @@ -65,7 +63,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { impl<'seen> FileActionDeduplicator<'seen> { pub(crate) fn new( seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, is_log_batch: bool, add_path_index: usize, remove_path_index: usize, @@ -74,7 +71,6 @@ impl<'seen> FileActionDeduplicator<'seen> { ) -> Self { Self { seen_file_keys, - selection_vector, is_log_batch, add_path_index, remove_path_index, @@ -178,18 +174,6 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn into_selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - /// Returns whether we are currently processing a log batch. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 77a9851257..3c6c2e8455 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -36,6 +36,7 @@ struct AddRemoveDedupVisitor<'seen> { transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -57,7 +58,6 @@ impl AddRemoveDedupVisitor<'_> { AddRemoveDedupVisitor { deduplicator: FileActionDeduplicator::new( seen, - selection_vector, is_log_batch, Self::ADD_PATH_INDEX, Self::REMOVE_PATH_INDEX, @@ -68,6 +68,7 @@ impl AddRemoveDedupVisitor<'_> { transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -244,8 +245,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -328,9 +329,12 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; - Ok((result, selection_vector, visitor.row_transform_exprs)) + Ok(( + result, + visitor.selection_vector, + visitor.row_transform_exprs, + )) } } From a243a989af2d99bfa3ab07ddc78ec236ec0fbb54 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 16:59:37 -0700 Subject: [PATCH 029/176] nits --- kernel/src/log_replay.rs | 31 ++++++++++++++++++++++++------- kernel/src/scan/log_replay.rs | 23 ++++++++++++----------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e3..12528d2962 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; +use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; - use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,14 +28,31 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, + pub(crate) action_type: FileActionKeyType, } + impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new( + path: impl Into, + dv_unique_id: Option, + action_type: FileActionKeyType, + ) -> Self { let path = path.into(); - Self { path, dv_unique_id } + Self { + path, + dv_unique_id, + action_type, + } } } +// File actions are either add or remove actions. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) enum FileActionKeyType { + Add, + Remove, +} + /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -144,7 +161,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(Some((key))` - When a file action is found, returns the key /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -152,11 +169,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -167,7 +184,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e8455..2042ab090f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,12 +155,15 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some(file_key) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? else { return Ok(false); }; + let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. @@ -231,11 +234,9 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; + require!( getters.len() == expected_getters, Error::InternalError(format!( From 9f06382993af7c30b164cf3b452880141adc4dc1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:05:14 -0700 Subject: [PATCH 030/176] Revert "nits" This reverts commit a243a989af2d99bfa3ab07ddc78ec236ec0fbb54. --- kernel/src/log_replay.rs | 31 +++++++------------------------ kernel/src/scan/log_replay.rs | 23 +++++++++++------------ 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 12528d2962..39aa4ab6e3 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,31 +28,14 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, - pub(crate) action_type: FileActionKeyType, } - impl FileActionKey { - pub(crate) fn new( - path: impl Into, - dv_unique_id: Option, - action_type: FileActionKeyType, - ) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); - Self { - path, - dv_unique_id, - action_type, - } + Self { path, dv_unique_id } } } -// File actions are either add or remove actions. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) enum FileActionKeyType { - Add, - Remove, -} - /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -161,7 +144,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key))` - When a file action is found, returns the key + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -169,11 +152,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -184,7 +167,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 2042ab090f..3c6c2e8455 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, - selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), - selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -155,15 +155,12 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some(file_key) = self.deduplicator.extract_file_action( - i, - getters, - !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch - )? + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; - let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. @@ -234,9 +231,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let is_log_batch = self.deduplicator.is_log_batch(); - let expected_getters = if is_log_batch { 9 } else { 5 }; - + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 58f38c0345179ad11300fad2197953ac4adc61e0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:07:08 -0700 Subject: [PATCH 031/176] nits --- kernel/src/scan/log_replay.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e8455..37e5044059 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,9 +155,11 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? else { return Ok(false); }; @@ -231,11 +233,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 628546c45bd10fc4b16501bc47bd5693f5c1b9f8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:39:05 -0700 Subject: [PATCH 032/176] refactor --- kernel/src/log_replay.rs | 54 ++++++++++++++++++- kernel/src/scan/log_replay.rs | 97 +++++++++++++++++++---------------- kernel/src/scan/mod.rs | 8 +++ 3 files changed, 115 insertions(+), 44 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e3..3b7e875242 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,7 +18,7 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::DeltaResult; +use crate::{DeltaResult, EngineData}; use tracing::debug; @@ -179,3 +179,55 @@ impl<'seen> FileActionDeduplicator<'seen> { self.is_log_batch } } + +/// Trait defining log replay processors which implement custom filtering and transformation +/// logic for processing action batches from transaction logs. They receive batches in reverse +/// chronological order (newest to oldest) and typically: +/// +/// 1. Create or maintain a selection vector to track which actions to include +/// 2. Track already-seen file actions to deduplicate across batches +/// 3. Apply specialized filtering based on processor type (scan, checkpoint, etc.) +/// +pub(crate) trait LogReplayProcessor { + /// The type of results produced by this processor + type Output; + + /// Process a batch of actions and return the filtered result + fn process_actions_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Applies a processor to an action iterator and filters out empty results. + /// + /// This is an associated function rather than an instance method because the + /// returned iterator needs to own the processor. + fn apply_to_iterator( + processor: impl LogReplayProcessor, + action_iter: impl Iterator, bool)>>, + ) -> impl Iterator> + where + Self::Output: HasSelectionVector, + { + let mut processor = processor; + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_actions_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector used in log replay filtering. +pub(crate) trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 37e5044059..223c668ec5 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,21 +9,23 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -291,41 +293,37 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type Output = ScanData; - fn process_scan_batch( + fn process_actions_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( - &mut self.seen, + self.seen_file_keys(), selection_vector, logical_schema, transform, - self.partition_filter.clone(), + partition_filter, is_log_batch, ); - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! let result = add_transform.evaluate(actions)?; @@ -335,6 +333,33 @@ impl LogReplayScanner { visitor.row_transform_exprs, )) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -348,24 +373,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + let log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + + ScanLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0372bfd252..0b419f9a31 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,13 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + self.1.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. pub struct Scan { From 88cf9831c9ca486df0363213f00fee45cf47727e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:49:51 -0700 Subject: [PATCH 033/176] move --- kernel/src/scan/log_replay.rs | 46 +++++++++++++++++------------------ kernel/src/scan/mod.rs | 1 - 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 223c668ec5..76459e8927 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -28,6 +28,29 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } +} + /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -339,29 +362,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { } } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be processed to complete the scan. Non-selected rows _must_ be ignored. The boolean flag diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0b419f9a31..7fd1f9ea92 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -325,7 +325,6 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); -// Implementation for the scan result type impl HasSelectionVector for ScanData { fn has_selected_rows(&self) -> bool { self.1.contains(&true) From 10bb7b56a65ee7f705ce1dbaa74826fcda0f092a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 23:05:03 -0700 Subject: [PATCH 034/176] fix rebase --- kernel/src/scan/log_replay.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 76459e8927..896b18b8b7 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -335,6 +335,7 @@ impl LogReplayProcessor for ScanLogReplayProcessor { let logical_schema = self.logical_schema.clone(); let transform = self.transform.clone(); let partition_filter = self.partition_filter.clone(); + // TODO: Teach expression eval to respect the selection vector we just computed so carefully! let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( @@ -347,9 +348,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = add_transform.evaluate(actions)?; Ok(( result, visitor.selection_vector, From 4b5a3a96a819a9305ceb2d1e17539f440b61f540 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:03:21 -0700 Subject: [PATCH 035/176] introduce visitors --- kernel/src/actions/visitors.rs | 524 +++++++++++++++++++++++++++++++-- 1 file changed, 505 insertions(+), 19 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 36a2c7faf7..9eef22ed5d 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,10 +1,12 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -483,6 +485,270 @@ impl RowVisitor for SidecarVisitor { } } +/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and +/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once +/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) +/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove +/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater +/// than the minimum file retention timestamp). +struct CheckpointFileActionsVisitor<'seen> { + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + total_actions: usize, + total_add_actions: usize, + minimum_file_retention_timestamp: i64, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + /// + /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// A remove action includes a timestamp indicating when the deletion occurred. Physical files + /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to + /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until + /// it expires, which happens when the current time exceeds the removal timestamp plus the + /// expiration threshold. + fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { + // Ideally this should never be zero, but we are following the same behavior as Delta + // Spark and the Java Kernel. + let mut deletion_timestamp: i64 = 0; + if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { + deletion_timestamp = ts; + } + + Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) + } + + /// Returns true if the row contains a valid file action to be included in the checkpoint. + fn is_valid_file_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Add will have a path at index 0 if it is valid; otherwise we may + // have a remove with a path at index 4. In either case, extract the three dv getters at + // indexes that immediately follow a valid path index. + let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { + (path, &getters[1..4], true) + } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { + (path, &getters[6..9], false) + } else { + return Ok(false); + }; + + let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, + dv_getters[2].get_opt(i, "deletionVector.offset")?, + )), + None => None, + }; + + // Check both adds and removes (skipping already-seen) + let file_key = FileActionKey::new(path, dv_unique_id); + if self.check_and_record_seen(file_key) { + return Ok(false); + } + + // Ignore expired tombstones. + if !is_add && self.is_expired_tombstone(i, getters[5])? { + return Ok(false); + } + + if is_add { + self.total_add_actions += 1; + } + + Ok(true) + } +} + +impl RowVisitor for CheckpointFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. ADD + // 2. REMOVE + static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 9, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_file_action(i, getters)?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + +/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions +/// in newest-first order, we only keep the first occurrence of: +/// - a protocol action, +/// - a metadata action, +/// - a transaction (txn) action for a given app ID. +/// +/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions +/// have been seen and includes only the first occurrence of each in the selection vector. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { + // Non-file actions state + pub(crate) seen_protocol: bool, + pub(crate) seen_metadata: bool, + pub(crate) seen_txns: &'seen mut HashSet, + pub(crate) selection_vector: Vec, + pub(crate) total_actions: usize, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointNonFileActionsVisitor<'_> { + /// Returns true if the row contains a protocol action, and we haven’t seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven’t seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + Ok(self.seen_txns.insert(app_id.to_string())) + } +} + +impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. METADATA + // 2. PROTOCOL + // 3. TXN + static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 3, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_metadata_action(i, getters[0])? + || self.is_valid_protocol_action(i, getters[1])? + || self.is_valid_txn_action(i, getters[2])?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -537,11 +803,13 @@ mod tests { let handler = SyncJsonHandler {}; let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); let output_schema = get_log_schema().clone(); @@ -551,6 +819,18 @@ mod tests { ArrowEngineData::try_from_engine_data(parsed).unwrap() } + fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json( + string_array_to_engine_data(json_strings.into()), + output_schema, + ) + .unwrap() + } + #[test] fn test_parse_protocol() -> DeltaResult<()> { let data = action_batch(); @@ -639,8 +919,6 @@ mod tests { #[test] fn test_parse_add_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -650,10 +928,7 @@ mod tests { r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut add_visitor = AddVisitor::default(); add_visitor.visit_rows_of(batch.as_ref()).unwrap(); let add1 = Add { @@ -697,18 +972,13 @@ mod tests { #[test] fn test_parse_remove_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut remove_visitor = RemoveVisitor::default(); remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); let expected_remove = Remove { @@ -736,8 +1006,6 @@ mod tests { #[test] fn test_parse_txn() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -747,10 +1015,7 @@ mod tests { r#"{"txn":{"appId":"myApp2","version": 4, "lastUpdated": 1670892998177}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut txn_visitor = SetTransactionVisitor::default(); txn_visitor.visit_rows_of(batch.as_ref()).unwrap(); let mut actual = txn_visitor.set_transactions; @@ -771,4 +1036,225 @@ mod tests { }) ); } + + #[test] + fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 8], // 8 rows in the action batch + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, // No tombstones are expired + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![true, true, false, false, false, false, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 4], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 100, // Threshold set to 100 + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> + { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: true, // Log batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // First one should be included, second one skipped as a duplicate + let expected = vec![true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: false, // Checkpoint batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // Both should be included since we don't track duplicates in checkpoint batches + let expected = vec![true, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 3], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, false]; // Third one is a duplicate + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 8], + total_actions: 0, + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![false, false, false, true, true, false, false, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_metadata, true); + assert_eq!(visitor.seen_protocol, true); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { + let json_strings: StringArray = + vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + let batch = parse_json_batch(json_strings); + + // Pre-populate with app1 + let mut seen_txns = HashSet::new(); + seen_txns.insert("app1".to_string()); + + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut seen_txns, + selection_vector: vec![false; 1], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false]; // Transaction should be skipped as it's already seen + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction + assert_eq!(visitor.total_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + // Set protocol and metadata as already seen + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 2], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false]; // Both should be skipped + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_actions, 0); + Ok(()) + } } From 1cb9364700ff922aa841c92f46b5ac9b88008fef Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:28:59 -0700 Subject: [PATCH 036/176] assert! instead of assert_eq with bool --- kernel/src/actions/visitors.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9eef22ed5d..3ade3d9143 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -824,10 +824,7 @@ mod tests { let json_handler = engine.get_json_handler(); let output_schema = get_log_schema().clone(); json_handler - .parse_json( - string_array_to_engine_data(json_strings.into()), - output_schema, - ) + .parse_json(string_array_to_engine_data(json_strings), output_schema) .unwrap() } @@ -1197,8 +1194,8 @@ mod tests { let expected = vec![false, false, false, true, true, false, false, true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_metadata, true); - assert_eq!(visitor.seen_protocol, true); + assert!(visitor.seen_metadata); + assert!(visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) From 797a05ce01b301e349bf87de142901c89ff4a15e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 21 Mar 2025 19:33:35 -0700 Subject: [PATCH 037/176] merge non file action visitor tests --- kernel/src/actions/visitors.rs | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d9143..150beffe68 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -784,10 +784,8 @@ mod tests { use super::*; use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, + actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, + Engine, EngineData, }; // TODO(nick): Merge all copies of this into one "test utils" thing @@ -799,8 +797,7 @@ mod tests { Box::new(ArrowEngineData::new(batch)) } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,11 +809,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() + parse_json_batch(json_strings) } fn parse_json_batch(json_strings: StringArray) -> Box { @@ -1202,26 +1195,30 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { - let json_strings: StringArray = - vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); - // Pre-populate with app1 + // Pre-populate with txn app1 let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: vec![false; 3], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false]; // Transaction should be skipped as it's already seen + // All actions should be skipped as they have already been seen + let expected = vec![false; 3]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); @@ -1229,29 +1226,32 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( - ) -> DeltaResult<()> { + fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), // Empty set + selection_vector: vec![false; 6], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false]; // Both should be skipped + let expected = vec![true, false, true, false, true, false]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_actions, 0); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); Ok(()) } } From 45c698dbdda99558e130555d0da6fd5f134342ae Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 14:34:44 -0700 Subject: [PATCH 038/176] base file actions struct --- kernel/src/actions/visitors.rs | 272 ++++++++++++++++++++++++--------- kernel/src/scan/log_replay.rs | 1 + 2 files changed, 200 insertions(+), 73 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 150beffe68..9a04411e11 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -492,9 +492,7 @@ impl RowVisitor for SidecarVisitor { /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, + deduplicator: FileActionDeduplicator<'seen>, total_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -502,35 +500,22 @@ struct CheckpointFileActionsVisitor<'seen> { #[allow(unused)] // TODO: Remove flag once used for checkpoint writing impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false + /// Create a new CheckpointFileActionsVisitor + fn new( + seen_file_keys: &mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + minimum_file_retention_timestamp: i64, + ) -> CheckpointFileActionsVisitor<'_> { + CheckpointFileActionsVisitor { + deduplicator: FileActionDeduplicator::new( + seen_file_keys, + selection_vector, + is_log_batch, + ), + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp, } } @@ -556,29 +541,17 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, getters, 0, // add_path_index + 4, // remove_path_index + 1, // add_dv_start_index + 6, // remove_dv_start_index + false, // Never skip remove actions (even if we're processing a log batch) + )? + else { return Ok(false); }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -634,7 +607,7 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { let should_select = self.is_valid_file_action(i, getters)?; if should_select { - self.selection_vector[i] = true; + self.deduplicator.selection_vector[i] = true; self.total_actions += 1; } } @@ -642,6 +615,145 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { } } +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the CheckpointVisitor +/// and the ScanDataVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_dv_start_index: Option, + remove_dv_start_index: Option, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = add_dv_start_index + .or(remove_dv_start_index) + .ok_or_else(|| Error::GenericError { + source: "starting indices for add/remove DVs should have been passed".into(), + })?; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + // In this case, we should skip remove actions. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action path + if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // If we didn't find an add or remove action, return None + return Ok(None); + } + + /// Get the selection vector + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + /// Get reference to the selection vector + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + /// Get mutable reference to the selection vector + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Get whether we are processing a log batch + pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} + /// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions /// in newest-first order, we only keep the first occurrence of: /// - a protocol action, @@ -1030,10 +1142,13 @@ mod tests { #[test] fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { let data = action_batch(); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: vec![false; 8], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, // No tombstones are expired @@ -1042,8 +1157,8 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1061,10 +1176,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 4], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 100, // Threshold set to 100 @@ -1073,8 +1191,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); Ok(()) @@ -1090,10 +1208,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: true, // Log batch + is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1103,8 +1224,8 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1121,10 +1242,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: false, // Checkpoint batch + is_log_batch: false, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1134,8 +1258,8 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) @@ -1152,11 +1276,13 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 3], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1165,8 +1291,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 37e5044059..07f1b59b46 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,6 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; +use crate::actions::visitors::FileActionDeduplicator; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey}; From b06212523f70f3f8d947745a19ac23101abd7fc7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 15:34:58 -0700 Subject: [PATCH 039/176] combine visitors --- kernel/src/actions/visitors.rs | 457 ++++++++++++++++++++------------- 1 file changed, 281 insertions(+), 176 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9a04411e11..73eb25d939 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -484,38 +484,61 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } - -/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and -/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once -/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) -/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove -/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater -/// than the minimum file retention timestamp). -struct CheckpointFileActionsVisitor<'seen> { - deduplicator: FileActionDeduplicator<'seen>, - total_actions: usize, +/// A visitor that filters actions for inclusion in a checkpoint file. +/// +/// This visitor processes actions in newest-to-oldest order (as they appear in log +/// replay) and applies deduplication logic for both file and non-file actions. +/// +/// # File Action Filtering +/// - Keeps only the first occurrence of each unique (path, dvId) pair +/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// +/// # Non-File Action Filtering +/// - Keeps only the first protocol action +/// - Keeps only the first metadata action +/// - Keeps only the first transaction action for each unique app ID +/// +/// This filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointVisitor<'seen> { + // File actions deduplication state + file_deduplicator: FileActionDeduplicator<'seen>, + total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, + + // Non-file actions deduplication state + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, + total_non_file_actions: usize, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Create a new CheckpointFileActionsVisitor - fn new( - seen_file_keys: &mut HashSet, +#[allow(unused)] +impl CheckpointVisitor<'_> { + /// Create a new CheckpointVisitor + fn new<'seen>( + seen_file_keys: &'seen mut HashSet, + seen_txns: &'seen mut HashSet, selection_vector: Vec, is_log_batch: bool, minimum_file_retention_timestamp: i64, - ) -> CheckpointFileActionsVisitor<'_> { - CheckpointFileActionsVisitor { - deduplicator: FileActionDeduplicator::new( + ) -> CheckpointVisitor<'seen> { + CheckpointVisitor { + file_deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, ), - total_actions: 0, + total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, + + seen_protocol: false, + seen_metadata: false, + seen_txns, + total_non_file_actions: 0, } } @@ -541,8 +564,8 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, getters, 0, // add_path_index + let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( + i, &getters, 0, // add_path_index 4, // remove_path_index 1, // add_dv_start_index 6, // remove_dv_start_index @@ -551,11 +574,12 @@ impl CheckpointFileActionsVisitor<'_> { else { return Ok(false); }; - if self.deduplicator.check_and_record_seen(file_key) { + + if self.file_deduplicator.check_and_record_seen(file_key) { return Ok(false); } - // Ignore expired tombstones. + // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. if !is_add && self.is_expired_tombstone(i, getters[5])? { return Ok(false); } @@ -564,39 +588,98 @@ impl CheckpointFileActionsVisitor<'_> { self.total_add_actions += 1; } + self.total_file_actions += 1; Ok(true) } + + /// Returns true if the row contains a protocol action, and we haven't seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven't seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven't seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + // Attempting to insert the app_id into the set. If it's already present, the insert will + // return false, indicating that we've already seen this app_id. + if self.seen_txns.insert(app_id.to_string()) { + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } } -impl RowVisitor for CheckpointFileActionsVisitor<'_> { +impl RowVisitor for CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD // 2. REMOVE - static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (DataType::LONG, column_name!("remove.deletionTimestamp")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + // 3. METADATA + // 4. PROTOCOL + // 5. TXN + static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + // File action columns + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + // Non-file action columns + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + NAMES_AND_TYPES.as_ref() } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 9, + getters.len() == 12, Error::InternalError(format!( "Wrong number of visitor getters: {}", getters.len() @@ -604,11 +687,17 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { ); for i in 0..row_count { - let should_select = self.is_valid_file_action(i, getters)?; + // Check for non-file actions (metadata, protocol, txn) + let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? + || self.is_valid_protocol_action(i, getters[10])? + || self.is_valid_txn_action(i, getters[11])?; - if should_select { - self.deduplicator.selection_vector[i] = true; - self.total_actions += 1; + // Check for file actions (add, remove) + let is_file_action = self.is_valid_file_action(i, getters)?; + + // Mark the row for selection if it's either a valid non-file or file action + if is_non_file_action || is_file_action { + self.file_deduplicator.selection_vector_mut()[i] = true; } } Ok(()) @@ -1140,100 +1229,105 @@ mod tests { } #[test] - fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, // No tombstones are expired - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 8], + true, + 0, // minimum_file_retention_timestamp (no expired tombstones) + ); visitor.visit_rows_of(data.as_ref())?; - let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + // Combined results from both file and non-file actions + // Row 0 is an add action + // Row 1 is a remove action + // Row 3 is a protocol action + // Row 4 is a metadata action + // Row 7 is a txn action + let expected = vec![true, true, false, true, true, false, false, true]; + + // Verify file action results + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); + + // Verify non-file action results + assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_non_file_actions, 3); + + assert_eq!(visitor.file_deduplicator.selection_vector, expected); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 100, // Threshold set to 100 - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 4], + true, + 100, // minimum_file_retention_timestamp (threshold set to 100) + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + // Only "one_above_threshold" should be kept + let expected = vec![false, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> - { + fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path - ] + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 2], true, 0); visitor.visit_rows_of(batch.as_ref())?; // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! + // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -1242,31 +1336,29 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: false, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 2], + false, // is_log_batch = false (checkpoint batch) + 0, + ); visitor.visit_rows_of(batch.as_ref())?; // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV @@ -1276,52 +1368,52 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) } #[test] - fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { - let data = action_batch(); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], - total_actions: 0, - }; + fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); + let batch = parse_json_batch(json_strings); - visitor.visit_rows_of(data.as_ref())?; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); - let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, true]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + assert_eq!(visitor.total_non_file_actions, 3); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -1330,54 +1422,67 @@ mod tests { let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 + let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut seen_txns, - selection_vector: vec![false; 3], - total_actions: 0, - }; + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, // Pre-populated transaction + vec![false; 3], + true, + 0, + ); + + // Mark these as already seen + visitor.seen_protocol = true; + visitor.seen_metadata = true; visitor.visit_rows_of(batch.as_ref())?; // All actions should be skipped as they have already been seen - let expected = vec![false; 3]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction - assert_eq!(visitor.total_actions, 0); + let expected = vec![false, false, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_non_file_actions, 0); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), // Empty set - selection_vector: vec![false; 6], - total_actions: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 7], + true, // is_log_batch + 0, // minimum_file_retention_timestamp + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, false, true, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + // First occurrence of each type should be included + let expected = vec![true, false, true, true, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs + assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } } From 90e46cd5d0108216ce3fb17e340da0ca85e0719e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:11:15 -0700 Subject: [PATCH 040/176] fmt --- kernel/src/actions/visitors.rs | 115 ++++++++++++++++++++++----------- kernel/src/scan/log_replay.rs | 2 +- 2 files changed, 78 insertions(+), 39 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 73eb25d939..a93aa71ec4 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -503,7 +503,7 @@ impl RowVisitor for SidecarVisitor { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state - file_deduplicator: FileActionDeduplicator<'seen>, + deduplicator: FileActionDeduplicator<'seen>, total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -526,7 +526,7 @@ impl CheckpointVisitor<'_> { minimum_file_retention_timestamp: i64, ) -> CheckpointVisitor<'seen> { CheckpointVisitor { - file_deduplicator: FileActionDeduplicator::new( + deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, @@ -564,18 +564,18 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( - i, &getters, 0, // add_path_index - 4, // remove_path_index - 1, // add_dv_start_index - 6, // remove_dv_start_index - false, // Never skip remove actions (even if we're processing a log batch) + // Extract file action key and determine if it's an add operation + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + // Do not skip remove actions (even if we're processing a log batch) + FileActionExtractConfig::new(0, 4, 1, 6, false), )? else { return Ok(false); }; - if self.file_deduplicator.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -697,13 +697,46 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.file_deduplicator.selection_vector_mut()[i] = true; + self.deduplicator.selection_vector_mut()[i] = true; } } Ok(()) } } +/// This struct contains indices and configuration options needed to +/// extract file actions from action batches in the Delta log. +pub(crate) struct FileActionExtractConfig { + /// Index of the getter containing the add.path column + pub add_path_index: usize, + /// Index of the getter containing the remove.path column + pub remove_path_index: usize, + /// Starting index for add action deletion vector columns + pub add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + pub remove_dv_start_index: usize, + /// Whether to skip remove actions when extracting file actions + pub skip_removes: bool, +} + +impl FileActionExtractConfig { + pub(crate) fn new( + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> Self { + Self { + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + skip_removes, + } + } +} + /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor /// and the ScanDataVisitor. @@ -786,58 +819,64 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extract file action key and determine if it's an add operation + /// Extracts a file action key and determines if it's an add operation. + /// + /// This method examines the data at the given index using the provided getters and config + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `config` - Configuration specifying where to find add/remove operations + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( &self, i: usize, getters: &[&'a dyn GetData<'a>], - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, + config: FileActionExtractConfig, ) -> DeltaResult> { // Try to extract an add action path - if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - // In this case, we should skip remove actions. - if skip_removes { + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. + if config.skip_removes { return Ok(None); } // Try to extract a remove action path - if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } - // If we didn't find an add or remove action, return None - return Ok(None); + // No file action found + Ok(None) } - /// Get the selection vector pub(crate) fn selection_vector(self) -> Vec { self.selection_vector } - /// Get reference to the selection vector pub(crate) fn selection_vector_ref(&self) -> &Vec { &self.selection_vector } - /// Get mutable reference to the selection vector pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { &mut self.selection_vector } - /// Get whether we are processing a log batch + /// Returns whether we are currently processing a log batch. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch } @@ -1261,7 +1300,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); Ok(()) } @@ -1291,7 +1330,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1317,7 +1356,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1350,7 +1389,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1377,7 +1416,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1402,7 +1441,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1442,7 +1481,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1478,7 +1517,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 07f1b59b46..ffb53698ff 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::FileActionDeduplicator; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey}; From 3c25392999e8e7f7c4104dee80c7e3fc492811b2 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:39:26 -0700 Subject: [PATCH 041/176] remove old code --- kernel/src/actions/visitors.rs | 109 +-------------------------------- 1 file changed, 1 insertion(+), 108 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index a93aa71ec4..c0feb93eb0 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -739,7 +739,7 @@ impl FileActionExtractConfig { /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor -/// and the ScanDataVisitor. +/// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication @@ -882,113 +882,6 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions -/// in newest-first order, we only keep the first occurrence of: -/// - a protocol action, -/// - a metadata action, -/// - a transaction (txn) action for a given app ID. -/// -/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions -/// have been seen and includes only the first occurrence of each in the selection vector. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { - // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, - pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, - pub(crate) total_actions: usize, -} - -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointNonFileActionsVisitor<'_> { - /// Returns true if the row contains a protocol action, and we haven’t seen one yet. - fn is_valid_protocol_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a metadata action, and we haven’t seen one yet. - fn is_valid_metadata_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. - fn is_valid_txn_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), - }; - - Ok(self.seen_txns.insert(app_id.to_string())) - } -} - -impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: - // 1. METADATA - // 2. PROTOCOL - // 3. TXN - static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("metaData.id")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (STRING, column_name!("txn.appId")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - require!( - getters.len() == 3, - Error::InternalError(format!( - "Wrong number of visitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - let should_select = self.is_valid_metadata_action(i, getters[0])? - || self.is_valid_protocol_action(i, getters[1])? - || self.is_valid_txn_action(i, getters[2])?; - - if should_select { - self.selection_vector[i] = true; - self.total_actions += 1; - } - } - Ok(()) - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( From cba8ed650be016c8326bad42bf87a8e119255946 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 18:10:05 -0700 Subject: [PATCH 042/176] move FileActionKey --- kernel/src/actions/visitors.rs | 15 ++++++++++++++- kernel/src/scan/log_replay.rs | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index c0feb93eb0..037dfdd427 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -6,7 +6,6 @@ use std::sync::LazyLock; use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -704,6 +703,20 @@ impl RowVisitor for CheckpointVisitor<'_> { } } +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + /// This struct contains indices and configuration options needed to /// extract file actions from action batches in the Delta log. pub(crate) struct FileActionExtractConfig { diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index ffb53698ff..38b5d0c182 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig, FileActionKey}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey}; From 28f1fb4f620977db39fcb84280740686674b695d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 20:14:57 -0700 Subject: [PATCH 043/176] fix merge --- kernel/src/actions/visitors.rs | 230 ++++----------------------------- kernel/src/scan/log_replay.rs | 1 - 2 files changed, 22 insertions(+), 209 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 037dfdd427..cc4e9b44b3 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -3,9 +3,9 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -516,6 +516,12 @@ pub(crate) struct CheckpointVisitor<'seen> { #[allow(unused)] impl CheckpointVisitor<'_> { + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_DV_START_INDEX: usize = 1; + const REMOVE_PATH_INDEX: usize = 4; + const REMOVE_DV_START_INDEX: usize = 6; + /// Create a new CheckpointVisitor fn new<'seen>( seen_file_keys: &'seen mut HashSet, @@ -529,6 +535,10 @@ impl CheckpointVisitor<'_> { seen_file_keys, selection_vector, is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, ), total_file_actions: 0, total_add_actions: 0, @@ -563,17 +573,13 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Extract file action key and determine if it's an add operation - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, - getters, - // Do not skip remove actions (even if we're processing a log batch) - FileActionExtractConfig::new(0, 4, 1, 6, false), - )? + // Never skip remove actions, as they may be unexpired tombstones. + let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? else { return Ok(false); }; + // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -703,198 +709,6 @@ impl RowVisitor for CheckpointVisitor<'_> { } } -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - -/// This struct contains indices and configuration options needed to -/// extract file actions from action batches in the Delta log. -pub(crate) struct FileActionExtractConfig { - /// Index of the getter containing the add.path column - pub add_path_index: usize, - /// Index of the getter containing the remove.path column - pub remove_path_index: usize, - /// Starting index for add action deletion vector columns - pub add_dv_start_index: usize, - /// Starting index for remove action deletion vector columns - pub remove_dv_start_index: usize, - /// Whether to skip remove actions when extracting file actions - pub skip_removes: bool, -} - -impl FileActionExtractConfig { - pub(crate) fn new( - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, - ) -> Self { - Self { - add_path_index, - remove_path_index, - add_dv_start_index, - remove_dv_start_index, - skip_removes, - } - } -} - -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the CheckpointVisitor -/// and the AddRemoveDedupVisitor. -pub(crate) struct FileActionDeduplicator<'seen> { - /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication - seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, - /// Whether we're processing a log batch (as opposed to a checkpoint) - is_log_batch: bool, -} - -impl<'seen> FileActionDeduplicator<'seen> { - pub(crate) fn new( - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - ) -> Self { - Self { - seen_file_keys, - selection_vector, - is_log_batch, - } - } - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false - } - } - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - add_dv_start_index: Option, - remove_dv_start_index: Option, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = add_dv_start_index - .or(remove_dv_start_index) - .ok_or_else(|| Error::GenericError { - source: "starting indices for add/remove DVs should have been passed".into(), - })?; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extracts a file action key and determines if it's an add operation. - /// - /// This method examines the data at the given index using the provided getters and config - /// to identify whether a file action exists and what type it is. - /// - /// # Arguments - /// - /// * `i` - Index position in the data structure to examine - /// * `getters` - Collection of data getter implementations used to access the data - /// * `config` - Configuration specifying where to find add/remove operations - /// - /// # Returns - /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation - /// * `Ok(None)` - When no file action is found - /// * `Err(...)` - On any error during extraction - pub(crate) fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - config: FileActionExtractConfig, - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); - } - - // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. - if config.skip_removes { - return Ok(None); - } - - // Try to extract a remove action path - if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); - } - - // No file action found - Ok(None) - } - - pub(crate) fn selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - - /// Returns whether we are currently processing a log batch. - pub(crate) fn is_log_batch(&self) -> bool { - self.is_log_batch - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -1206,7 +1020,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); Ok(()) } @@ -1236,7 +1050,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1262,7 +1076,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1295,7 +1109,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1322,7 +1136,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1347,7 +1161,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1387,7 +1201,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1423,7 +1237,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 38b5d0c182..37e5044059 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,6 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig, FileActionKey}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey}; From 48f831a4baa1df214a3479cd074431faaf8c30ec Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 10:57:34 -0700 Subject: [PATCH 044/176] doc --- kernel/src/actions/visitors.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index cc4e9b44b3..86dfded4e7 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -559,6 +559,8 @@ impl CheckpointVisitor<'_> { fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { // Ideally this should never be zero, but we are following the same behavior as Delta // Spark and the Java Kernel. + // Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action + // will be excluded from the checkpoint file as it will be treated as expired. let mut deletion_timestamp: i64 = 0; if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { deletion_timestamp = ts; From 7c3d976dce372aeb71339454f6140ab5dc3e7ed5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 23:22:49 -0700 Subject: [PATCH 045/176] docs --- kernel/src/actions/visitors.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 86dfded4e7..c2ac41b0bd 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -497,6 +497,8 @@ impl RowVisitor for SidecarVisitor { /// - Keeps only the first metadata action /// - Keeps only the first transaction action for each unique app ID /// +/// CommitInfo, CDC, and sidecar actions are NOT part of the V1 spec checkpoint schema. +/// /// This filtered set of actions represents the minimal set needed to reconstruct /// the latest valid state of the table. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] From b2bb0ce840b21f5a6895d6914a29bc296bb61827 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 23:43:58 -0700 Subject: [PATCH 046/176] fix rebase --- kernel/src/actions/visitors.rs | 57 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index c2ac41b0bd..17fa6c93e5 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -503,13 +503,14 @@ impl RowVisitor for SidecarVisitor { /// the latest valid state of the table. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointVisitor<'seen> { - // File actions deduplication state + // File actions state deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, - // Non-file actions deduplication state + // Non-file actions state seen_protocol: bool, seen_metadata: bool, seen_txns: &'seen mut HashSet, @@ -527,21 +528,21 @@ impl CheckpointVisitor<'_> { /// Create a new CheckpointVisitor fn new<'seen>( seen_file_keys: &'seen mut HashSet, - seen_txns: &'seen mut HashSet, - selection_vector: Vec, is_log_batch: bool, + selection_vector: Vec, + seen_txns: &'seen mut HashSet, minimum_file_retention_timestamp: i64, ) -> CheckpointVisitor<'seen> { CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, - selection_vector, is_log_batch, Self::ADD_PATH_INDEX, Self::REMOVE_PATH_INDEX, Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, @@ -706,7 +707,7 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.deduplicator.selection_vector_mut()[i] = true; + self.selection_vector[i] = true; } } Ok(()) @@ -998,9 +999,9 @@ mod tests { let mut seen_txns = HashSet::new(); let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, - &mut seen_txns, - vec![false; 8], true, + vec![false; 8], + &mut seen_txns, 0, // minimum_file_retention_timestamp (no expired tombstones) ); @@ -1024,7 +1025,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); Ok(()) } @@ -1044,9 +1045,9 @@ mod tests { let mut seen_txns = HashSet::new(); let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, - &mut seen_txns, - vec![false; 4], true, + vec![false; 4], + &mut seen_txns, 100, // minimum_file_retention_timestamp (threshold set to 100) ); @@ -1054,7 +1055,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1074,13 +1075,13 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 2], true, 0); + CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 2], &mut seen_txns, 0); visitor.visit_rows_of(batch.as_ref())?; // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1103,9 +1104,9 @@ mod tests { let mut seen_txns = HashSet::new(); let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, - &mut seen_txns, - vec![false; 2], false, // is_log_batch = false (checkpoint batch) + vec![false; 2], + &mut seen_txns, 0, ); @@ -1113,7 +1114,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1135,12 +1136,12 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); + CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 3], &mut seen_txns, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1160,12 +1161,12 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); + CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 3], &mut seen_txns, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1191,9 +1192,9 @@ mod tests { let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, - &mut seen_txns, // Pre-populated transaction - vec![false; 3], true, + vec![false; 3], + &mut seen_txns, // Pre-populated transaction 0, ); @@ -1205,7 +1206,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1231,17 +1232,17 @@ mod tests { let mut seen_txns = HashSet::new(); let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, - &mut seen_txns, - vec![false; 7], true, // is_log_batch - 0, // minimum_file_retention_timestamp + vec![false; 7], + &mut seen_txns, + 0, // minimum_file_retention_timestamp ); visitor.visit_rows_of(batch.as_ref())?; // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector(), expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); From abc7e1fe4573d924372502ad26f1b5dcef4e5007 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 28 Mar 2025 16:14:53 -0700 Subject: [PATCH 047/176] merge fixes --- kernel/src/actions/visitors.rs | 228 ++++----------------------- kernel/src/checkpoints/log_replay.rs | 34 ++-- kernel/src/scan/log_replay.rs | 32 +--- 3 files changed, 45 insertions(+), 249 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 4d93d6fd3e..aa7f146142 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -4,10 +4,8 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; - use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::FileActionKey; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -505,19 +503,25 @@ impl RowVisitor for SidecarVisitor { pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state pub(crate) deduplicator: FileActionDeduplicator<'seen>, - pub(crate) total_file_actions: usize, - pub(crate) total_add_actions: usize, + pub(crate) selection_vector: Vec, + pub(crate) total_file_actions: u64, + pub(crate) total_add_actions: u64, pub(crate) minimum_file_retention_timestamp: i64, // Non-file actions deduplication state pub(crate) seen_protocol: bool, pub(crate) seen_metadata: bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) total_non_file_actions: usize, + pub(crate) total_non_file_actions: u64, } #[allow(unused)] impl CheckpointVisitor<'_> { + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_DV_START_INDEX: usize = 1; + const REMOVE_PATH_INDEX: usize = 4; + const REMOVE_DV_START_INDEX: usize = 6; /// Create a new CheckpointVisitor pub(crate) fn new<'seen>( seen_file_keys: &'seen mut HashSet, @@ -531,9 +535,13 @@ impl CheckpointVisitor<'_> { CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, - selection_vector, is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, ), + selection_vector, total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, @@ -567,17 +575,13 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Extract file action key and determine if it's an add operation - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, - getters, - // Do not skip remove actions (even if we're processing a log batch) - FileActionExtractConfig::new(0, 4, 1, 6, false), - )? + // Never skip remove actions, as they may be unexpired tombstones. + let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? else { return Ok(false); }; + // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -700,191 +704,13 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.deduplicator.selection_vector_mut()[i] = true; + self.selection_vector[i] = true; } } Ok(()) } } -/// This struct contains indices and configuration options needed to -/// extract file actions from action batches in the Delta log. -pub(crate) struct FileActionExtractConfig { - /// Index of the getter containing the add.path column - pub add_path_index: usize, - /// Index of the getter containing the remove.path column - pub remove_path_index: usize, - /// Starting index for add action deletion vector columns - pub add_dv_start_index: usize, - /// Starting index for remove action deletion vector columns - pub remove_dv_start_index: usize, - /// Whether to skip remove actions when extracting file actions - pub skip_removes: bool, -} - -impl FileActionExtractConfig { - pub(crate) fn new( - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, - ) -> Self { - Self { - add_path_index, - remove_path_index, - add_dv_start_index, - remove_dv_start_index, - skip_removes, - } - } -} - -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the CheckpointVisitor -/// and the AddRemoveDedupVisitor. -pub(crate) struct FileActionDeduplicator<'seen> { - /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication - seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, - /// Whether we're processing a log batch (as opposed to a checkpoint) - is_log_batch: bool, -} - -impl<'seen> FileActionDeduplicator<'seen> { - pub(crate) fn new( - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - ) -> Self { - Self { - seen_file_keys, - selection_vector, - is_log_batch, - } - } - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false - } - } - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - add_dv_start_index: Option, - remove_dv_start_index: Option, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = add_dv_start_index - .or(remove_dv_start_index) - .ok_or_else(|| Error::GenericError { - source: "starting indices for add/remove DVs should have been passed".into(), - })?; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extracts a file action key and determines if it's an add operation. - /// - /// This method examines the data at the given index using the provided getters and config - /// to identify whether a file action exists and what type it is. - /// - /// # Arguments - /// - /// * `i` - Index position in the data structure to examine - /// * `getters` - Collection of data getter implementations used to access the data - /// * `config` - Configuration specifying where to find add/remove operations - /// - /// # Returns - /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation - /// * `Ok(None)` - When no file action is found - /// * `Err(...)` - On any error during extraction - pub(crate) fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - config: FileActionExtractConfig, - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); - } - - // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. - if config.skip_removes { - return Ok(None); - } - - // Try to extract a remove action path - if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); - } - - // No file action found - Ok(None) - } - - pub(crate) fn selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - - /// Returns whether we are currently processing a log batch. - pub(crate) fn is_log_batch(&self) -> bool { - self.is_log_batch - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -1175,7 +1001,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); Ok(()) } @@ -1207,7 +1033,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1240,7 +1066,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1275,7 +1101,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1309,7 +1135,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1341,7 +1167,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1379,7 +1205,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1417,7 +1243,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index dc64b766c5..0a31bffc1b 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,12 +1,10 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{ - apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, -}; +use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; pub struct CheckpointData { @@ -30,10 +28,10 @@ struct CheckpointLogReplayProcessor { seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: Arc, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -50,7 +48,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = CheckpointData; + type Output = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -65,11 +63,11 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - fn process_batch( + fn process_actions_batch( &mut self, batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected let selection_vector = vec![false; batch.len()]; assert_eq!( @@ -106,7 +104,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { Ok(CheckpointData { data: batch, - selection_vector: visitor.deduplicator.selection_vector(), + selection_vector: visitor.selection_vector, }) } @@ -119,8 +117,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -146,8 +144,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -156,12 +154,12 @@ pub(crate) fn checkpoint_actions_iter( minimum_file_retention_timestamp, ); - apply_processor_to_iterator(log_scanner, action_iter) + CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -175,8 +173,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicUsize::new(0)); - let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_actions_counter = Arc::new(AtomicU64::new(0)); + let total_add_actions_counter = Arc::new(AtomicU64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 7782b18578..0959d9cf88 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,6 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; @@ -29,29 +28,6 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -270,8 +246,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -359,10 +335,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn seen_file_keys(&mut self) -> &mut HashSet { &mut self.seen_file_keys } - - fn seen_file_keys(&mut self) -> &mut HashSet { - &mut self.seen_file_keys - } } impl ScanLogReplayProcessor { From 964f294c328c8bc77f122fea9586cde33b18a115 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 14:27:21 -0700 Subject: [PATCH 048/176] docs --- kernel/src/log_replay.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e3..9eddae2931 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -48,6 +48,8 @@ pub(crate) struct FileActionDeduplicator<'seen> { /// far in the log for deduplication. This is a mutable reference to the set /// of seen file keys that persists across multiple log batches. seen_file_keys: &'seen mut HashSet, + // TODO: Consider renaming to `is_commit_batch`, `deduplicate_batch`, or `save_batch` + // to better reflect its role in deduplication logic. /// Whether we're processing a log batch (as opposed to a checkpoint) is_log_batch: bool, /// Index of the getter containing the add.path column @@ -109,7 +111,13 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extract the deletion vector unique ID if it exists. + /// Extracts the deletion vector unique ID if it exists. + /// + /// This function retrieves the necessary fields for constructing a deletion vector unique ID + /// by accessing `getters` at `dv_start_index` and the following two indices. Specifically: + /// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`). + /// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`). + /// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`). fn extract_dv_unique_id<'a>( &self, i: usize, @@ -175,6 +183,9 @@ impl<'seen> FileActionDeduplicator<'seen> { } /// Returns whether we are currently processing a log batch. + /// + /// `true` indicates we are processing a batch from a commit file. + /// `false` indicates we are processing a batch from a checkpoint. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch } From c026258bcff15b16c8448a4a62acdf92011b14cf Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 15:37:29 -0700 Subject: [PATCH 049/176] clean up and docs --- kernel/src/actions/visitors.rs | 178 +++++++++++++++++---------------- 1 file changed, 91 insertions(+), 87 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 17fa6c93e5..e4ecf0d5df 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -483,57 +483,71 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } -/// A visitor that filters actions for inclusion in a checkpoint file. +/// A visitor that filters actions for inclusion in a V1 spec checkpoint file. /// /// This visitor processes actions in newest-to-oldest order (as they appear in log -/// replay) and applies deduplication logic for both file and non-file actions. +/// replay) and applies deduplication logic for both file and non-file actions to +/// produce the minimal state representation for the table. /// /// # File Action Filtering /// - Keeps only the first occurrence of each unique (path, dvId) pair /// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// - Add actions represent files present in the table +/// - Unexpired remove actions represent tombstones still needed for consistency /// /// # Non-File Action Filtering -/// - Keeps only the first protocol action -/// - Keeps only the first metadata action +/// - Keeps only the first protocol action (newest version) +/// - Keeps only the first metadata action (most recent table metadata) /// - Keeps only the first transaction action for each unique app ID /// -/// CommitInfo, CDC, and sidecar actions are NOT part of the V1 spec checkpoint schema. +/// # Excluded Actions +/// CommitInfo, CDC, Sidecar, and CheckpointMetadata actions are NOT part of the V1 checkpoint schema +/// and are filtered out. /// -/// This filtered set of actions represents the minimal set needed to reconstruct -/// the latest valid state of the table. +/// The resulting filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table at the checkpointed version. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct CheckpointVisitor<'seen> { +pub(crate) struct V1CheckpointVisitor<'seen> { // File actions state - deduplicator: FileActionDeduplicator<'seen>, - selection_vector: Vec, - total_file_actions: usize, - total_add_actions: usize, - minimum_file_retention_timestamp: i64, + deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions + selection_vector: Vec, // Used to mark rows for selection + total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema + total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema + minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp // Non-file actions state - seen_protocol: bool, - seen_metadata: bool, - seen_txns: &'seen mut HashSet, - total_non_file_actions: usize, + seen_protocol: bool, // Used to keep only the first protocol action + seen_metadata: bool, // Used to keep only the first metadata action + seen_txns: &'seen mut HashSet, // Used to keep only the first txn action for each app ID + total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema } #[allow(unused)] -impl CheckpointVisitor<'_> { - // The index position in the row getters for the following columns - const ADD_PATH_INDEX: usize = 0; - const ADD_DV_START_INDEX: usize = 1; - const REMOVE_PATH_INDEX: usize = 4; - const REMOVE_DV_START_INDEX: usize = 6; - - /// Create a new CheckpointVisitor +impl V1CheckpointVisitor<'_> { + // These index positions correspond to the order of columns defined in + // `selected_column_names_and_types()`, and are used to extract file key information + // for deduplication purposes + const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters + const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns + const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters + const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns + + /// Creates a new V1CheckpointVisitor for filtering checkpoint actions. + /// + /// # Arguments + /// * `seen_file_keys` - Set to track already seen file keys for deduplication + /// * `is_log_batch` - True if processing a batch from a commit file, false if from a checkpoint file + /// * `selection_vector` - Vector to mark rows for selection in the output + /// * `seen_txns` - Set to track already seen transaction app IDs + /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration fn new<'seen>( seen_file_keys: &'seen mut HashSet, is_log_batch: bool, selection_vector: Vec, seen_txns: &'seen mut HashSet, minimum_file_retention_timestamp: i64, - ) -> CheckpointVisitor<'seen> { - CheckpointVisitor { + ) -> V1CheckpointVisitor<'seen> { + V1CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, is_log_batch, @@ -554,11 +568,16 @@ impl CheckpointVisitor<'_> { } } + /// Determines if a remove action tombstone has expired and should be excluded from the checkpoint. + /// /// A remove action includes a timestamp indicating when the deletion occurred. Physical files /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until - /// it expires, which happens when the current time exceeds the removal timestamp plus the - /// expiration threshold. + /// it expires, which happens when the deletion timestamp is less than or equal to the + /// minimum file retention timestamp. + /// + /// Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action + /// will be excluded from the checkpoint file as it will be treated as expired. fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { // Ideally this should never be zero, but we are following the same behavior as Delta // Spark and the Java Kernel. @@ -573,6 +592,8 @@ impl CheckpointVisitor<'_> { } /// Returns true if the row contains a valid file action to be included in the checkpoint. + /// This function handles both add and remove actions, applying deduplication logic and + /// tombstone expiration rules as needed. fn is_valid_file_action<'a>( &mut self, i: usize, @@ -654,7 +675,7 @@ impl CheckpointVisitor<'_> { } } -impl RowVisitor for CheckpointVisitor<'_> { +impl RowVisitor for V1CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD @@ -993,11 +1014,11 @@ mod tests { } #[test] - fn test_checkpoint_visitor() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = CheckpointVisitor::new( + let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, vec![false; 8], @@ -1008,11 +1029,14 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; // Combined results from both file and non-file actions - // Row 0 is an add action - // Row 1 is a remove action - // Row 3 is a protocol action - // Row 4 is a metadata action - // Row 7 is a txn action + // Row 0 is an add action (included) + // Row 1 is a remove action (included) + // Row 2 is a commit info action (excluded) + // Row 3 is a protocol action (included) + // Row 4 is a metadata action (included) + // Row 5 is a cdc action (excluded) + // Row 6 is a sidecar action (excluded) + // Row 7 is a txn action (included) let expected = vec![true, true, false, true, true, false, false, true]; // Verify file action results @@ -1029,8 +1053,14 @@ mod tests { Ok(()) } + /// Tests the boundary conditions for tombstone expiration logic. + /// Specifically checks: + /// - Remove actions with deletionTimestamp == minimumFileRetentionTimestamp (should be excluded) + /// - Remove actions with deletionTimestamp < minimumFileRetentionTimestamp (should be excluded) + /// - Remove actions with deletionTimestamp > minimumFileRetentionTimestamp (should be included) + /// - Remove actions with missing deletionTimestamp (defaults to 0, should be excluded) #[test] - fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, @@ -1043,7 +1073,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = CheckpointVisitor::new( + let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, vec![false; 4], @@ -1063,7 +1093,7 @@ mod tests { } #[test] - fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -1075,11 +1105,11 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 2], &mut seen_txns, 0); + V1CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 2], &mut seen_txns, 0); visitor.visit_rows_of(batch.as_ref())?; - // First one should be included, second one skipped as a duplicate + // First file action should be included. The second one should be excluded due to the conflict. let expected = vec![true, false]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); @@ -1089,46 +1119,46 @@ mod tests { } #[test] - fn test_checkpoint_visitor_duplicate_file_actions_in_checkpoint_batch() -> DeltaResult<()> { - // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! - // However, we should still be able to parse it without errors, and the duplicates should be included. + fn test_v1_checkpoint_visitor_file_actions_in_checkpoint_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, ] .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = CheckpointVisitor::new( + let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, false, // is_log_batch = false (checkpoint batch) - vec![false; 2], + vec![false; 1], &mut seen_txns, 0, ); visitor.visit_rows_of(batch.as_ref())?; - // Both should be included since we don't track duplicates in checkpoint batches - let expected = vec![true, true]; + let expected = vec![true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_file_actions, 2); - assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_file_actions, 1); + assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); + // The action should NOT be added to the seen_file_keys set as it's a checkpoint batch + // and actions in checkpoint batches do not conflict with + assert!(seen_file_keys.is_empty()); Ok(()) } #[test] - fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Duplicate of first entry r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Conflicting remove action with DV + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, ] .into(); let batch = parse_json_batch(json_strings); @@ -1136,11 +1166,12 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 3], &mut seen_txns, 0); + V1CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 4], &mut seen_txns, 0); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, true, false]; // Third one is a duplicate + // Only the first two should be included since they have different (path, DvID) keys + let expected = vec![true, true, false, false]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); @@ -1150,34 +1181,7 @@ mod tests { } #[test] - fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - ].into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = - CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 3], &mut seen_txns, 0); - - visitor.visit_rows_of(batch.as_ref())?; - - let expected = vec![true, true, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_protocol); - assert!(visitor.seen_metadata); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.total_file_actions, 0); - - Ok(()) - } - - #[test] - fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -1190,7 +1194,7 @@ mod tests { let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = CheckpointVisitor::new( + let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, vec![false; 3], @@ -1214,7 +1218,7 @@ mod tests { } #[test] - fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_v1_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn @@ -1230,7 +1234,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = CheckpointVisitor::new( + let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, // is_log_batch vec![false; 7], From 88ba96c612810cfc46ee9ed86e1ed501dff5c6be Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 15:57:24 -0700 Subject: [PATCH 050/176] docs --- kernel/src/log_replay.rs | 21 ++++++++++++++++----- kernel/src/scan/log_replay.rs | 6 +----- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 3b7e875242..55c78a6665 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -190,21 +190,32 @@ impl<'seen> FileActionDeduplicator<'seen> { /// pub(crate) trait LogReplayProcessor { /// The type of results produced by this processor - type Output; + type Output: HasSelectionVector; /// Process a batch of actions and return the filtered result + /// + /// # Arguments + /// * `batch` - Box containing the `EngineData` batch of actions to process + /// * `is_log_batch` - Flag indicating whether this batch comes from a commit file (`true`) + /// or a checkpoint file (`false`) + /// + /// Returns a `DeltaResult` containing the processor's output type with filtered actions fn process_actions_batch( &mut self, batch: Box, is_log_batch: bool, ) -> DeltaResult; - // Get a reference to the set of seen file keys - fn seen_file_keys(&mut self) -> &mut HashSet; - /// Applies a processor to an action iterator and filters out empty results. /// - /// This is an associated function rather than an instance method because the + /// # Arguments + /// * `processor` - The processor implementation to apply + /// * `action_iter` - Iterator of action batches and their source flags + /// + /// Returns an iterator that yields processed results, filtering out batches + /// where no rows were selected + /// + /// Note: This is an associated function rather than an instance method because the /// returned iterator needs to own the processor. fn apply_to_iterator( processor: impl LogReplayProcessor, diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 896b18b8b7..8d301e64b1 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -339,7 +339,7 @@ impl LogReplayProcessor for ScanLogReplayProcessor { let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( - self.seen_file_keys(), + &mut self.seen_file_keys, selection_vector, logical_schema, transform, @@ -354,10 +354,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { visitor.row_transform_exprs, )) } - - fn seen_file_keys(&mut self) -> &mut HashSet { - &mut self.seen_file_keys - } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of From 4c98c84772ec55337bf749ffb3a4fb565dd7bc83 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 16:44:46 -0700 Subject: [PATCH 051/176] docs --- kernel/src/log_replay.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 55c78a6665..8f9c04a318 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -180,16 +180,20 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// Trait defining log replay processors which implement custom filtering and transformation -/// logic for processing action batches from transaction logs. They receive batches in reverse -/// chronological order (newest to oldest) and typically: +/// Trait defining log replay processors. /// -/// 1. Create or maintain a selection vector to track which actions to include -/// 2. Track already-seen file actions to deduplicate across batches -/// 3. Apply specialized filtering based on processor type (scan, checkpoint, etc.) +/// Log replay processors filter and transform action batches from Delta transaction logs +/// into specialized output types. Each processor maintains state as it processes batches +/// in reverse chronological order (newest to oldest). /// +/// Typical responsibilities include: +/// +/// 1. Maintaining selection vectors to identify relevant actions in each batch +/// 2. Tracking file actions that have already been processed to eliminate duplicates +/// 3. Applying domain-specific filtering based on the processor's purpose (scan, checkpoint, etc.) pub(crate) trait LogReplayProcessor { - /// The type of results produced by this processor + /// The type of results produced by this processor must implement the + /// `HasSelectionVector` trait to allow filtering out batches with no selected rows. type Output: HasSelectionVector; /// Process a batch of actions and return the filtered result @@ -237,7 +241,8 @@ pub(crate) trait LogReplayProcessor { } } -/// Trait for types that contain a selection vector used in log replay filtering. +/// This trait is used to determine if a processor's output contains any selected rows. +/// This is used to filter out batches with no selected rows from the log replay results. pub(crate) trait HasSelectionVector { /// Check if the selection vector contains at least one selected row fn has_selected_rows(&self) -> bool; From 655ed1d1bf3c3ab6321875902ad3ad9c01bd6e2c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:03:56 -0700 Subject: [PATCH 052/176] fix merge --- kernel/src/scan/log_replay.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 4917533827..8d301e64b1 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -12,7 +12,6 @@ use crate::expressions::{column_expr, column_name, ColumnName, Expression, Expre use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; -use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; From 6c222a394bd385658b5849673bb004f6bc01c218 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:05:04 -0700 Subject: [PATCH 053/176] crate mod --- kernel/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index d40a515e6f..f0c1b0f343 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -77,7 +77,7 @@ pub mod actions; pub mod engine_data; pub mod error; pub mod expressions; -pub mod log_replay; +pub(crate) mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; From 30bd7d63236981e27bd64da927daab1db1fc563e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:07:18 -0700 Subject: [PATCH 054/176] dev vis --- kernel/src/lib.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index f0c1b0f343..29c64077c5 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -77,7 +77,6 @@ pub mod actions; pub mod engine_data; pub mod error; pub mod expressions; -pub(crate) mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; @@ -98,6 +97,11 @@ pub mod path; #[cfg(not(feature = "developer-visibility"))] pub(crate) mod path; +#[cfg(feature = "developer-visibility")] +pub mod log_replay; +#[cfg(not(feature = "developer-visibility"))] +pub(crate) mod log_replay; + #[cfg(feature = "developer-visibility")] pub mod log_segment; #[cfg(not(feature = "developer-visibility"))] From 5777e5a9564c72a5b86271cbb13772b54d4ec786 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:31:54 -0700 Subject: [PATCH 055/176] improve docs --- kernel/src/log_replay.rs | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d4755ebbd1..dadcf7f4de 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -191,30 +191,31 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// Trait defining log replay processors. +/// A trait for processing batches of actions from Delta transaction logs during log replay. /// -/// Log replay processors filter and transform action batches from Delta transaction logs -/// into specialized output types. Each processor maintains state as it processes batches -/// in reverse chronological order (newest to oldest). +/// Log replay processors scan transaction logs in **reverse chronological order** (newest to oldest), +/// filtering and transforming action batches into specialized output types. These processors: /// -/// Typical responsibilities include: +/// - **Track and deduplicate file actions** to ensure only the latest relevant changes are kept. +/// - **Maintain selection vectors** to indicate which actions in each batch should be included. +/// - **Apply custom filtering logic** based on the processor’s purpose (e.g., checkpointing, scanning). /// -/// 1. Maintaining selection vectors to identify relevant actions in each batch -/// 2. Tracking file actions that have already been processed to eliminate duplicates -/// 3. Applying domain-specific filtering based on the processor's purpose (scan, checkpoint, etc.) +/// The `Output` type must implement [`HasSelectionVector`] to enable filtering of batches +/// with no selected rows. pub(crate) trait LogReplayProcessor { /// The type of results produced by this processor must implement the /// `HasSelectionVector` trait to allow filtering out batches with no selected rows. type Output: HasSelectionVector; - /// Process a batch of actions and return the filtered result + /// Processes a batch of actions and returns the filtered results. /// /// # Arguments - /// * `batch` - Box containing the `EngineData` batch of actions to process - /// * `is_log_batch` - Flag indicating whether this batch comes from a commit file (`true`) - /// or a checkpoint file (`false`) + /// - `batch` - A boxed [`EngineData`] instance representing a batch of actions. + /// - `is_log_batch` - `true` if the batch originates from a commit log, `false` if from a checkpoint. /// - /// Returns a `DeltaResult` containing the processor's output type with filtered actions + /// Returns a [`DeltaResult`] containing the processor’s output, which includes only selected actions. + /// + /// Note: Since log replay is stateful, processing may update internal processor state (e.g., deduplication sets). fn process_actions_batch( &mut self, batch: Box, From bdbc3fb67fa4f35ca20ab7e03577675bdfad5155 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:37:06 -0700 Subject: [PATCH 056/176] docs --- kernel/src/checkpoints/log_replay.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 0a31bffc1b..f563e34f7b 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -61,8 +61,9 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { /// /// 1. Only the most recent protocol and metadata actions are included /// 2. For each app ID, only the most recent transaction action is included - /// 3. File actions are deduplicated based on path and unique ID + /// 3. Add and remove actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded + /// 5. Sidecar, commitInfo, and CDC actions are excluded fn process_actions_batch( &mut self, batch: Box, @@ -107,11 +108,6 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { selection_vector: visitor.selection_vector, }) } - - // Get a reference to the set of seen file keys - fn seen_file_keys(&mut self) -> &mut HashSet { - &mut self.seen_file_keys - } } #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented From 95d01640840fb3ff7e5954c381cedb5084945640 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 11:44:08 -0700 Subject: [PATCH 057/176] accept metadata & protocol param --- kernel/src/actions/visitors.rs | 54 ++++++++++++++++++++++++---------- kernel/src/scan/log_replay.rs | 4 ++- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index cf0b1d6598..55b9df210f 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -544,8 +544,10 @@ impl V1CheckpointVisitor<'_> { seen_file_keys: &'seen mut HashSet, is_log_batch: bool, selection_vector: Vec, - seen_txns: &'seen mut HashSet, minimum_file_retention_timestamp: i64, + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, ) -> V1CheckpointVisitor<'seen> { V1CheckpointVisitor { deduplicator: FileActionDeduplicator::new( @@ -561,8 +563,8 @@ impl V1CheckpointVisitor<'_> { total_add_actions: 0, minimum_file_retention_timestamp, - seen_protocol: false, - seen_metadata: false, + seen_protocol, + seen_metadata, seen_txns, total_non_file_actions: 0, } @@ -1022,8 +1024,10 @@ mod tests { &mut seen_file_keys, true, vec![false; 8], - &mut seen_txns, 0, // minimum_file_retention_timestamp (no expired tombstones) + false, + false, + &mut seen_txns, ); visitor.visit_rows_of(data.as_ref())?; @@ -1077,8 +1081,10 @@ mod tests { &mut seen_file_keys, true, vec![false; 4], - &mut seen_txns, 100, // minimum_file_retention_timestamp (threshold set to 100) + false, + false, + &mut seen_txns, ); visitor.visit_rows_of(batch.as_ref())?; @@ -1104,8 +1110,15 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = - V1CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 2], &mut seen_txns, 0); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 2], + 0, + false, + false, + &mut seen_txns, + ); visitor.visit_rows_of(batch.as_ref())?; @@ -1132,8 +1145,10 @@ mod tests { &mut seen_file_keys, false, // is_log_batch = false (checkpoint batch) vec![false; 1], - &mut seen_txns, 0, + false, + false, + &mut seen_txns, ); visitor.visit_rows_of(batch.as_ref())?; @@ -1165,8 +1180,15 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = - V1CheckpointVisitor::new(&mut seen_file_keys, true, vec![false; 4], &mut seen_txns, 0); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 4], + 0, + false, + false, + &mut seen_txns, + ); visitor.visit_rows_of(batch.as_ref())?; @@ -1198,14 +1220,12 @@ mod tests { &mut seen_file_keys, true, vec![false; 3], - &mut seen_txns, // Pre-populated transaction 0, + true, // The visior has already seen a protocol action + true, // The visitor has already seen a metadata action + &mut seen_txns, // Pre-populated transaction ); - // Mark these as already seen - visitor.seen_protocol = true; - visitor.seen_metadata = true; - visitor.visit_rows_of(batch.as_ref())?; // All actions should be skipped as they have already been seen @@ -1238,8 +1258,10 @@ mod tests { &mut seen_file_keys, true, // is_log_batch vec![false; 7], - &mut seen_txns, 0, // minimum_file_retention_timestamp + false, + false, + &mut seen_txns, ); visitor.visit_rows_of(batch.as_ref())?; diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 37e5044059..56a9eb6850 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -40,7 +40,9 @@ struct AddRemoveDedupVisitor<'seen> { } impl AddRemoveDedupVisitor<'_> { - // The index position in the row getters for the following columns + // These index positions correspond to the order of columns defined in + // `selected_column_names_and_types()`, and are used to extract file key information + // for deduplication purposes const ADD_PATH_INDEX: usize = 0; const ADD_PARTITION_VALUES_INDEX: usize = 1; const ADD_DV_START_INDEX: usize = 2; From 7a59eabb969a7b2bd33d4132dd68287396211f06 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 12:03:00 -0700 Subject: [PATCH 058/176] improve docs --- kernel/src/actions/visitors.rs | 31 +++++---------- kernel/src/checkpoints/log_replay.rs | 59 +++++++++++++++------------- kernel/src/scan/log_replay.rs | 3 ++ 3 files changed, 43 insertions(+), 50 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index befbeb47b7..208af82cbc 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -510,16 +510,16 @@ impl RowVisitor for SidecarVisitor { pub(crate) struct V1CheckpointVisitor<'seen> { // File actions state deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions - selection_vector: Vec, // Used to mark rows for selection - total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema - total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema + pub(crate) selection_vector: Vec, // Used to mark rows for selection + pub(crate) total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema + pub(crate) total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp // Non-file actions state - seen_protocol: bool, // Used to keep only the first protocol action - seen_metadata: bool, // Used to keep only the first metadata action + pub(crate) seen_protocol: bool, // Used to keep only the first protocol action + pub(crate) seen_metadata: bool, // Used to keep only the first metadata action seen_txns: &'seen mut HashSet, // Used to keep only the first txn action for each app ID - total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema + pub(crate) total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema } #[allow(unused)] @@ -540,7 +540,7 @@ impl V1CheckpointVisitor<'_> { /// * `selection_vector` - Vector to mark rows for selection in the output /// * `seen_txns` - Set to track already seen transaction app IDs /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration - fn new<'seen>( + pub(crate) fn new<'seen>( seen_file_keys: &'seen mut HashSet, is_log_batch: bool, selection_vector: Vec, @@ -765,24 +765,11 @@ pub(crate) fn visit_deletion_vector_at<'a>( #[cfg(test)] mod tests { - use crate::arrow::array::StringArray; - use crate::utils::test_utils::parse_json_batch; use crate::EngineData; + use crate::{arrow::array::StringArray, utils::test_utils::string_array_to_engine_data}; use super::*; - use crate::{ - actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, - Engine, EngineData, - }; - - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } + use crate::{actions::get_log_schema, engine::sync::SyncEngine, Engine}; fn action_batch() -> Box { let json_strings: StringArray = vec![ diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index da6f08f8df..748c4bcf16 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,46 +1,49 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::actions::visitors::V1CheckpointVisitor; use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; -use crate::{DeltaResult, EngineData}; +use crate::{DeltaResult, EngineData, RowVisitor}; +/// `CheckpointData` is a wrapper struct that contains the engine data and selection vector +/// for a batch of actions that have been processed during log replay. +/// TODO: Use `FilteredEngineData` when implemented pub struct CheckpointData { #[allow(unused)] data: Box, selection_vector: Vec, } +/// Implement the `HasSelectionVector` trait for `CheckpointData` to allow checking +/// whether the data contains selected rows. impl HasSelectionVector for CheckpointData { fn has_selected_rows(&self) -> bool { self.selection_vector.contains(&true) } } -/// `CheckpointLogReplayProcessor` is responsible for filtering actions during log -/// replay to include only those that should be included in a V1 checkpoint. +/// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] trait +/// that filters log segment actions for inclusion in a V1 checkpoint file. +/// +/// It processes each action batch via the `process_actions_batch` method, using the +/// [`V1CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that +/// contains only the actions required for the checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - /// Counter for the total number of actions processed during log replay. - total_actions: Arc, - + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, - + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, - /// Indicates whether a metadata action has been seen in the log. seen_metadata: bool, - /// Set of transaction app IDs that have been processed to avoid duplicates. seen_txns: HashSet, - /// Minimum timestamp for file retention, used for filtering expired tombstones. minimum_file_retention_timestamp: i64, } @@ -81,10 +84,10 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { &mut self.seen_file_keys, is_log_batch, selection_vector, - &mut self.seen_txns, self.minimum_file_retention_timestamp, self.seen_protocol, self.seen_metadata, + &mut self.seen_txns, ); // Process actions and let visitor update selection vector @@ -112,8 +115,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -131,16 +134,16 @@ impl CheckpointLogReplayProcessor { /// Given an iterator of (engine_data, bool) tuples, returns an iterator of /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be written to the V1 checkpoint file in order to capture the table version's complete state. -/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch -/// is a log or checkpoint batch. +/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch +/// is a log or checkpoint batch. /// -/// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in -/// the log from most recent to least recent. +/// Note: The iterator of (engine_data, bool) tuples 'action_iter' parameter must be sorted by the +/// order of the actions in the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -154,7 +157,7 @@ pub(crate) fn checkpoint_actions_iter( #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -166,10 +169,10 @@ mod tests { /// This tests the integration of the visitors with the main iterator function. /// More granular testing is performed in the individual visitor tests. #[test] - fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { + fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicU64::new(0)); - let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -191,7 +194,8 @@ mod tests { r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# ].into(); - // Create third batch with all duplicate actions (should be filtered out completely) + // Create third batch with all duplicate actions. + // The entire batch should be skippped as there are no selected actions to write from this batch. let json_strings3: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, @@ -212,7 +216,7 @@ mod tests { ) .collect::, _>>()?; - // Expect two batches in results (third batch should be filtered)" + // Expect two batches in results (third batch should be filtered out)" assert_eq!(results.len(), 2); // First batch should have all rows selected @@ -229,7 +233,6 @@ mod tests { vec![false, false, true, false, true] ); - // Verify counters // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a635561e3d..ae3a3b1333 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -362,6 +362,9 @@ impl ScanLogReplayProcessor { /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be processed to complete the scan. Non-selected rows _must_ be ignored. The boolean flag /// indicates whether the record batch is a log or checkpoint batch. +/// +/// Note: The iterator of (engine_data, bool) tuples 'action_iter' parameter must be sorted by the +/// order of the actions in the log from most recent to least recent. pub(crate) fn scan_action_iter( engine: &dyn Engine, action_iter: impl Iterator, bool)>>, From e4bc34e3104d2ad5ef539074ca4ad9f3bc323f1d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 14:46:18 -0700 Subject: [PATCH 059/176] docs --- kernel/src/log_replay.rs | 15 +++++++++++---- kernel/src/scan/log_replay.rs | 24 +++++++++++++++++++++--- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index dadcf7f4de..79ad70b6fc 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -200,8 +200,16 @@ impl<'seen> FileActionDeduplicator<'seen> { /// - **Maintain selection vectors** to indicate which actions in each batch should be included. /// - **Apply custom filtering logic** based on the processor’s purpose (e.g., checkpointing, scanning). /// +/// Implementations: +/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects relevant +/// file actions to reconstruct the table state at a specific point in time. +/// - `V1CheckpointLogReplayProcessor`(WIP): Will be responsible for processing log batches to construct +/// V1 spec checkpoint files, ensuring only necessary metadata and file actions are retained. +/// /// The `Output` type must implement [`HasSelectionVector`] to enable filtering of batches /// with no selected rows. +/// +/// TODO: Refactor the Change Data Feed (CDF) processor to use this trait. pub(crate) trait LogReplayProcessor { /// The type of results produced by this processor must implement the /// `HasSelectionVector` trait to allow filtering out batches with no selected rows. @@ -210,7 +218,7 @@ pub(crate) trait LogReplayProcessor { /// Processes a batch of actions and returns the filtered results. /// /// # Arguments - /// - `batch` - A boxed [`EngineData`] instance representing a batch of actions. + /// - `actions_batch` - A boxed [`EngineData`] instance representing a batch of actions. /// - `is_log_batch` - `true` if the batch originates from a commit log, `false` if from a checkpoint. /// /// Returns a [`DeltaResult`] containing the processor’s output, which includes only selected actions. @@ -218,7 +226,7 @@ pub(crate) trait LogReplayProcessor { /// Note: Since log replay is stateful, processing may update internal processor state (e.g., deduplication sets). fn process_actions_batch( &mut self, - batch: Box, + actions_batch: Box, is_log_batch: bool, ) -> DeltaResult; @@ -234,13 +242,12 @@ pub(crate) trait LogReplayProcessor { /// Note: This is an associated function rather than an instance method because the /// returned iterator needs to own the processor. fn apply_to_iterator( - processor: impl LogReplayProcessor, + mut processor: impl LogReplayProcessor, action_iter: impl Iterator, bool)>>, ) -> impl Iterator> where Self::Output: HasSelectionVector, { - let mut processor = processor; action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 8d301e64b1..845fb09f00 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -16,6 +16,24 @@ use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructFie use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; +/// [`ScanLogReplayProcessor`] processes Delta log replay actions specifically for scanning file data. +/// +/// During a table scan, the processor reads batches of log actions (in reverse chronological order) +/// and performs the following steps: +/// +/// - Data Skipping: Applies a predicate-based filter (via [`DataSkippingFilter`]) to quickly skip +/// rows that are irrelevant for the query. +/// - Partition Pruning: Uses an optional partition filter (extracted from a physical predicate) +/// to exclude actions whose partition values do not meet the required criteria. +/// - Action Deduplication: Leverages the [`FileActionDeduplicator`] to ensure that for each unique file +/// (identified by its path and deletion vector unique ID), only the latest valid Add action is processed. +/// - Transformation: Evaluates and applies any necessary transformations to convert physical log actions +/// into a logical representation, as dictated by the table schema and optional transform logic. +/// +/// As an implementation of [`LogReplayProcessor`], [`ScanLogReplayProcessor`] provides the `process_actions_batch` +/// method, which applies these steps to each batch of log actions and produces a [`ScanData`] result. This result +/// includes the transformed batch, a selection vector indicating which rows should be processed further, and any +/// row-level transformation expressions that need to be applied to the selected rows. struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, @@ -335,8 +353,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { let logical_schema = self.logical_schema.clone(); let transform = self.transform.clone(); let partition_filter = self.partition_filter.clone(); - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( &mut self.seen_file_keys, @@ -346,8 +362,10 @@ impl LogReplayProcessor for ScanLogReplayProcessor { partition_filter, is_log_batch, ); - visitor.visit_rows_of(batch.as_ref())?; + + // TODO: Teach expression eval to respect the selection vector we just computed so carefully! + let result = self.add_transform.evaluate(batch.as_ref())?; Ok(( result, visitor.selection_vector, From d24a80c240862a8983a9ebaaec8ecee410ad8b07 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 15:22:27 -0700 Subject: [PATCH 060/176] refactor into checkpoint mod --- kernel/src/actions/visitors.rs | 517 +----------------------- kernel/src/checkpoint/log_replay.rs | 594 ++++++++++++++++++++++++++++ kernel/src/checkpoint/mod.rs | 50 +++ kernel/src/lib.rs | 1 + 4 files changed, 646 insertions(+), 516 deletions(-) create mode 100644 kernel/src/checkpoint/log_replay.rs create mode 100644 kernel/src/checkpoint/mod.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 55b9df210f..7f7c6dff26 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,11 +1,10 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::LazyLock; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -483,259 +482,6 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } -/// A visitor that filters actions for inclusion in a V1 spec checkpoint file. -/// -/// This visitor processes actions in newest-to-oldest order (as they appear in log -/// replay) and applies deduplication logic for both file and non-file actions to -/// produce the minimal state representation for the table. -/// -/// # File Action Filtering -/// - Keeps only the first occurrence of each unique (path, dvId) pair -/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) -/// - Add actions represent files present in the table -/// - Unexpired remove actions represent tombstones still needed for consistency -/// -/// # Non-File Action Filtering -/// - Keeps only the first protocol action (newest version) -/// - Keeps only the first metadata action (most recent table metadata) -/// - Keeps only the first transaction action for each unique app ID -/// -/// # Excluded Actions -/// CommitInfo, CDC, Sidecar, and CheckpointMetadata actions are NOT part of the V1 checkpoint schema -/// and are filtered out. -/// -/// The resulting filtered set of actions represents the minimal set needed to reconstruct -/// the latest valid state of the table at the checkpointed version. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct V1CheckpointVisitor<'seen> { - // File actions state - deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions - selection_vector: Vec, // Used to mark rows for selection - total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema - total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema - minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp - - // Non-file actions state - seen_protocol: bool, // Used to keep only the first protocol action - seen_metadata: bool, // Used to keep only the first metadata action - seen_txns: &'seen mut HashSet, // Used to keep only the first txn action for each app ID - total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema -} - -#[allow(unused)] -impl V1CheckpointVisitor<'_> { - // These index positions correspond to the order of columns defined in - // `selected_column_names_and_types()`, and are used to extract file key information - // for deduplication purposes - const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters - const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns - const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters - const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns - - /// Creates a new V1CheckpointVisitor for filtering checkpoint actions. - /// - /// # Arguments - /// * `seen_file_keys` - Set to track already seen file keys for deduplication - /// * `is_log_batch` - True if processing a batch from a commit file, false if from a checkpoint file - /// * `selection_vector` - Vector to mark rows for selection in the output - /// * `seen_txns` - Set to track already seen transaction app IDs - /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration - fn new<'seen>( - seen_file_keys: &'seen mut HashSet, - is_log_batch: bool, - selection_vector: Vec, - minimum_file_retention_timestamp: i64, - seen_protocol: bool, - seen_metadata: bool, - seen_txns: &'seen mut HashSet, - ) -> V1CheckpointVisitor<'seen> { - V1CheckpointVisitor { - deduplicator: FileActionDeduplicator::new( - seen_file_keys, - is_log_batch, - Self::ADD_PATH_INDEX, - Self::REMOVE_PATH_INDEX, - Self::ADD_DV_START_INDEX, - Self::REMOVE_DV_START_INDEX, - ), - selection_vector, - total_file_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp, - - seen_protocol, - seen_metadata, - seen_txns, - total_non_file_actions: 0, - } - } - - /// Determines if a remove action tombstone has expired and should be excluded from the checkpoint. - /// - /// A remove action includes a timestamp indicating when the deletion occurred. Physical files - /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to - /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until - /// it expires, which happens when the deletion timestamp is less than or equal to the - /// minimum file retention timestamp. - /// - /// Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action - /// will be excluded from the checkpoint file as it will be treated as expired. - fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { - // Ideally this should never be zero, but we are following the same behavior as Delta - // Spark and the Java Kernel. - // Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action - // will be excluded from the checkpoint file as it will be treated as expired. - let mut deletion_timestamp: i64 = 0; - if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { - deletion_timestamp = ts; - } - - Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) - } - - /// Returns true if the row contains a valid file action to be included in the checkpoint. - /// This function handles both add and remove actions, applying deduplication logic and - /// tombstone expiration rules as needed. - fn is_valid_file_action<'a>( - &mut self, - i: usize, - getters: &[&'a dyn GetData<'a>], - ) -> DeltaResult { - // Never skip remove actions, as they may be unexpired tombstones. - let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? - else { - return Ok(false); - }; - - // Check if we've already seen this file action - if self.deduplicator.check_and_record_seen(file_key) { - return Ok(false); - } - - // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. - if !is_add && self.is_expired_tombstone(i, getters[5])? { - return Ok(false); - } - - if is_add { - self.total_add_actions += 1; - } - - self.total_file_actions += 1; - Ok(true) - } - - /// Returns true if the row contains a protocol action, and we haven't seen one yet. - fn is_valid_protocol_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a metadata action, and we haven't seen one yet. - fn is_valid_metadata_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a txn action with an appId that we haven't seen yet. - fn is_valid_txn_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), - }; - - // Attempting to insert the app_id into the set. If it's already present, the insert will - // return false, indicating that we've already seen this app_id. - if self.seen_txns.insert(app_id.to_string()) { - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) - } - } -} - -impl RowVisitor for V1CheckpointVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: - // 1. ADD - // 2. REMOVE - // 3. METADATA - // 4. PROTOCOL - // 5. TXN - static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - // File action columns - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (DataType::LONG, column_name!("remove.deletionTimestamp")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - // Non-file action columns - (STRING, column_name!("metaData.id")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (STRING, column_name!("txn.appId")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - NAMES_AND_TYPES.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - require!( - getters.len() == 12, - Error::InternalError(format!( - "Wrong number of visitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - // Check for non-file actions (metadata, protocol, txn) - let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? - || self.is_valid_protocol_action(i, getters[10])? - || self.is_valid_txn_action(i, getters[11])?; - - // Check for file actions (add, remove) - let is_file_action = self.is_valid_file_action(i, getters)?; - - // Mark the row for selection if it's either a valid non-file or file action - if is_non_file_action || is_file_action { - self.selection_vector[i] = true; - } - } - Ok(()) - } -} /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. @@ -1014,265 +760,4 @@ mod tests { }) ); } - - #[test] - fn test_v1_checkpoint_visitor() -> DeltaResult<()> { - let data = action_batch(); - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, - vec![false; 8], - 0, // minimum_file_retention_timestamp (no expired tombstones) - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(data.as_ref())?; - - // Combined results from both file and non-file actions - // Row 0 is an add action (included) - // Row 1 is a remove action (included) - // Row 2 is a commit info action (excluded) - // Row 3 is a protocol action (included) - // Row 4 is a metadata action (included) - // Row 5 is a cdc action (excluded) - // Row 6 is a sidecar action (excluded) - // Row 7 is a txn action (included) - let expected = vec![true, true, false, true, true, false, false, true]; - - // Verify file action results - assert_eq!(visitor.total_file_actions, 2); - assert_eq!(visitor.total_add_actions, 1); - - // Verify non-file action results - assert!(visitor.seen_protocol); - assert!(visitor.seen_metadata); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_non_file_actions, 3); - - assert_eq!(visitor.selection_vector, expected); - Ok(()) - } - - /// Tests the boundary conditions for tombstone expiration logic. - /// Specifically checks: - /// - Remove actions with deletionTimestamp == minimumFileRetentionTimestamp (should be excluded) - /// - Remove actions with deletionTimestamp < minimumFileRetentionTimestamp (should be excluded) - /// - Remove actions with deletionTimestamp > minimumFileRetentionTimestamp (should be included) - /// - Remove actions with missing deletionTimestamp (defaults to 0, should be excluded) - #[test] - fn test_v1_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - // Missing timestamp defaults to 0 - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, - ] - .into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, - vec![false; 4], - 100, // minimum_file_retention_timestamp (threshold set to 100) - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(batch.as_ref())?; - - // Only "one_above_threshold" should be kept - let expected = vec![false, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_file_actions, 1); - assert_eq!(visitor.total_add_actions, 0); - assert_eq!(visitor.total_non_file_actions, 0); - Ok(()) - } - - #[test] - fn test_v1_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - ] - .into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, - vec![false; 2], - 0, - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(batch.as_ref())?; - - // First file action should be included. The second one should be excluded due to the conflict. - let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_file_actions, 1); - assert_eq!(visitor.total_add_actions, 1); - assert_eq!(visitor.total_non_file_actions, 0); - Ok(()) - } - - #[test] - fn test_v1_checkpoint_visitor_file_actions_in_checkpoint_batch() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - ] - .into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - false, // is_log_batch = false (checkpoint batch) - vec![false; 1], - 0, - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(batch.as_ref())?; - - let expected = vec![true]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_file_actions, 1); - assert_eq!(visitor.total_add_actions, 1); - assert_eq!(visitor.total_non_file_actions, 0); - // The action should NOT be added to the seen_file_keys set as it's a checkpoint batch - // and actions in checkpoint batches do not conflict with - assert!(seen_file_keys.is_empty()); - Ok(()) - } - - #[test] - fn test_v1_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Conflicting remove action with DV - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] - .into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, - vec![false; 4], - 0, - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(batch.as_ref())?; - - // Only the first two should be included since they have different (path, DvID) keys - let expected = vec![true, true, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_file_actions, 2); - assert_eq!(visitor.total_add_actions, 2); - assert_eq!(visitor.total_non_file_actions, 0); - - Ok(()) - } - - #[test] - fn test_v1_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ].into(); - let batch = parse_json_batch(json_strings); - - // Pre-populate with txn app1 - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - seen_txns.insert("app1".to_string()); - - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, - vec![false; 3], - 0, - true, // The visior has already seen a protocol action - true, // The visitor has already seen a metadata action - &mut seen_txns, // Pre-populated transaction - ); - - visitor.visit_rows_of(batch.as_ref())?; - - // All actions should be skipped as they have already been seen - let expected = vec![false, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_non_file_actions, 0); - assert_eq!(visitor.total_file_actions, 0); - - Ok(()) - } - - #[test] - fn test_v1_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { - let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn - r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - // Duplicate metadata - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ] - .into(); - let batch = parse_json_batch(json_strings); - - let mut seen_file_keys = HashSet::new(); - let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( - &mut seen_file_keys, - true, // is_log_batch - vec![false; 7], - 0, // minimum_file_retention_timestamp - false, - false, - &mut seen_txns, - ); - - visitor.visit_rows_of(batch.as_ref())?; - - // First occurrence of each type should be included - let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs - assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata - assert_eq!(visitor.total_file_actions, 0); - - Ok(()) - } } diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs new file mode 100644 index 0000000000..e17e9d1607 --- /dev/null +++ b/kernel/src/checkpoint/log_replay.rs @@ -0,0 +1,594 @@ +//! This module implements log replay functionality specifically for checkpoint writes in delta tables. +//! +//! The primary goal is to process Delta log actions in reverse chronological order (from most recent to +//! least recent) to produce the minimal set of actions required to reconstruct the table state in a V1 checkpoint. +//! +//! ## Key Responsibilities +//! - Filtering: Only the most recent protocol and metadata actions are retained, and for each transaction +//! (identified by its app ID), only the latest action is kept. +//! - Deduplication: File actions are deduplicated based on file path and deletion vector unique ID so that +//! duplicate or obsolete actions (including remove actions) are ignored. +//! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. +//! +//! TODO: V1CheckpointLogReplayProcessor & CheckpointData is a WIP. +//! The module defines the CheckpointLogReplayProcessor which implements the LogReplayProcessor trait, +//! as well as a [`V1CheckpointVisitor`] to traverse and process batches of log actions. +//! +//! The processing result is encapsulated in CheckpointData, which includes the transformed log data and +//! a selection vector indicating which rows should be written to the checkpoint. +//! +//! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to +//! the `scan/log_replay.rs` module. +use std::collections::HashSet; +use std::sync::LazyLock; + +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; +use crate::utils::require; +use crate::{DeltaResult, Error}; + +/// A visitor that filters actions for inclusion in a V1 spec checkpoint file. +/// +/// This visitor processes actions in newest-to-oldest order (as they appear in log +/// replay) and applies deduplication logic for both file and non-file actions to +/// produce the minimal state representation for the table. +/// +/// # File Action Filtering +/// - Keeps only the first occurrence of each unique (path, dvId) pair +/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// - Add actions represent files present in the table +/// - Unexpired remove actions represent tombstones still needed for consistency +/// +/// # Non-File Action Filtering +/// - Keeps only the first protocol action (newest version) +/// - Keeps only the first metadata action (most recent table metadata) +/// - Keeps only the first transaction action for each unique app ID +/// +/// # Excluded Actions +/// CommitInfo, CDC, Sidecar, and CheckpointMetadata actions are NOT part of the V1 checkpoint schema +/// and are filtered out. +/// +/// The resulting filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table at the checkpointed version. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct V1CheckpointVisitor<'seen> { + // File actions state + deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions + selection_vector: Vec, // Used to mark rows for selection + total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema + total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema + minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp + + // Non-file actions state + seen_protocol: bool, // Used to keep only the first protocol action + seen_metadata: bool, // Used to keep only the first metadata action + seen_txns: &'seen mut HashSet, // Used to keep only the first txn action for each app ID + total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema +} + +#[allow(unused)] +impl V1CheckpointVisitor<'_> { + // These index positions correspond to the order of columns defined in + // `selected_column_names_and_types()`, and are used to extract file key information + // for deduplication purposes + const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters + const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns + const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters + const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns + + /// Creates a new V1CheckpointVisitor for filtering checkpoint actions. + /// + /// # Arguments + /// * `seen_file_keys` - Set to track already seen file keys for deduplication + /// * `is_log_batch` - True if processing a batch from a commit file, false if from a checkpoint file + /// * `selection_vector` - Vector to mark rows for selection in the output + /// * `seen_txns` - Set to track already seen transaction app IDs + /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration + pub(crate) fn new<'seen>( + seen_file_keys: &'seen mut HashSet, + is_log_batch: bool, + selection_vector: Vec, + minimum_file_retention_timestamp: i64, + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, + ) -> V1CheckpointVisitor<'seen> { + V1CheckpointVisitor { + deduplicator: FileActionDeduplicator::new( + seen_file_keys, + is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, + ), + selection_vector, + total_file_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp, + + seen_protocol, + seen_metadata, + seen_txns, + total_non_file_actions: 0, + } + } + + /// Determines if a remove action tombstone has expired and should be excluded from the checkpoint. + /// + /// A remove action includes a timestamp indicating when the deletion occurred. Physical files + /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to + /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until + /// it expires, which happens when the deletion timestamp is less than or equal to the + /// minimum file retention timestamp. + /// + /// Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action + /// will be excluded from the checkpoint file as it will be treated as expired. + fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { + // Ideally this should never be zero, but we are following the same behavior as Delta + // Spark and the Java Kernel. + // Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action + // will be excluded from the checkpoint file as it will be treated as expired. + let mut deletion_timestamp: i64 = 0; + if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { + deletion_timestamp = ts; + } + + Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) + } + + /// Returns true if the row contains a valid file action to be included in the checkpoint. + /// This function handles both add and remove actions, applying deduplication logic and + /// tombstone expiration rules as needed. + fn is_valid_file_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Never skip remove actions, as they may be unexpired tombstones. + let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? + else { + return Ok(false); + }; + + // Check if we've already seen this file action + if self.deduplicator.check_and_record_seen(file_key) { + return Ok(false); + } + + // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. + if !is_add && self.is_expired_tombstone(i, getters[5])? { + return Ok(false); + } + + if is_add { + self.total_add_actions += 1; + } + + self.total_file_actions += 1; + Ok(true) + } + + /// Returns true if the row contains a protocol action, and we haven't seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven't seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven't seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + // Attempting to insert the app_id into the set. If it's already present, the insert will + // return false, indicating that we've already seen this app_id. + if self.seen_txns.insert(app_id.to_string()) { + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } +} + +impl RowVisitor for V1CheckpointVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. ADD + // 2. REMOVE + // 3. METADATA + // 4. PROTOCOL + // 5. TXN + static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + // File action columns + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + // Non-file action columns + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + NAMES_AND_TYPES.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 12, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + // Check for non-file actions (metadata, protocol, txn) + let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? + || self.is_valid_protocol_action(i, getters[10])? + || self.is_valid_txn_action(i, getters[11])?; + + // Check for file actions (add, remove) + let is_file_action = self.is_valid_file_action(i, getters)?; + + // Mark the row for selection if it's either a valid non-file or file action + if is_non_file_action || is_file_action { + self.selection_vector[i] = true; + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::sync::Arc; + + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + + use super::*; + use crate::{ + actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, + Engine, EngineData, + }; + + // Helper function to convert a StringArray to EngineData + fn string_array_to_engine_data(string_array: StringArray) -> Box { + let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); + let schema = Arc::new(ArrowSchema::new(vec![string_field])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) + .expect("Can't convert to record batch"); + Box::new(ArrowEngineData::new(batch)) + } + + // Creates a batch of actions for testing + fn action_batch() -> Box { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, + r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, + ] + .into(); + parse_json_batch(json_strings) + } + + // Parses JSON strings into EngineData + fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + } + + #[test] + fn test_v1_checkpoint_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 8], + 0, // minimum_file_retention_timestamp (no expired tombstones) + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(data.as_ref())?; + + // Combined results from both file and non-file actions + // Row 0 is an add action (included) + // Row 1 is a remove action (included) + // Row 2 is a commit info action (excluded) + // Row 3 is a protocol action (included) + // Row 4 is a metadata action (included) + // Row 5 is a cdc action (excluded) + // Row 6 is a sidecar action (excluded) + // Row 7 is a txn action (included) + let expected = vec![true, true, false, true, true, false, false, true]; + + // Verify file action results + assert_eq!(visitor.total_file_actions, 2); + assert_eq!(visitor.total_add_actions, 1); + + // Verify non-file action results + assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_non_file_actions, 3); + + assert_eq!(visitor.selection_vector, expected); + Ok(()) + } + + /// Tests the boundary conditions for tombstone expiration logic. + /// Specifically checks: + /// - Remove actions with deletionTimestamp == minimumFileRetentionTimestamp (should be excluded) + /// - Remove actions with deletionTimestamp < minimumFileRetentionTimestamp (should be excluded) + /// - Remove actions with deletionTimestamp > minimumFileRetentionTimestamp (should be included) + /// - Remove actions with missing deletionTimestamp (defaults to 0, should be excluded) + #[test] + fn test_v1_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 4], + 100, // minimum_file_retention_timestamp (threshold set to 100) + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(batch.as_ref())?; + + // Only "one_above_threshold" should be kept + let expected = vec![false, false, true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); + assert_eq!(visitor.total_add_actions, 0); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) + } + + #[test] + fn test_v1_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 2], + 0, + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(batch.as_ref())?; + + // First file action should be included. The second one should be excluded due to the conflict. + let expected = vec![true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) + } + + #[test] + fn test_v1_checkpoint_visitor_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + false, // is_log_batch = false (checkpoint batch) + vec![false; 1], + 0, + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); + // The action should NOT be added to the seen_file_keys set as it's a checkpoint batch + // and actions in checkpoint batches do not conflict with + assert!(seen_file_keys.is_empty()); + Ok(()) + } + + #[test] + fn test_v1_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Conflicting remove action with DV + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 4], + 0, + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(batch.as_ref())?; + + // Only the first two should be included since they have different (path, DvID) keys + let expected = vec![true, true, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); + + Ok(()) + } + + #[test] + fn test_v1_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); + let batch = parse_json_batch(json_strings); + + // Pre-populate with txn app1 + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + seen_txns.insert("app1".to_string()); + + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, + vec![false; 3], + 0, + true, // The visior has already seen a protocol action + true, // The visitor has already seen a metadata action + &mut seen_txns, // Pre-populated transaction + ); + + visitor.visit_rows_of(batch.as_ref())?; + + // All actions should be skipped as they have already been seen + let expected = vec![false, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_non_file_actions, 0); + assert_eq!(visitor.total_file_actions, 0); + + Ok(()) + } + + #[test] + fn test_v1_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = V1CheckpointVisitor::new( + &mut seen_file_keys, + true, // is_log_batch + vec![false; 7], + 0, // minimum_file_retention_timestamp + false, + false, + &mut seen_txns, + ); + + visitor.visit_rows_of(batch.as_ref())?; + + // First occurrence of each type should be included + let expected = vec![true, false, true, true, false, true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs + assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata + assert_eq!(visitor.total_file_actions, 0); + + Ok(()) + } +} diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs new file mode 100644 index 0000000000..5978574904 --- /dev/null +++ b/kernel/src/checkpoint/mod.rs @@ -0,0 +1,50 @@ +//! # Delta Kernel Checkpoint API +//! +//! This module implements the API for writing single-file checkpoints in Delta tables. +//! Checkpoints provide a compact summary of the table state, enabling faster recovery by +//! avoiding full log replay. This API supports multiple checkpoint types: +//! +//! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support +//! the v2Checkpoints feature. +//! 2. **Single-file Classic-named V2 Checkpoint** – for backwards compatibility when the +//! v2Checkpoints feature is enabled. +//! 3. **Single-file UUID-named V2 Checkpoint** – the recommended option for small to medium +//! tables with v2Checkpoints support. +//! +//! TODO!(seb): API WIP +//! The API is designed using a builder pattern via the `CheckpointBuilder`, which performs +//! table feature detection and configuration validation before constructing a `CheckpointWriter`. +//! +//! The `CheckpointWriter` then orchestrates the process of: +//! - Replaying Delta log actions (via the `checkpoint/log_replay.rs` module) to filter, deduplicate, +//! and select the actions that represent the table's current state. +//! - Writing the consolidated checkpoint data to a single file. +//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata. +//! +//! ## Example +//! +//! ```ignore +//! let path = "./tests/data/app-txn-no-checkpoint"; +//! let engine = Arc::new(SyncEngine::new()); +//! let table = Table::try_from_uri(path)?; +//! +//! // Create a checkpoint builder for the table at a specific version +//! let builder = table.checkpoint(&engine, Some(2))?; +//! +//! // Optionally configure the builder (e.g., force classic naming) +//! let writer = builder.with_classic_naming(true); +//! +//! // Build the checkpoint writer +//! let mut writer = builder.build(&engine)?; +//! +//! // Retrieve checkpoint data (ensuring single consumption) +//! let checkpoint_data = writer.get_checkpoint_info()?; +//! +//! // Write checkpoint data to file and collect metadata before finalizing +//! writer.finalize_checkpoint(&engine, &checkpoint_metadata)?; +//! ``` +//! +//! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full +//! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details +//! on how log replay is used to filter and deduplicate actions for checkpoint creation. +pub mod log_replay; diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index d40a515e6f..fb6e1f069c 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,6 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; +pub mod checkpoint; pub mod engine_data; pub mod error; pub mod expressions; From 1981ab45afcf2e4d995de973b4d217908e1fbbc5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 1 Apr 2025 15:53:36 -0700 Subject: [PATCH 061/176] refactor into test_utils --- kernel/src/actions/visitors.rs | 62 +++---------------------------- kernel/src/engine/arrow_data.rs | 18 ++------- kernel/src/engine/default/json.rs | 9 +---- kernel/src/scan/mod.rs | 15 ++------ kernel/src/utils.rs | 37 +++++++++++++++++- 5 files changed, 49 insertions(+), 92 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index aa0f74f351..d68c6f9e85 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -511,45 +511,10 @@ pub(crate) fn visit_deletion_vector_at<'a>( #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use super::*; - use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, - }; - - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; - let json_strings: StringArray = vec![ - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, - r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, - r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, - ] - .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } + + use crate::utils::test_utils::action_batch; + use crate::{arrow::array::StringArray, utils::test_utils::parse_json_batch}; #[test] fn test_parse_protocol() -> DeltaResult<()> { @@ -639,8 +604,6 @@ mod tests { #[test] fn test_parse_add_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -650,10 +613,7 @@ mod tests { r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut add_visitor = AddVisitor::default(); add_visitor.visit_rows_of(batch.as_ref()).unwrap(); let add1 = Add { @@ -697,18 +657,13 @@ mod tests { #[test] fn test_parse_remove_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut remove_visitor = RemoveVisitor::default(); remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); let expected_remove = Remove { @@ -736,8 +691,6 @@ mod tests { #[test] fn test_parse_txn() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -747,10 +700,7 @@ mod tests { r#"{"txn":{"appId":"myApp2","version": 4, "lastUpdated": 1670892998177}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut txn_visitor = SetTransactionVisitor::default(); txn_visitor.visit_rows_of(batch.as_ref()).unwrap(); let mut actual = txn_visitor.set_transactions; diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 9883809013..8f33058437 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -294,27 +294,15 @@ impl ArrowEngineData { #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use crate::{ actions::{get_log_schema, Metadata, Protocol}, engine::sync::SyncEngine, - DeltaResult, Engine, EngineData, + DeltaResult, Engine, }; - use super::ArrowEngineData; - - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_md_extract() -> DeltaResult<()> { let engine = SyncEngine::new(); diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 3e0173f956..1dc35539e4 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -257,6 +257,7 @@ mod tests { use crate::engine::default::executor::tokio::{ TokioBackgroundExecutor, TokioMultiThreadExecutor, }; + use crate::utils::test_utils::string_array_to_engine_data; use futures::future; use itertools::Itertools; use object_store::local::LocalFileSystem; @@ -471,14 +472,6 @@ mod tests { } } - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_parse_json() { let store = Arc::new(LocalFileSystem::new()); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0372bfd252..a8e5da899f 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -664,8 +664,8 @@ pub fn selection_vector( // some utils that are used in file_stream.rs and state.rs tests #[cfg(test)] pub(crate) mod test_utils { - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use itertools::Itertools; use std::sync::Arc; @@ -677,20 +677,11 @@ pub(crate) mod test_utils { }, scan::log_replay::scan_action_iter, schema::SchemaRef, - EngineData, JsonHandler, + JsonHandler, }; use super::{state::ScanCallback, Transform}; - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - // Generates a batch of sidecar actions with the given paths. // The schema is provided as null columns affect equality checks. pub(crate) fn sidecar_batch_with_given_paths( diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index fd2db25013..19e23d86c2 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -13,7 +13,12 @@ pub(crate) use require; #[cfg(test)] pub(crate) mod test_utils { - use crate::arrow::array::RecordBatch; + use crate::actions::get_log_schema; + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::engine::sync::SyncEngine; + use crate::Engine; + use itertools::Itertools; use object_store::local::LocalFileSystem; use object_store::ObjectStore; @@ -97,4 +102,34 @@ pub(crate) mod test_utils { pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { assert_eq!(into_record_batch(actual), into_record_batch(expected)); } + + pub(crate) fn string_array_to_engine_data(string_array: StringArray) -> Box { + let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); + let schema = Arc::new(ArrowSchema::new(vec![string_field])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) + .expect("Can't convert to record batch"); + Box::new(ArrowEngineData::new(batch)) + } + + pub(crate) fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + } + + pub(crate) fn action_batch() -> Box { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, + r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + ] + .into(); + parse_json_batch(json_strings) + } } From 348831802b35cfcd85c6a2c4db7e2e18bf57ea06 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 12:17:22 -0700 Subject: [PATCH 062/176] merge --- kernel/src/checkpoint/log_replay.rs | 243 +++++++++++++++++++++++++- kernel/src/checkpoints/log_replay.rs | 244 --------------------------- kernel/src/checkpoints/mod.rs | 1 - kernel/src/utils.rs | 14 +- 4 files changed, 241 insertions(+), 261 deletions(-) delete mode 100644 kernel/src/checkpoints/log_replay.rs delete mode 100644 kernel/src/checkpoints/mod.rs diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e17e9d1607..f770b30dee 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -10,8 +10,7 @@ //! duplicate or obsolete actions (including remove actions) are ignored. //! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. //! -//! TODO: V1CheckpointLogReplayProcessor & CheckpointData is a WIP. -//! The module defines the CheckpointLogReplayProcessor which implements the LogReplayProcessor trait, +//! The module defines the [`V1CheckpointLogReplayProccessor`] which implements the LogReplayProcessor trait, //! as well as a [`V1CheckpointVisitor`] to traverse and process batches of log actions. //! //! The processing result is encapsulated in CheckpointData, which includes the transformed log data and @@ -20,13 +19,170 @@ //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to //! the `scan/log_replay.rs` module. use std::collections::HashSet; -use std::sync::LazyLock; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{ + FileActionDeduplicator, FileActionKey, HasSelectionVector, LogReplayProcessor, +}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; -use crate::{DeltaResult, Error}; +use crate::{DeltaResult, EngineData, Error}; + +/// `CheckpointData` contains a batch of filtered actions for checkpoint creation. +/// This structure holds a single batch of engine data along with a selection vector +/// that marks which rows should be included in the V1 checkpoint file. +/// TODO!(seb): change to type CheckpointData = FilteredEngineData, when introduced +pub struct CheckpointData { + /// The original engine data containing the actions + #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented + data: Box, + /// Boolean vector indicating which rows should be included in the checkpoint + selection_vector: Vec, +} + +impl HasSelectionVector for CheckpointData { + /// Returns true if any row in the selection vector is marked as selected + fn has_selected_rows(&self) -> bool { + self.selection_vector.contains(&true) + } +} + +/// The [`V1CheckpointLogReplayProccessor`] is an implementation of the [`LogReplayProcessor`] +/// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. +/// +/// It processes each action batch via the `process_actions_batch` method, using the +/// [`V1CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that +/// contains only the actions required for the checkpoint. +pub(crate) struct V1CheckpointLogReplayProccessor { + /// Tracks file actions that have been seen during log replay to avoid duplicates. + /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. + seen_file_keys: HashSet, + + /// Counter for the total number of actions processed during log replay. + total_actions: Arc, + + /// Counter for the total number of add actions processed during log replay. + total_add_actions: Arc, + + /// Indicates whether a protocol action has been seen in the log. + seen_protocol: bool, + + /// Indicates whether a metadata action has been seen in the log. + seen_metadata: bool, + + /// Set of transaction app IDs that have been processed to avoid duplicates. + seen_txns: HashSet, + + /// Minimum timestamp for file retention, used for filtering expired tombstones. + minimum_file_retention_timestamp: i64, +} + +impl LogReplayProcessor for V1CheckpointLogReplayProccessor { + // Define the processing result type as CheckpointData + type Output = CheckpointData; + + /// This function processes batches of actions in reverse chronological order + /// (from most recent to least recent) and performs the necessary filtering + /// to ensure the checkpoint contains only the actions needed to reconstruct + /// the complete state of the table. + /// + /// # Filtering Rules + /// + /// The following rules apply when filtering actions: + /// + /// 1. Only the most recent protocol and metadata actions are included + /// 2. For each app ID, only the most recent transaction action is included + /// 3. Add and remove actions are deduplicated based on path and unique ID + /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded + /// 5. Sidecar, commitInfo, and CDC actions are excluded + fn process_actions_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult { + // Initialize selection vector with all rows un-selected + let selection_vector = vec![false; batch.len()]; + assert_eq!( + selection_vector.len(), + batch.len(), + "Initial selection vector length does not match actions length" + ); + + // Create the checkpoint visitor to process actions and update selection vector + let mut visitor = V1CheckpointVisitor::new( + &mut self.seen_file_keys, + is_log_batch, + selection_vector, + self.minimum_file_retention_timestamp, + self.seen_protocol, + self.seen_metadata, + &mut self.seen_txns, + ); + + // Process actions and let visitor update selection vector + visitor.visit_rows_of(batch.as_ref())?; + + // Update shared counters with file action counts from this batch + self.total_actions.fetch_add( + visitor.total_file_actions + visitor.total_non_file_actions, + Ordering::SeqCst, + ); + self.total_add_actions + .fetch_add(visitor.total_add_actions, Ordering::SeqCst); + + // Update protocol and metadata seen flags + self.seen_protocol = visitor.seen_protocol; + self.seen_metadata = visitor.seen_metadata; + + Ok(CheckpointData { + data: batch, + selection_vector: visitor.selection_vector, + }) + } +} + +impl V1CheckpointLogReplayProccessor { + pub(crate) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } + } +} + +/// Given an iterator of (engine_data, bool) tuples, returns an iterator of +/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ +/// be written to the V1 checkpoint file in order to capture the table version's complete state. +/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch +/// is a log or checkpoint batch. +/// +/// Note: The iterator of (engine_data, bool) tuples 'action_iter' parameter must be sorted by the +/// order of the actions in the log from most recent to least recent. +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +pub(crate) fn checkpoint_actions_iter( + action_iter: impl Iterator, bool)>> + Send + 'static, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, +) -> impl Iterator> + Send + 'static { + let log_scanner = V1CheckpointLogReplayProccessor::new( + total_actions_counter, + total_add_actions_counter, + minimum_file_retention_timestamp, + ); + V1CheckpointLogReplayProccessor::apply_to_iterator(log_scanner, action_iter) +} /// A visitor that filters actions for inclusion in a V1 spec checkpoint file. /// @@ -591,4 +747,81 @@ mod tests { Ok(()) } + + /// Tests the end-to-end processing of multiple batches with various action types. + /// This tests the integration of the visitors with the main iterator function. + /// More granular testing is performed in the visitor tests. + #[test] + fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { + // Setup counters + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + + // Create first batch with protocol, metadata, and some files + let json_strings1: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + // Create second batch with some duplicates and new files + let json_strings2: StringArray = vec![ + // Protocol and metadata should be skipped as duplicates + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + // New files + r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, + // Duplicate file should be skipped + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# + ].into(); + + // Create third batch with all duplicate actions. + // The entire batch should be skippped as there are no selected actions to write from this batch. + let json_strings3: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + let input_batches = vec![ + Ok((parse_json_batch(json_strings1), true)), + Ok((parse_json_batch(json_strings2), true)), + Ok((parse_json_batch(json_strings3), true)), + ]; + + // Run the iterator + let results: Vec<_> = checkpoint_actions_iter( + input_batches.into_iter(), + total_actions_counter.clone(), + total_add_actions_counter.clone(), + 0, + ) + .collect::, _>>()?; + + // Expect two batches in results (third batch should be filtered out)" + assert_eq!(results.len(), 2); + + // First batch should have all rows selected + let checkpoint_data = &results[0]; + assert_eq!( + checkpoint_data.selection_vector, + vec![true, true, true, true] + ); + + // Second batch should have only new file and transaction selected + let checkpoint_data = &results[1]; + assert_eq!( + checkpoint_data.selection_vector, + vec![false, false, true, false, true] + ); + + // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); + + // 3 add actions (2 from batch1 + 1 from batch2) + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + + Ok(()) + } } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs deleted file mode 100644 index 748c4bcf16..0000000000 --- a/kernel/src/checkpoints/log_replay.rs +++ /dev/null @@ -1,244 +0,0 @@ -use std::collections::HashSet; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::Arc; - -use crate::actions::visitors::V1CheckpointVisitor; -use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; -use crate::{DeltaResult, EngineData, RowVisitor}; - -/// `CheckpointData` is a wrapper struct that contains the engine data and selection vector -/// for a batch of actions that have been processed during log replay. -/// TODO: Use `FilteredEngineData` when implemented -pub struct CheckpointData { - #[allow(unused)] - data: Box, - selection_vector: Vec, -} - -/// Implement the `HasSelectionVector` trait for `CheckpointData` to allow checking -/// whether the data contains selected rows. -impl HasSelectionVector for CheckpointData { - fn has_selected_rows(&self) -> bool { - self.selection_vector.contains(&true) - } -} - -/// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] trait -/// that filters log segment actions for inclusion in a V1 checkpoint file. -/// -/// It processes each action batch via the `process_actions_batch` method, using the -/// [`V1CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that -/// contains only the actions required for the checkpoint. -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -struct CheckpointLogReplayProcessor { - /// Tracks file actions that have been seen during log replay to avoid duplicates. - /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. - seen_file_keys: HashSet, - /// Counter for the total number of actions processed during log replay. - total_actions: Arc, - /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, - /// Indicates whether a protocol action has been seen in the log. - seen_protocol: bool, - /// Indicates whether a metadata action has been seen in the log. - seen_metadata: bool, - /// Set of transaction app IDs that have been processed to avoid duplicates. - seen_txns: HashSet, - /// Minimum timestamp for file retention, used for filtering expired tombstones. - minimum_file_retention_timestamp: i64, -} - -impl LogReplayProcessor for CheckpointLogReplayProcessor { - // Define the processing result type as a tuple of the data and selection vector - type Output = CheckpointData; - - /// This function processes batches of actions in reverse chronological order - /// (from most recent to least recent) and performs the necessary filtering - /// to ensure the checkpoint contains only the actions needed to reconstruct - /// the complete state of the table. - /// - /// # Filtering Rules - /// - /// The following rules apply when filtering actions: - /// - /// 1. Only the most recent protocol and metadata actions are included - /// 2. For each app ID, only the most recent transaction action is included - /// 3. Add and remove actions are deduplicated based on path and unique ID - /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - /// 5. Sidecar, commitInfo, and CDC actions are excluded - fn process_actions_batch( - &mut self, - batch: Box, - is_log_batch: bool, - ) -> DeltaResult { - // Initialize selection vector with all rows un-selected - let selection_vector = vec![false; batch.len()]; - assert_eq!( - selection_vector.len(), - batch.len(), - "Initial selection vector length does not match actions length" - ); - - // Create the checkpoint visitor to process actions and update selection vector - let mut visitor = V1CheckpointVisitor::new( - &mut self.seen_file_keys, - is_log_batch, - selection_vector, - self.minimum_file_retention_timestamp, - self.seen_protocol, - self.seen_metadata, - &mut self.seen_txns, - ); - - // Process actions and let visitor update selection vector - visitor.visit_rows_of(batch.as_ref())?; - - // Update shared counters with file action counts from this batch - self.total_actions.fetch_add( - visitor.total_file_actions + visitor.total_non_file_actions, - Ordering::SeqCst, - ); - self.total_add_actions - .fetch_add(visitor.total_add_actions, Ordering::SeqCst); - - // Update protocol and metadata seen flags - self.seen_protocol = visitor.seen_protocol; - self.seen_metadata = visitor.seen_metadata; - - Ok(CheckpointData { - data: batch, - selection_vector: visitor.selection_vector, - }) - } -} - -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -impl CheckpointLogReplayProcessor { - pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, - minimum_file_retention_timestamp: i64, - ) -> Self { - Self { - seen_file_keys: Default::default(), - total_actions: total_actions_counter, - total_add_actions: total_add_actions_counter, - seen_protocol: false, - seen_metadata: false, - seen_txns: Default::default(), - minimum_file_retention_timestamp, - } - } -} - -/// Given an iterator of (engine_data, bool) tuples, returns an iterator of -/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ -/// be written to the V1 checkpoint file in order to capture the table version's complete state. -/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch -/// is a log or checkpoint batch. -/// -/// Note: The iterator of (engine_data, bool) tuples 'action_iter' parameter must be sorted by the -/// order of the actions in the log from most recent to least recent. -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -pub(crate) fn checkpoint_actions_iter( - action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, - minimum_file_retention_timestamp: i64, -) -> impl Iterator> + Send + 'static { - let mut log_scanner = CheckpointLogReplayProcessor::new( - total_actions_counter, - total_add_actions_counter, - minimum_file_retention_timestamp, - ); - - CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::{AtomicI64, Ordering}; - use std::sync::Arc; - - use crate::arrow::array::StringArray; - use crate::checkpoints::log_replay::checkpoint_actions_iter; - use crate::utils::test_utils::parse_json_batch; - use crate::DeltaResult; - - /// Tests the end-to-end processing of multiple batches with various action types. - /// This tests the integration of the visitors with the main iterator function. - /// More granular testing is performed in the individual visitor tests. - #[test] - fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { - // Setup counters - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); - - // Create first batch with protocol, metadata, and some files - let json_strings1: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, - r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - ].into(); - - // Create second batch with some duplicates and new files - let json_strings2: StringArray = vec![ - // Protocol and metadata should be skipped as duplicates - r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, - r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - // New files - r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, - // Duplicate file should be skipped - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# - ].into(); - - // Create third batch with all duplicate actions. - // The entire batch should be skippped as there are no selected actions to write from this batch. - let json_strings3: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - ].into(); - - let input_batches = vec![ - Ok((parse_json_batch(json_strings1), true)), - Ok((parse_json_batch(json_strings2), true)), - Ok((parse_json_batch(json_strings3), true)), - ]; - - // Run the iterator - let results: Vec<_> = checkpoint_actions_iter( - input_batches.into_iter(), - total_actions_counter.clone(), - total_add_actions_counter.clone(), - 0, - ) - .collect::, _>>()?; - - // Expect two batches in results (third batch should be filtered out)" - assert_eq!(results.len(), 2); - - // First batch should have all rows selected - let checkpoint_data = &results[0]; - assert_eq!( - checkpoint_data.selection_vector, - vec![true, true, true, true] - ); - - // Second batch should have only new file and transaction selected - let checkpoint_data = &results[1]; - assert_eq!( - checkpoint_data.selection_vector, - vec![false, false, true, false, true] - ); - - // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); - - // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); - - Ok(()) - } -} diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs deleted file mode 100644 index 826ff771fb..0000000000 --- a/kernel/src/checkpoints/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod log_replay; diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index 431d40ee05..872a70736e 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -13,11 +13,13 @@ pub(crate) use require; #[cfg(test)] pub(crate) mod test_utils { - use crate::actions::get_log_schema; + use crate::actions::{get_log_schema, Add, Cdc, CommitInfo, Metadata, Protocol, Remove}; use crate::arrow::array::{RecordBatch, StringArray}; use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::engine::arrow_data::ArrowEngineData; use crate::engine::sync::SyncEngine; use crate::Engine; + use crate::EngineData; use itertools::Itertools; use object_store::local::LocalFileSystem; @@ -27,16 +29,6 @@ pub(crate) mod test_utils { use tempfile::TempDir; use test_utils::delta_path_for_version; - use crate::actions::get_log_schema; - use crate::arrow::array::StringArray; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use crate::engine::sync::SyncEngine; - use crate::{ - actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, - engine::arrow_data::ArrowEngineData, - }; - use crate::{Engine, EngineData}; - #[derive(Serialize)] pub(crate) enum Action { #[serde(rename = "add")] From c4e552213f1fa5269b2ea685cddf521b22415ca4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 12:18:15 -0700 Subject: [PATCH 063/176] redundant docs --- kernel/src/checkpoint/log_replay.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e17e9d1607..b17f955b9c 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -77,14 +77,6 @@ impl V1CheckpointVisitor<'_> { const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns - /// Creates a new V1CheckpointVisitor for filtering checkpoint actions. - /// - /// # Arguments - /// * `seen_file_keys` - Set to track already seen file keys for deduplication - /// * `is_log_batch` - True if processing a batch from a commit file, false if from a checkpoint file - /// * `selection_vector` - Vector to mark rows for selection in the output - /// * `seen_txns` - Set to track already seen transaction app IDs - /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration pub(crate) fn new<'seen>( seen_file_keys: &'seen mut HashSet, is_log_batch: bool, From 18d1a29cb95ab86c36840e1162d3567e4cc43abe Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 12:19:11 -0700 Subject: [PATCH 064/176] fix doc --- kernel/src/checkpoint/log_replay.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index f770b30dee..d806bd3c9d 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -13,7 +13,7 @@ //! The module defines the [`V1CheckpointLogReplayProccessor`] which implements the LogReplayProcessor trait, //! as well as a [`V1CheckpointVisitor`] to traverse and process batches of log actions. //! -//! The processing result is encapsulated in CheckpointData, which includes the transformed log data and +//! The processing result is encapsulated in [`CheckpointData`], which includes the transformed log data and //! a selection vector indicating which rows should be written to the checkpoint. //! //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to @@ -233,14 +233,6 @@ impl V1CheckpointVisitor<'_> { const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns - /// Creates a new V1CheckpointVisitor for filtering checkpoint actions. - /// - /// # Arguments - /// * `seen_file_keys` - Set to track already seen file keys for deduplication - /// * `is_log_batch` - True if processing a batch from a commit file, false if from a checkpoint file - /// * `selection_vector` - Vector to mark rows for selection in the output - /// * `seen_txns` - Set to track already seen transaction app IDs - /// * `minimum_file_retention_timestamp` - Timestamp threshold for tombstone expiration pub(crate) fn new<'seen>( seen_file_keys: &'seen mut HashSet, is_log_batch: bool, From 7dccdea708f8943cffbf6d09dc64a7f875ce2bc6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 13:45:13 -0700 Subject: [PATCH 065/176] wip --- kernel/src/checkpoint/mod.rs | 360 +++++++++++++++++++++++++++++- kernel/src/table_configuration.rs | 19 ++ 2 files changed, 378 insertions(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 5978574904..0ee17114d0 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -47,4 +47,362 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. -pub mod log_replay; + +use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::{ + sync::{atomic::AtomicI64, Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; +use url::Url; + +use crate::actions::schemas::GetStructField; +use crate::schema::{SchemaRef, StructType}; +use crate::{ + actions::{ + Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, + PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + }, + snapshot::Snapshot, + DeltaResult, Engine, EngineData, Error, +}; + +pub(crate) mod log_replay; + +/// Read schema definition for collecting checkpoint actions +static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { + StructType::new([ + Option::::get_struct_field(ADD_NAME), + Option::::get_struct_field(REMOVE_NAME), + Option::::get_struct_field(METADATA_NAME), + Option::::get_struct_field(PROTOCOL_NAME), + Option::::get_struct_field(SET_TRANSACTION_NAME), + Option::::get_struct_field(SIDECAR_NAME), + ]) + .into() +}); + +/// Returns the read schema to collect checkpoint actions +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +#[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] +fn get_checkpoint_read_schema() -> &'static SchemaRef { + &CHECKPOINT_READ_SCHEMA +} + +/// Contains the path and data for a single-file checkpoint. +/// +/// This struct holds all the necessary information for writing a checkpoint file, +/// including the destination path and the iterator over checkpoint actions. +pub struct SingleFileCheckpointData { + /// The target URL where the checkpoint file will be written + pub path: Url, + + /// Iterator over checkpoint actions to be written to the file + pub data: Box>>, +} + +/// Writer for creating checkpoint files in Delta tables. +/// +/// The CheckpointWriter orchestrates the process of writing checkpoint data to storage. +/// It manages the one-time consumption of checkpoint data and tracks statistics +/// about the actions included in the checkpoint. +pub struct CheckpointWriter { + /// Using Option to enforce single consumption at compile time + single_file_checkpoint_data: Option, + + /// Total actions counter to be written to the last checkpoint file + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + total_actions_counter: Arc, + + /// Total add actions counter to be written to the last checkpoint file + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + total_add_actions_counter: Arc, + + /// Version of the checkpoint + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + version: i64, + + /// Number of parts of the checkpoint + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + parts: i64, + + /// Path to table's log + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + log_root: Url, +} + +impl CheckpointWriter { + /// Creates a new CheckpointWriter with the provided checkpoint data and counters + fn new( + single_file_checkpoint_data: Option, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + version: i64, + parts: i64, + log_root: Url, + ) -> Self { + Self { + single_file_checkpoint_data, + total_actions_counter, + total_add_actions_counter, + version, + parts, + log_root, + } + } + + /// Retrieves the checkpoint data and path information + /// + /// This method takes ownership of the checkpoint data, ensuring it can + /// only be consumed once. It returns an error if the data has already + /// been consumed. + pub fn get_checkpoint_info(&mut self) -> DeltaResult { + self.single_file_checkpoint_data + .take() + .ok_or_else(|| Error::generic("Checkpoint data already consumed")) + } + + /// Finalizes the checkpoint writing process + /// + /// This method should be only called AFTER writing all checkpoint data to + /// ensure proper completion of the checkpoint operation, which includes + /// writing the _last_checkpoint file. + /// + /// Metadata is a single-row EngineData batch with {size_in_bytes: i64} + /// Given the engine collected checkpoint metadata we want to extend + /// the EngineData batch with the remaining fields for the `_last_checkpoint` + /// file. + #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + fn finalize_checkpoint( + self, + _engine: &dyn Engine, + _metadata: &dyn EngineData, + ) -> DeltaResult<()> { + todo!("Implement finalize_checkpoint"); + } +} + +/// Builder for configuring and creating CheckpointWriter instances +/// +/// The CheckpointBuilder provides an interface for configuring checkpoint +/// generation. It handles table feature detection and enforces compatibility +/// between configuration options and table features. +pub struct CheckpointBuilder { + /// The table snapshot from which to create the checkpoint + snapshot: Snapshot, + + /// Whether to use classic naming for the checkpoint file + with_classic_naming: bool, +} + +impl CheckpointBuilder { + #[allow(dead_code)] // TODO: Remove when table.checkpoint is implemented + pub(crate) fn new(snapshot: Snapshot) -> Self { + Self { + snapshot, + with_classic_naming: false, + } + } + + /// Configures the builder to use the classic naming scheme + /// + /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. + /// For V1 checkpoints, this method is a no-op. + /// For V2 checkpoints, the default is UUID naming unless this method is called. + pub fn with_classic_naming(mut self, with_classic_naming: bool) -> Self { + self.with_classic_naming = with_classic_naming; + self + } + + /// Builds a CheckpointWriter based on the configuration + /// + /// This method validates the configuration against table features and creates + /// a CheckpointWriter for the appropriate checkpoint type. It performs protocol + /// table feature checks to determine if v2Checkpoints are supported. + /// + /// # Arguments + /// * `engine` - The engine implementation for data operations + /// + /// # Returns + /// * `DeltaResult` - A configured checkpoint writer on success, + /// or an error if the configuration is incompatible with table features + pub fn build(self, engine: &dyn Engine) -> DeltaResult { + let v2_checkpoints_supported = self + .snapshot + .table_configuration() + .is_v2_checkpoint_supported(); + + // Create counters for tracking actions + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + + // Create iterator over actions for checkpoint data + let checkpoint_data = checkpoint_actions_iter( + self.replay_for_checkpoint_data(engine)?, + total_actions_counter.clone(), + total_add_actions_counter.clone(), + self.deleted_file_retention_timestamp()?, + ); + + // Chain the result of create_checkpoint_metadata_batch to the checkpoint data + let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( + self.snapshot.version() as i64, + engine, + v2_checkpoints_supported, + )?); + + // Generate checkpoint path based on builder configuration + // Classic naming is required for V1 checkpoints and optional for V2 checkpoints + // let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { + // ParsedLogPath::new_classic_parquet_checkpoint( + // self.snapshot.table_root(), + // self.snapshot.version(), + // )? + // } else { + // ParsedLogPath::new_uuid_parquet_checkpoint( + // self.snapshot.table_root(), + // self.snapshot.version(), + // )? + // }; + + Ok(CheckpointWriter::new( + Some(SingleFileCheckpointData { + data: Box::new(chained), + path: Url::parse("memory://test-table/_delta_log/checkpoint.parquet").unwrap(), + }), + total_actions_counter, + total_add_actions_counter, + self.snapshot.version() as i64, + 1, + self.snapshot.log_segment().log_root.clone(), + )) + } + + /// Prepares the iterator over actions for checkpoint creation + /// + /// This method is factored out to facilitate testing and returns an iterator + /// over all actions to be included in the checkpoint. + fn replay_for_checkpoint_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult, bool)>> + Send> { + let read_schema = get_checkpoint_read_schema(); + self.snapshot.log_segment().read_actions( + engine, + read_schema.clone(), + read_schema.clone(), + None, + ) + } + + /// Calculates the cutoff timestamp for deleted file cleanup. + /// + /// This function determines the minimum timestamp before which deleted files + /// will be permanently removed during VACUUM operations, based on the table's + /// deleted_file_retention_duration property. + /// + /// Returns the cutoff timestamp in milliseconds since epoch, matching + /// the remove action's deletion_timestamp format for comparison. + /// + /// The default retention period is 7 days, matching delta-spark's behavior. + pub(crate) fn deleted_file_retention_timestamp(&self) -> DeltaResult { + let retention_duration = self + .snapshot + .table_properties() + .deleted_file_retention_duration; + + deleted_file_retention_timestamp_with_time( + retention_duration, + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, + ) + } +} + +/// Internal implementation with injectable time parameter for testing +fn deleted_file_retention_timestamp_with_time( + retention_duration: Option, + now_duration: Duration, +) -> DeltaResult { + // Use provided retention duration or default (7 days) + let retention_duration = + retention_duration.unwrap_or_else(|| Duration::from_secs(60 * 60 * 24 * 7)); + + // Convert to milliseconds for remove action deletion_timestamp comparison + let now_ms: i64 = now_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Current timestamp exceeds i64 millisecond range"))?; + + let retention_ms: i64 = retention_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Retention duration exceeds i64 millisecond range"))?; + + // Simple subtraction - will produce negative values if retention > now + Ok(now_ms - retention_ms) +} + +/// Create a batch with a single row containing the [`CheckpointMetadata`] action +/// for the V2 spec checkpoint. +/// +/// This method calls the create_one method on the expression handler to create +/// a single-row batch with the checkpoint metadata action. The method returns: +/// - None if the checkpoint is not a V2 checkpoint +/// - Some(Ok(batch)) if the batch was successfully created +fn create_checkpoint_metadata_batch( + _version: i64, + _engine: &dyn Engine, + _is_v2_checkpoint: bool, +) -> DeltaResult>> { + todo!("Implement create_checkpoint_metadata_batch"); + // if is_v2_checkpoint { + // let values: &[Scalar] = &[version.into()]; + // let checkpoint_metadata_batch = engine.get_expression_handler().create_one( + // // TODO: Include checkpointMetadata.tags when maps are supported + // Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?), + // &values, + // )?; + + // let result = CheckpointData { + // data: checkpoint_metadata_batch, + // selection_vector: vec![true], + // }; + + // Ok(Some(Ok(result))) + // } else { + // Ok(None) + // } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + + use std::time::Duration; + + #[test] + fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { + let now = Duration::from_secs(1000).as_millis() as i64; + + // Test cases + let test_cases = [ + // Default case (7 days) + (None, now - (7 * 24 * 60 * 60 * 1000)), + // Zero retention + (Some(Duration::from_secs(0)), now), + // Custom retention (2000 seconds) + // This results in a negative timestamp which is valid - as it just means that + // the retention window extends to before UNIX epoch. + (Some(Duration::from_secs(2000)), now - (2000 * 1000)), + ]; + + for (retention, expected) in test_cases { + let result = + deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; + assert_eq!(result, expected); + } + + Ok(()) + } +} diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index e2d287b60f..3b659615db 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -238,6 +238,25 @@ impl TableConfiguration { version => (2..=6).contains(&version), } } + + /// Returns `true` if V2 checkpoint is supported on this table. To support V2 checkpoint, + /// a table must support reader version 3, writer version 7, and the v2Checkpoint feature in + /// both the protocol's readerFeatures and writerFeatures. + /// + /// See: + #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] + #[allow(unused)] // needed to compile w/o default features + pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { + let read_supported = self + .protocol() + .has_reader_feature(&ReaderFeatures::V2Checkpoint) + && self.protocol.min_reader_version() == 3; + let write_supported = self + .protocol() + .has_writer_feature(&WriterFeatures::V2Checkpoint) + && self.protocol.min_writer_version() == 7; + read_supported && write_supported + } } #[cfg(test)] From a9d6c8157b2948459d5bdd4f3e305f3ebbb7661f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 14:13:33 -0700 Subject: [PATCH 066/176] include table API --- kernel/src/checkpoint/mod.rs | 56 ++++++++++++++++++------------------ kernel/src/table.rs | 32 +++++++++++++++++++++ 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 0ee17114d0..35c12d4333 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -2,7 +2,9 @@ //! //! This module implements the API for writing single-file checkpoints in Delta tables. //! Checkpoints provide a compact summary of the table state, enabling faster recovery by -//! avoiding full log replay. This API supports multiple checkpoint types: +//! avoiding full log replay. +//! +//! ## Checkpoint Types //! //! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support //! the v2Checkpoints feature. @@ -11,15 +13,15 @@ //! 3. **Single-file UUID-named V2 Checkpoint** – the recommended option for small to medium //! tables with v2Checkpoints support. //! -//! TODO!(seb): API WIP -//! The API is designed using a builder pattern via the `CheckpointBuilder`, which performs -//! table feature detection and configuration validation before constructing a `CheckpointWriter`. +//! ## Architecture +//! +//! The API is designed using a builder pattern: //! -//! The `CheckpointWriter` then orchestrates the process of: -//! - Replaying Delta log actions (via the `checkpoint/log_replay.rs` module) to filter, deduplicate, -//! and select the actions that represent the table's current state. -//! - Writing the consolidated checkpoint data to a single file. -//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata. +//! 1. [`CheckpointBuilder`] performs table feature detection and configuration validation +//! 2. [`CheckpointWriter`] is constructed from the builder and handles: +//! - Replaying Delta log actions to filter, deduplicate, and select actions +//! - Writing consolidated checkpoint data to a single file +//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata//! //! //! ## Example //! @@ -40,7 +42,11 @@ //! // Retrieve checkpoint data (ensuring single consumption) //! let checkpoint_data = writer.get_checkpoint_info()?; //! -//! // Write checkpoint data to file and collect metadata before finalizing +//! /* Write checkpoint data to file and collect metadata about the write */ +//! /* The implementation of the write is storage-specific and not shown */ +//! /* IMPORTANT: All data must be written before finalizing the checkpoint */ +//! +//! // Finalize the checkpoint by writing the _last_checkpoint file //! writer.finalize_checkpoint(&engine, &checkpoint_metadata)?; //! ``` //! @@ -68,7 +74,8 @@ use crate::{ pub(crate) mod log_replay; -/// Read schema definition for collecting checkpoint actions +/// This schema contains all the actions that we care to extract from the log +/// files for the purpose of creating a checkpoint. static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ Option::::get_struct_field(ADD_NAME), @@ -81,7 +88,8 @@ static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { .into() }); -/// Returns the read schema to collect checkpoint actions +/// This schema is used when reading actions from the Delta log +/// to ensure we capture all necessary action types. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] fn get_checkpoint_read_schema() -> &'static SchemaRef { @@ -164,11 +172,11 @@ impl CheckpointWriter { /// Finalizes the checkpoint writing process /// /// This method should be only called AFTER writing all checkpoint data to - /// ensure proper completion of the checkpoint operation, which includes - /// writing the _last_checkpoint file. + /// ensure proper completion of the checkpoint operation. This method + /// generates the `_last_checkpoint` file with metadata about the checkpoint. /// - /// Metadata is a single-row EngineData batch with {size_in_bytes: i64} - /// Given the engine collected checkpoint metadata we want to extend + /// The metadata parameter is a single-row EngineData batch containing + /// {size_in_bytes: i64} for the checkpoint file. This method will extend /// the EngineData batch with the remaining fields for the `_last_checkpoint` /// file. #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented @@ -195,7 +203,6 @@ pub struct CheckpointBuilder { } impl CheckpointBuilder { - #[allow(dead_code)] // TODO: Remove when table.checkpoint is implemented pub(crate) fn new(snapshot: Snapshot) -> Self { Self { snapshot, @@ -206,25 +213,18 @@ impl CheckpointBuilder { /// Configures the builder to use the classic naming scheme /// /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. - /// For V1 checkpoints, this method is a no-op. - /// For V2 checkpoints, the default is UUID naming unless this method is called. + /// - For V1 checkpoints, this method is a no-op. + /// - For V2 checkpoints, the default is UUID naming unless this method is called. pub fn with_classic_naming(mut self, with_classic_naming: bool) -> Self { self.with_classic_naming = with_classic_naming; self } - /// Builds a CheckpointWriter based on the configuration + /// Builds a [`CheckpointWriter`] based on the builder configuration. /// /// This method validates the configuration against table features and creates - /// a CheckpointWriter for the appropriate checkpoint type. It performs protocol + /// a [`CheckpointWriter`] for the appropriate checkpoint type. It performs protocol /// table feature checks to determine if v2Checkpoints are supported. - /// - /// # Arguments - /// * `engine` - The engine implementation for data operations - /// - /// # Returns - /// * `DeltaResult` - A configured checkpoint writer on success, - /// or an error if the configuration is incompatible with table features pub fn build(self, engine: &dyn Engine) -> DeltaResult { let v2_checkpoints_supported = self .snapshot diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 97e1596d77..f8ebb4e8ac 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -7,6 +7,7 @@ use std::path::PathBuf; use url::Url; +use crate::checkpoint::CheckpointBuilder; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -98,6 +99,37 @@ impl Table { ) } + /// Creates a [`CheckpointBuilder`] for generating table checkpoints. + /// + /// Checkpoints are compact representations of the table state that improve reading performance + /// by providing a consolidated view without requiring full log replay. + /// + /// # Checkpoint Types + /// + /// The type of checkpoint created depends on table features and builder configuration: + /// + /// 1. Classic V1 Checkpoint: Created automatically for tables without v2Checkpoints feature support. + /// - Uses classic naming format (`.checkpoint.parquet`) + /// - Created regardless of `with_classic_naming` setting + /// + /// 2. Classic V2 Checkpoint* Created when tables support v2Checkpoints feature AND + /// `with_classic_naming(true)` is specified. + /// - Uses classic naming format (`.checkpoint.parquet`) + /// - Includes additional V2 metadata + /// + /// 3. **UUID V2 Checkpoint**: Created when tables support v2Checkpoints feature AND + /// `with_classic_naming(false)` is used (default). + /// - Uses UUID naming format (`..checkpoint.parquet`) + /// - Includes additional V2 metadata + /// - Recommended for most tables that support v2Checkpoints + pub fn checkpoint( + &self, + engine: &dyn Engine, + version: Option, + ) -> DeltaResult { + Ok(CheckpointBuilder::new(self.snapshot(engine, version)?)) + } + /// Create a new write transaction for this table. pub fn new_transaction(&self, engine: &dyn Engine) -> DeltaResult { Transaction::try_new(self.snapshot(engine, None)?) From fffd8f7fb8913836f7f0f54f81c6447fe6139eaa Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 14:17:45 -0700 Subject: [PATCH 067/176] fix docs --- kernel/src/checkpoint/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 35c12d4333..3b456d768d 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -17,11 +17,12 @@ //! //! The API is designed using a builder pattern: //! -//! 1. [`CheckpointBuilder`] performs table feature detection and configuration validation -//! 2. [`CheckpointWriter`] is constructed from the builder and handles: +//! 1. [`CheckpointBuilder`] performs table feature detection and constructs the writer by: +//! - Configuring the writer with classic naming (optional) //! - Replaying Delta log actions to filter, deduplicate, and select actions -//! - Writing consolidated checkpoint data to a single file -//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata//! +//! 2. [`CheckpointWriter`] is constructed from the builder and handles: +//! - Returning consolidated checkpoint data for writing to the engine +//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata //! //! ## Example //! From 6167cf2ac4000580469b527c63116d885bde924f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 2 Apr 2025 14:30:20 -0700 Subject: [PATCH 068/176] merge --- kernel/src/checkpoint/log_replay.rs | 44 ++--------------------------- kernel/src/utils.rs | 2 ++ 2 files changed, 4 insertions(+), 42 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index b17f955b9c..e3b6fe392c 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -277,51 +277,11 @@ impl RowVisitor for V1CheckpointVisitor<'_> { #[cfg(test)] mod tests { use std::collections::HashSet; - use std::sync::Arc; - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::{action_batch, parse_json_batch}; use super::*; - use crate::{ - actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, - Engine, EngineData, - }; - - // Helper function to convert a StringArray to EngineData - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - - // Creates a batch of actions for testing - fn action_batch() -> Box { - let json_strings: StringArray = vec![ - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, - r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, - r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, - r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, - r#"{"txn":{"appId":"myApp","version": 3}}"#, - ] - .into(); - parse_json_batch(json_strings) - } - - // Parses JSON strings into EngineData - fn parse_json_batch(json_strings: StringArray) -> Box { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); - let output_schema = get_log_schema().clone(); - json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap() - } #[test] fn test_v1_checkpoint_visitor() -> DeltaResult<()> { diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index 19e23d86c2..07be9315ed 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -123,11 +123,13 @@ pub(crate) mod test_utils { pub(crate) fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); parse_json_batch(json_strings) From 0d8b3c0e49f73ccadc835657b6c7e02d969e7fea Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 10:56:58 -0700 Subject: [PATCH 069/176] hoist selection vector and data skipping filter --- kernel/src/log_replay.rs | 54 +++++++++++++++++++++++++++-------- kernel/src/scan/log_replay.rs | 17 ++++++----- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 79ad70b6fc..e7c94f727b 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,6 +18,7 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; +use crate::scan::data_skipping::DataSkippingFilter; use crate::{DeltaResult, EngineData}; use tracing::debug; @@ -43,6 +44,9 @@ impl FileActionKey { /// unique file (identified by path and deletion vector ID) is processed only once. Performing /// deduplication at the visitor level avoids having to load all actions into memory at once, /// significantly reducing memory usage for large Delta tables with extensive history. +/// +/// TODO: Modify deduplication to track only file paths instead of (path, dv_unique_id). +/// More info here: https://github.com/delta-io/delta-kernel-rs/issues/701 pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set @@ -201,16 +205,18 @@ impl<'seen> FileActionDeduplicator<'seen> { /// - **Apply custom filtering logic** based on the processor’s purpose (e.g., checkpointing, scanning). /// /// Implementations: -/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects relevant -/// file actions to reconstruct the table state at a specific point in time. -/// - `V1CheckpointLogReplayProcessor`(WIP): Will be responsible for processing log batches to construct -/// V1 spec checkpoint files, ensuring only necessary metadata and file actions are retained. +/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated +/// `Add` actions from log batches to reconstruct the view of the table at a specific point in time. +/// Note that scans do not expose `Remove` actions. +/// - `V1CheckpointLogReplayProcessor`(WIP): Will be responsible for processing log batches to construct +/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, +/// such as `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state. /// /// The `Output` type must implement [`HasSelectionVector`] to enable filtering of batches /// with no selected rows. /// /// TODO: Refactor the Change Data Feed (CDF) processor to use this trait. -pub(crate) trait LogReplayProcessor { +pub(crate) trait LogReplayProcessor: Sized { /// The type of results produced by this processor must implement the /// `HasSelectionVector` trait to allow filtering out batches with no selected rows. type Output: HasSelectionVector; @@ -241,23 +247,47 @@ pub(crate) trait LogReplayProcessor { /// /// Note: This is an associated function rather than an instance method because the /// returned iterator needs to own the processor. - fn apply_to_iterator( - mut processor: impl LogReplayProcessor, + fn process_batches( + mut self, action_iter: impl Iterator, bool)>>, - ) -> impl Iterator> - where - Self::Output: HasSelectionVector, - { + ) -> impl Iterator> { action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - processor.process_actions_batch(batch, is_log_batch) + self.process_actions_batch(batch, is_log_batch) }) .filter(|res| { + // TODO: Leverage .is_none_or() when msrv = 1.82 res.as_ref() .map_or(true, |result| result.has_selected_rows()) }) } + + /// Builds the initial selection vector for the action batch, used to filter out rows that + /// are not relevant to the current processor's purpose (e.g., checkpointing, scanning). + /// This method performs a first pass of filtering using an optional [`DataSkippingFilter`]. + /// If no filter is provided, it assumes that all rows should be selected. + /// + /// The selection vector is further updated based on the processor's logic in the + /// `process_actions_batch` method. + /// + /// # Arguments + /// - `batch` - A reference to the batch of actions to be processed. + /// + /// # Returns + /// A `DeltaResult>`, where each boolean indicates if the corresponding row should be included. + /// If no filter is provided, all rows are selected. + fn build_selection_vector(&self, batch: &dyn EngineData) -> DeltaResult> { + match self.get_data_skipping_filter() { + Some(filter) => filter.apply(batch), + None => Ok(vec![true; batch.len()]), // If no filter is provided, select all rows + } + } + + /// Returns an optional reference to the [`DataSkippingFilter`] used to filter rows + /// when building the initial selection vector in `build_selection_vector`. + /// If `None` is returned, all rows are selected. + fn get_data_skipping_filter(&self) -> Option<&DataSkippingFilter>; } /// This trait is used to determine if a processor's output contains any selected rows. diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 845fb09f00..cdfbf1fa86 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -342,12 +342,11 @@ impl LogReplayProcessor for ScanLogReplayProcessor { batch: Box, is_log_batch: bool, ) -> DeltaResult { - // Apply data skipping to get back a selection vector for actions that passed skipping. We - // will update the vector below as log replay identifies duplicates that should be ignored. - let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(batch.as_ref())?, - None => vec![true; batch.len()], - }; + // Build an initial selection vector for the batch which has had the data skipping filter + // applied. The selection vector is further updated by the deduplication visitor to remove + // rows that are not valid adds. + let selection_vector = self.build_selection_vector(batch.as_ref())?; + assert_eq!(selection_vector.len(), batch.len()); let logical_schema = self.logical_schema.clone(); @@ -372,6 +371,10 @@ impl LogReplayProcessor for ScanLogReplayProcessor { visitor.row_transform_exprs, )) } + + fn get_data_skipping_filter(&self) -> Option<&DataSkippingFilter> { + self.data_skipping_filter.as_ref() + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -388,7 +391,7 @@ pub(crate) fn scan_action_iter( let log_scanner = ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - ScanLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) + log_scanner.process_batches(action_iter) } #[cfg(test)] From 43760a5e20c655a3761a2a18ae3c0814c89437f4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 11:00:41 -0700 Subject: [PATCH 070/176] docs --- kernel/src/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e7c94f727b..83095162ce 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -207,7 +207,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// Implementations: /// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated /// `Add` actions from log batches to reconstruct the view of the table at a specific point in time. -/// Note that scans do not expose `Remove` actions. +/// Note that scans do not expose `Remove` actions. /// - `V1CheckpointLogReplayProcessor`(WIP): Will be responsible for processing log batches to construct /// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, /// such as `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state. @@ -286,7 +286,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Returns an optional reference to the [`DataSkippingFilter`] used to filter rows /// when building the initial selection vector in `build_selection_vector`. - /// If `None` is returned, all rows are selected. + /// If `None` is returned, no filter is applied, and all rows are selected. fn get_data_skipping_filter(&self) -> Option<&DataSkippingFilter>; } From 1137be67c82872286ddb07c0a236fa429b957592 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 13:48:28 -0700 Subject: [PATCH 071/176] refactorg --- kernel/src/checkpoint/log_replay.rs | 203 ++++++++++++++++++---------- 1 file changed, 134 insertions(+), 69 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e3b6fe392c..8f8cbdaebf 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -55,15 +55,15 @@ use crate::{DeltaResult, Error}; pub(crate) struct V1CheckpointVisitor<'seen> { // File actions state deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions - selection_vector: Vec, // Used to mark rows for selection + selection_vector: Vec, // Tracks which rows to include in the final output total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp // Non-file actions state - seen_protocol: bool, // Used to keep only the first protocol action - seen_metadata: bool, // Used to keep only the first metadata action - seen_txns: &'seen mut HashSet, // Used to keep only the first txn action for each app ID + seen_protocol: bool, // Flag to keep only the first protocol action + seen_metadata: bool, // Flag to keep only the first metadata action + seen_txns: &'seen mut HashSet, // Set of transaction IDs to deduplicate by appId total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema } @@ -130,87 +130,161 @@ impl V1CheckpointVisitor<'_> { Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) } - /// Returns true if the row contains a valid file action to be included in the checkpoint. - /// This function handles both add and remove actions, applying deduplication logic and + /// Processes a potential file action to determine if it should be included in the checkpoint. + /// + /// Returns Some(Ok(())) if the row contains a valid file action to be included in the checkpoint. + /// Returns None if the row doesn't contain a file action or should be skipped. + /// Returns Some(Err(...)) if there was an error processing the action. + /// Note: This function handles both add and remove actions, applying deduplication logic and /// tombstone expiration rules as needed. - fn is_valid_file_action<'a>( + fn check_file_action<'a>( &mut self, i: usize, getters: &[&'a dyn GetData<'a>], - ) -> DeltaResult { - // Never skip remove actions, as they may be unexpired tombstones. - let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? - else { - return Ok(false); + ) -> Option> { + // Extract file action key + let file_action = self.deduplicator.extract_file_action( + i, getters, + false, // Do not skip remove actions, as they may be unexpired tombstones + ); + + let file_action = match file_action { + Err(e) => return Some(Err(e)), + Ok(action) => action, + }; + + // Check if this is a file action at all + let (file_key, is_add) = match file_action { + None => return None, + Some(action) => action, }; // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { - return Ok(false); + return None; // Skip duplicates } - // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. - if !is_add && self.is_expired_tombstone(i, getters[5])? { - return Ok(false); + // For remove actions, check if it's an expired tombstone + if !is_add { + match self.is_expired_tombstone(i, getters[5]) { + Ok(true) => return None, // Skip expired tombstones + Ok(false) => {} // Not expired, continue + Err(e) => return Some(Err(e)), // Error checking expiration + } } + // Valid, non-duplicate file action if is_add { self.total_add_actions += 1; } - self.total_file_actions += 1; - Ok(true) + Some(Ok(())) // Include this action } - /// Returns true if the row contains a protocol action, and we haven't seen one yet. - fn is_valid_protocol_action<'a>( + /// Processes a potential protocol action to determine if it should be included in the checkpoint. + /// + /// Returns Some(Ok(())) if the row contains a valid protocol action. + /// Returns None if the row doesn't contain a protocol action or is a duplicate. + /// Returns Some(Err(...)) if there was an error processing the action. + fn check_protocol_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) + ) -> Option> { + // Check for protocol field + let min_reader_version = match getter.get_int(i, "protocol.minReaderVersion") { + Err(e) => return Some(Err(e)), + Ok(None) => return None, // Not a protocol action + Ok(Some(_)) => (), // It is a protocol action + }; + + // Skip duplicates + if self.seen_protocol { + return None; } - } - /// Returns true if the row contains a metadata action, and we haven't seen one yet. - fn is_valid_metadata_action<'a>( + // Valid, non-duplicate protocol action + self.seen_protocol = true; + self.total_non_file_actions += 1; + Some(Ok(())) // Include this action + } + /// Processes a potential metadata action to determine if it should be included in the checkpoint. + /// + /// Returns Some(Ok(())) if the row contains a valid metadata action. + /// Returns None if the row doesn't contain a metadata action or is a duplicate. + /// Returns Some(Err(...)) if there was an error processing the action. + fn check_metadata_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) + ) -> Option> { + // Check for metadata field + match getter.get_str(i, "metaData.id") { + Err(e) => return Some(Err(e)), + Ok(None) => return None, // Not a metadata action + Ok(Some(_)) => (), // It is a metadata action + }; + + // Skip duplicates + if self.seen_metadata { + return None; } - } - /// Returns true if the row contains a txn action with an appId that we haven't seen yet. - fn is_valid_txn_action<'a>( + // Valid, non-duplicate metadata action + self.seen_metadata = true; + self.total_non_file_actions += 1; + Some(Ok(())) // Include this action + } + /// Processes a potential txn action to determine if it should be included in the checkpoint. + /// + /// Returns Some(Ok(())) if the row contains a valid txn action. + /// Returns None if the row doesn't contain a txn action or is a duplicate. + /// Returns Some(Err(...)) if there was an error processing the action. + fn check_txn_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), + ) -> Option> { + // Check for txn field + let app_id = match getter.get_str(i, "txn.appId") { + Err(e) => return Some(Err(e)), + Ok(None) => return None, // Not a txn action + Ok(Some(id)) => id, }; - // Attempting to insert the app_id into the set. If it's already present, the insert will - // return false, indicating that we've already seen this app_id. - if self.seen_txns.insert(app_id.to_string()) { - self.total_non_file_actions += 1; - Ok(true) - } else { - Ok(false) + // If the app ID already exists in the set, the insertion will return false, + // indicating that this is a duplicate. + if !self.seen_txns.insert(app_id.to_string()) { + return None; } + + // Valid, non-duplicate txn action + self.total_non_file_actions += 1; + Some(Ok(())) // Include this action + } + + /// Determines if a row in the batch should be included in the checkpoint by checking + /// if it contains any valid action type. + /// + /// Note:This method checks each action type in sequence and prioritizes file actions as + /// they appear most frequently, followed by transaction, protocol, and metadata actions. + pub(crate) fn is_valid_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Try each action type in sequence, stopping at the first match. + // We check file actions first as they appear most frequently in the log, + // followed by txn, protocol, and metadata actions in descending order of frequency. + let is_valid = self + .check_file_action(i, getters) + .or_else(|| self.check_txn_action(i, getters[11])) + .or_else(|| self.check_protocol_action(i, getters[10])) + .or_else(|| self.check_metadata_action(i, getters[9])) + .transpose()? // Swap the Result outside and return if Err + .is_some(); // If we got Some(Ok(())), it's a valid action + + Ok(is_valid) } } @@ -257,17 +331,8 @@ impl RowVisitor for V1CheckpointVisitor<'_> { ); for i in 0..row_count { - // Check for non-file actions (metadata, protocol, txn) - let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? - || self.is_valid_protocol_action(i, getters[10])? - || self.is_valid_txn_action(i, getters[11])?; - - // Check for file actions (add, remove) - let is_file_action = self.is_valid_file_action(i, getters)?; - - // Mark the row for selection if it's either a valid non-file or file action - if is_non_file_action || is_file_action { - self.selection_vector[i] = true; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_action(i, getters)?; } } Ok(()) @@ -291,7 +356,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, - vec![false; 8], + vec![true; 8], 0, // minimum_file_retention_timestamp (no expired tombstones) false, false, @@ -348,7 +413,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, - vec![false; 4], + vec![true; 4], 100, // minimum_file_retention_timestamp (threshold set to 100) false, false, @@ -381,7 +446,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, - vec![false; 2], + vec![true; 2], 0, false, false, @@ -412,7 +477,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, false, // is_log_batch = false (checkpoint batch) - vec![false; 1], + vec![true; 1], 0, false, false, @@ -451,7 +516,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, - vec![false; 4], + vec![true; 4], 0, false, false, @@ -487,7 +552,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, - vec![false; 3], + vec![true; 3], 0, true, // The visior has already seen a protocol action true, // The visitor has already seen a metadata action @@ -525,7 +590,7 @@ mod tests { let mut visitor = V1CheckpointVisitor::new( &mut seen_file_keys, true, // is_log_batch - vec![false; 7], + vec![true; 7], 0, // minimum_file_retention_timestamp false, false, From 6e3d7222765bc1efa946849dcc1daf6c739f23e3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 13:54:50 -0700 Subject: [PATCH 072/176] docs --- kernel/src/checkpoint/log_replay.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 8f8cbdaebf..a3885aa96a 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -191,8 +191,8 @@ impl V1CheckpointVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> Option> { - // Check for protocol field - let min_reader_version = match getter.get_int(i, "protocol.minReaderVersion") { + // minReaderVersion is a required field, so we check for its presence to determine if this is a protocol action. + match getter.get_int(i, "protocol.minReaderVersion") { Err(e) => return Some(Err(e)), Ok(None) => return None, // Not a protocol action Ok(Some(_)) => (), // It is a protocol action @@ -218,7 +218,7 @@ impl V1CheckpointVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> Option> { - // Check for metadata field + // id is a required field, so we check for its presence to determine if this is a metadata action. match getter.get_str(i, "metaData.id") { Err(e) => return Some(Err(e)), Ok(None) => return None, // Not a metadata action From 2252cec7240a33dc742c60a2a03fd4270301acc3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 14:12:26 -0700 Subject: [PATCH 073/176] match simplification --- kernel/src/checkpoint/log_replay.rs | 30 ++++++++++------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index a3885aa96a..e6e91a16c6 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -142,21 +142,11 @@ impl V1CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> Option> { - // Extract file action key - let file_action = self.deduplicator.extract_file_action( - i, getters, - false, // Do not skip remove actions, as they may be unexpired tombstones - ); - - let file_action = match file_action { + // Extract the file action and handle errors immediately + let (file_key, is_add) = match self.deduplicator.extract_file_action(i, getters, false) { + Ok(Some(action)) => action, + Ok(None) => return None, // If no file action is found, skip this row Err(e) => return Some(Err(e)), - Ok(action) => action, - }; - - // Check if this is a file action at all - let (file_key, is_add) = match file_action { - None => return None, - Some(action) => action, }; // Check if we've already seen this file action @@ -193,9 +183,9 @@ impl V1CheckpointVisitor<'_> { ) -> Option> { // minReaderVersion is a required field, so we check for its presence to determine if this is a protocol action. match getter.get_int(i, "protocol.minReaderVersion") { - Err(e) => return Some(Err(e)), - Ok(None) => return None, // Not a protocol action Ok(Some(_)) => (), // It is a protocol action + Ok(None) => return None, // Not a protocol action + Err(e) => return Some(Err(e)), }; // Skip duplicates @@ -220,9 +210,9 @@ impl V1CheckpointVisitor<'_> { ) -> Option> { // id is a required field, so we check for its presence to determine if this is a metadata action. match getter.get_str(i, "metaData.id") { - Err(e) => return Some(Err(e)), - Ok(None) => return None, // Not a metadata action Ok(Some(_)) => (), // It is a metadata action + Ok(None) => return None, // Not a metadata action + Err(e) => return Some(Err(e)), }; // Skip duplicates @@ -247,9 +237,9 @@ impl V1CheckpointVisitor<'_> { ) -> Option> { // Check for txn field let app_id = match getter.get_str(i, "txn.appId") { - Err(e) => return Some(Err(e)), - Ok(None) => return None, // Not a txn action Ok(Some(id)) => id, + Ok(None) => return None, // Not a txn action + Err(e) => return Some(Err(e)), }; // If the app ID already exists in the set, the insertion will return false, From 09f3930e2a1cd775448e6ddd9ab2c84096c6700d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 3 Apr 2025 14:22:39 -0700 Subject: [PATCH 074/176] docs --- kernel/src/log_replay.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 83095162ce..6322c43370 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -236,17 +236,12 @@ pub(crate) trait LogReplayProcessor: Sized { is_log_batch: bool, ) -> DeltaResult; - /// Applies a processor to an action iterator and filters out empty results. + /// Applies the processor to an actions iterator and filters out empty results. /// /// # Arguments - /// * `processor` - The processor implementation to apply /// * `action_iter` - Iterator of action batches and their source flags /// - /// Returns an iterator that yields processed results, filtering out batches - /// where no rows were selected - /// - /// Note: This is an associated function rather than an instance method because the - /// returned iterator needs to own the processor. + /// Returns an iterator that yields the Output type of the processor. fn process_batches( mut self, action_iter: impl Iterator, bool)>>, From 3efeef64d422b9794fdc2e45ae9e3bee93ebdb4b Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 4 Apr 2025 13:34:35 -0700 Subject: [PATCH 075/176] docs and rename --- kernel/src/checkpoint/log_replay.rs | 57 ++++++++++++++++------------- kernel/src/checkpoint/mod.rs | 2 +- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e6e91a16c6..45ee483c5b 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -12,7 +12,7 @@ //! //! TODO: V1CheckpointLogReplayProcessor & CheckpointData is a WIP. //! The module defines the CheckpointLogReplayProcessor which implements the LogReplayProcessor trait, -//! as well as a [`V1CheckpointVisitor`] to traverse and process batches of log actions. +//! as well as a [`CheckpointVisitor`] to traverse and process batches of log actions. //! //! The processing result is encapsulated in CheckpointData, which includes the transformed log data and //! a selection vector indicating which rows should be written to the checkpoint. @@ -51,24 +51,29 @@ use crate::{DeltaResult, Error}; /// /// The resulting filtered set of actions represents the minimal set needed to reconstruct /// the latest valid state of the table at the checkpointed version. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct V1CheckpointVisitor<'seen> { - // File actions state - deduplicator: FileActionDeduplicator<'seen>, // Used to deduplicate file actions - selection_vector: Vec, // Tracks which rows to include in the final output - total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema - total_add_actions: i64, // i64 to match the `_last_checkpoint` file schema - minimum_file_retention_timestamp: i64, // i64 for comparison with remove.deletionTimestamp - - // Non-file actions state - seen_protocol: bool, // Flag to keep only the first protocol action - seen_metadata: bool, // Flag to keep only the first metadata action - seen_txns: &'seen mut HashSet, // Set of transaction IDs to deduplicate by appId - total_non_file_actions: i64, // i64 to match the `_last_checkpoint` file schema +pub(crate) struct CheckpointVisitor<'seen> { + // Used to deduplicate file actions + deduplicator: FileActionDeduplicator<'seen>, + // Tracks which rows to include in the final output + selection_vector: Vec, + // i64 to match the `_last_checkpoint` file schema + total_file_actions: i64, + // i64 to match the `_last_checkpoint` file schema + total_add_actions: i64, + // i64 for comparison with remove.deletionTimestamp + minimum_file_retention_timestamp: i64, + // Flag to keep only the first protocol action + seen_protocol: bool, + // Flag to keep only the first metadata action + seen_metadata: bool, + // Set of transaction IDs to deduplicate by appId + seen_txns: &'seen mut HashSet, + // i64 to match the `_last_checkpoint` file schema + total_non_file_actions: i64, } #[allow(unused)] -impl V1CheckpointVisitor<'_> { +impl CheckpointVisitor<'_> { // These index positions correspond to the order of columns defined in // `selected_column_names_and_types()`, and are used to extract file key information // for deduplication purposes @@ -85,8 +90,8 @@ impl V1CheckpointVisitor<'_> { seen_protocol: bool, seen_metadata: bool, seen_txns: &'seen mut HashSet, - ) -> V1CheckpointVisitor<'seen> { - V1CheckpointVisitor { + ) -> CheckpointVisitor<'seen> { + CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, is_log_batch, @@ -278,7 +283,7 @@ impl V1CheckpointVisitor<'_> { } } -impl RowVisitor for V1CheckpointVisitor<'_> { +impl RowVisitor for CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD @@ -343,7 +348,7 @@ mod tests { let data = action_batch(); let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, vec![true; 8], @@ -400,7 +405,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, vec![true; 4], @@ -433,7 +438,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, vec![true; 2], @@ -464,7 +469,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, false, // is_log_batch = false (checkpoint batch) vec![true; 1], @@ -503,7 +508,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, vec![true; 4], @@ -539,7 +544,7 @@ mod tests { let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, vec![true; 3], @@ -577,7 +582,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, // is_log_batch vec![true; 7], diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 5978574904..39322cf2d8 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -47,4 +47,4 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. -pub mod log_replay; +mod log_replay; From 63f02944f64d860f7d58ffe1a27a99b0259f377b Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 4 Apr 2025 13:41:00 -0700 Subject: [PATCH 076/176] nits and renames --- kernel/src/log_replay.rs | 6 +++--- kernel/src/scan/log_replay.rs | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 6322c43370..fdb8ecf039 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -242,7 +242,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// * `action_iter` - Iterator of action batches and their source flags /// /// Returns an iterator that yields the Output type of the processor. - fn process_batches( + fn process_actions_iter( mut self, action_iter: impl Iterator, bool)>>, ) -> impl Iterator> { @@ -273,7 +273,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// A `DeltaResult>`, where each boolean indicates if the corresponding row should be included. /// If no filter is provided, all rows are selected. fn build_selection_vector(&self, batch: &dyn EngineData) -> DeltaResult> { - match self.get_data_skipping_filter() { + match self.data_skipping_filter() { Some(filter) => filter.apply(batch), None => Ok(vec![true; batch.len()]), // If no filter is provided, select all rows } @@ -282,7 +282,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Returns an optional reference to the [`DataSkippingFilter`] used to filter rows /// when building the initial selection vector in `build_selection_vector`. /// If `None` is returned, no filter is applied, and all rows are selected. - fn get_data_skipping_filter(&self) -> Option<&DataSkippingFilter>; + fn data_skipping_filter(&self) -> Option<&DataSkippingFilter>; } /// This trait is used to determine if a processor's output contains any selected rows. diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index cdfbf1fa86..fec275d516 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -339,15 +339,15 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn process_actions_batch( &mut self, - batch: Box, + actions_batch: Box, is_log_batch: bool, ) -> DeltaResult { // Build an initial selection vector for the batch which has had the data skipping filter // applied. The selection vector is further updated by the deduplication visitor to remove // rows that are not valid adds. - let selection_vector = self.build_selection_vector(batch.as_ref())?; + let selection_vector = self.build_selection_vector(actions_batch.as_ref())?; - assert_eq!(selection_vector.len(), batch.len()); + assert_eq!(selection_vector.len(), actions_batch.len()); let logical_schema = self.logical_schema.clone(); let transform = self.transform.clone(); @@ -361,10 +361,10 @@ impl LogReplayProcessor for ScanLogReplayProcessor { partition_filter, is_log_batch, ); - visitor.visit_rows_of(batch.as_ref())?; + visitor.visit_rows_of(actions_batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = self.add_transform.evaluate(batch.as_ref())?; + let result = self.add_transform.evaluate(actions_batch.as_ref())?; Ok(( result, visitor.selection_vector, @@ -372,7 +372,7 @@ impl LogReplayProcessor for ScanLogReplayProcessor { )) } - fn get_data_skipping_filter(&self) -> Option<&DataSkippingFilter> { + fn data_skipping_filter(&self) -> Option<&DataSkippingFilter> { self.data_skipping_filter.as_ref() } } @@ -391,7 +391,7 @@ pub(crate) fn scan_action_iter( let log_scanner = ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - log_scanner.process_batches(action_iter) + log_scanner.process_actions_iter(action_iter) } #[cfg(test)] From fab97baebaff9a3068d7b7dce3381f285781f76e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 4 Apr 2025 13:49:32 -0700 Subject: [PATCH 077/176] rename --- kernel/src/checkpoint/log_replay.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 45ee483c5b..fd93317b11 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -344,7 +344,7 @@ mod tests { use super::*; #[test] - fn test_v1_checkpoint_visitor() -> DeltaResult<()> { + fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); @@ -392,7 +392,7 @@ mod tests { /// - Remove actions with deletionTimestamp > minimumFileRetentionTimestamp (should be included) /// - Remove actions with missing deletionTimestamp (defaults to 0, should be excluded) #[test] - fn test_v1_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { + fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, @@ -427,7 +427,7 @@ mod tests { } #[test] - fn test_v1_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { + fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -460,7 +460,7 @@ mod tests { } #[test] - fn test_v1_checkpoint_visitor_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + fn test_checkpoint_visitor_file_actions_in_checkpoint_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, ] @@ -493,7 +493,7 @@ mod tests { } #[test] - fn test_v1_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV @@ -531,7 +531,7 @@ mod tests { } #[test] - fn test_v1_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -566,7 +566,7 @@ mod tests { } #[test] - fn test_v1_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn From f79d9a5a1a72a68a09457b2109ea551318499f3f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 4 Apr 2025 14:31:09 -0700 Subject: [PATCH 078/176] priv mod --- kernel/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index fb6e1f069c..68bed99e55 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,7 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; -pub mod checkpoint; +mod checkpoint; pub mod engine_data; pub mod error; pub mod expressions; From 568b59e3bf89d96348bb4ad55f015ac639e5587f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sat, 5 Apr 2025 16:58:18 -0700 Subject: [PATCH 079/176] docs --- kernel/src/log_replay.rs | 35 +++++++++++++++++++++++--------- kernel/src/scan/log_replay.rs | 38 ++++++++++++++++------------------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index fdb8ecf039..abb451c01c 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -200,20 +200,35 @@ impl<'seen> FileActionDeduplicator<'seen> { /// Log replay processors scan transaction logs in **reverse chronological order** (newest to oldest), /// filtering and transforming action batches into specialized output types. These processors: /// -/// - **Track and deduplicate file actions** to ensure only the latest relevant changes are kept. +/// - **Track and deduplicate file actions** to apply appropriate `Remove` actions to corresponding +/// `Add` actions (and omit the file from the log replay output) /// - **Maintain selection vectors** to indicate which actions in each batch should be included. /// - **Apply custom filtering logic** based on the processor’s purpose (e.g., checkpointing, scanning). +/// - **Data skipping** filters are applied to the initial selection vector to reduce the number of rows +/// processed by the processor, (if a filter is provided). /// /// Implementations: /// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated /// `Add` actions from log batches to reconstruct the view of the table at a specific point in time. -/// Note that scans do not expose `Remove` actions. -/// - `V1CheckpointLogReplayProcessor`(WIP): Will be responsible for processing log batches to construct -/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, -/// such as `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state. +/// Note that scans do not expose `Remove` actions. Data skipping may be applied when a predicate is +/// provided. /// -/// The `Output` type must implement [`HasSelectionVector`] to enable filtering of batches -/// with no selected rows. +/// - `CheckpointLogReplayProcessor` (WIP): Will be responsible for processing log batches to construct +/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, such as +/// `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state. +/// Data skipping is not applied during checkpoint processing. +/// +/// The `Output` type represents the material result of log replay, and it must implement the +/// `HasSelectionVector` trait to allow filtering of irrelevant rows: +/// +/// - For **scans**, the output type is `ScanData`, which contains the file actions (`Add` actions) that +/// need to be applied to build the table's view, accompanied by a **selection vector** that identifies +/// which rows should be included. A transform vector may also be included to handle schema changes, +/// such as renaming columns or modifying data types. +/// +/// - For **checkpoints**, the output includes the actions necessary to write to the checkpoint file (`Add`, +/// `Remove`, `Metadata`, `Protocol` actions), filtered by the **selection vector** to determine which +/// rows are included in the final checkpoint. /// /// TODO: Refactor the Change Data Feed (CDF) processor to use this trait. pub(crate) trait LogReplayProcessor: Sized { @@ -224,7 +239,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Processes a batch of actions and returns the filtered results. /// /// # Arguments - /// - `actions_batch` - A boxed [`EngineData`] instance representing a batch of actions. + /// - `actions_batch` - A reference to an [`EngineData`] instance representing a batch of actions. /// - `is_log_batch` - `true` if the batch originates from a commit log, `false` if from a checkpoint. /// /// Returns a [`DeltaResult`] containing the processor’s output, which includes only selected actions. @@ -232,7 +247,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Note: Since log replay is stateful, processing may update internal processor state (e.g., deduplication sets). fn process_actions_batch( &mut self, - actions_batch: Box, + actions_batch: &dyn EngineData, is_log_batch: bool, ) -> DeltaResult; @@ -249,7 +264,7 @@ pub(crate) trait LogReplayProcessor: Sized { action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - self.process_actions_batch(batch, is_log_batch) + self.process_actions_batch(batch.as_ref(), is_log_batch) }) .filter(|res| { // TODO: Leverage .is_none_or() when msrv = 1.82 diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index fec275d516..e929379563 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -16,23 +16,26 @@ use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructFie use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// [`ScanLogReplayProcessor`] processes Delta log replay actions specifically for scanning file data. +/// [`ScanLogReplayProcessor`] performs log replay (processes actions) specifically for doing a table scan. /// /// During a table scan, the processor reads batches of log actions (in reverse chronological order) /// and performs the following steps: /// /// - Data Skipping: Applies a predicate-based filter (via [`DataSkippingFilter`]) to quickly skip -/// rows that are irrelevant for the query. +/// files that are irrelevant for the query. /// - Partition Pruning: Uses an optional partition filter (extracted from a physical predicate) /// to exclude actions whose partition values do not meet the required criteria. /// - Action Deduplication: Leverages the [`FileActionDeduplicator`] to ensure that for each unique file /// (identified by its path and deletion vector unique ID), only the latest valid Add action is processed. -/// - Transformation: Evaluates and applies any necessary transformations to convert physical log actions -/// into a logical representation, as dictated by the table schema and optional transform logic. +/// - Transformation: Applies a built-in transformation (`add_transform`) to convert selected Add actions +/// into [`ScanData`], the intermediate format passed to the engine. +/// - Row Transform Passthrough: Any user-provided row-level transformation expressions (e.g. those derived +/// from projection or filters) are preserved and passed through to the engine, which applies them as part +/// of its scan execution logic. /// /// As an implementation of [`LogReplayProcessor`], [`ScanLogReplayProcessor`] provides the `process_actions_batch` /// method, which applies these steps to each batch of log actions and produces a [`ScanData`] result. This result -/// includes the transformed batch, a selection vector indicating which rows should be processed further, and any +/// includes the transformed batch, a selection vector indicating which rows are valid, and any /// row-level transformation expressions that need to be applied to the selected rows. struct ScanLogReplayProcessor { partition_filter: Option, @@ -339,32 +342,27 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn process_actions_batch( &mut self, - actions_batch: Box, + actions_batch: &dyn EngineData, is_log_batch: bool, ) -> DeltaResult { // Build an initial selection vector for the batch which has had the data skipping filter // applied. The selection vector is further updated by the deduplication visitor to remove // rows that are not valid adds. - let selection_vector = self.build_selection_vector(actions_batch.as_ref())?; - + let selection_vector = self.build_selection_vector(actions_batch)?; assert_eq!(selection_vector.len(), actions_batch.len()); - let logical_schema = self.logical_schema.clone(); - let transform = self.transform.clone(); - let partition_filter = self.partition_filter.clone(); - let mut visitor = AddRemoveDedupVisitor::new( &mut self.seen_file_keys, selection_vector, - logical_schema, - transform, - partition_filter, + self.logical_schema.clone(), + self.transform.clone(), + self.partition_filter.clone(), is_log_batch, ); - visitor.visit_rows_of(actions_batch.as_ref())?; + visitor.visit_rows_of(actions_batch)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = self.add_transform.evaluate(actions_batch.as_ref())?; + let result = self.add_transform.evaluate(actions_batch)?; Ok(( result, visitor.selection_vector, @@ -388,10 +386,8 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let log_scanner = - ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - - log_scanner.process_actions_iter(action_iter) + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform) + .process_actions_iter(action_iter) } #[cfg(test)] From bce9384d1e12aa686748caab0ff4c03b3bd6c2f2 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 12:00:47 -0700 Subject: [PATCH 080/176] clean up docs --- kernel/src/checkpoint/log_replay.rs | 44 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index fd93317b11..2dc3ebea72 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -10,12 +10,12 @@ //! duplicate or obsolete actions (including remove actions) are ignored. //! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. //! -//! TODO: V1CheckpointLogReplayProcessor & CheckpointData is a WIP. -//! The module defines the CheckpointLogReplayProcessor which implements the LogReplayProcessor trait, +//! TODO: `CheckpointLogReplayProcessor` struct & `CheckpointData` type +//! The module defines the CheckpointLogReplayProcessor which implements the `LogReplayProcessor` trait, //! as well as a [`CheckpointVisitor`] to traverse and process batches of log actions. //! -//! The processing result is encapsulated in CheckpointData, which includes the transformed log data and -//! a selection vector indicating which rows should be written to the checkpoint. +//! The processing result is encapsulated in `CheckpointData`, which includes the log data accompanied with +//! a selection vector indicating which rows should be included in the checkpoint file. //! //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to //! the `scan/log_replay.rs` module. @@ -52,10 +52,12 @@ use crate::{DeltaResult, Error}; /// The resulting filtered set of actions represents the minimal set needed to reconstruct /// the latest valid state of the table at the checkpointed version. pub(crate) struct CheckpointVisitor<'seen> { - // Used to deduplicate file actions + // Desduplicates file actions deduplicator: FileActionDeduplicator<'seen>, // Tracks which rows to include in the final output selection_vector: Vec, + // TODO: _last_checkpoint schema should be updated to use u64 instead of i64 + // for fields that are not expected to be negative. (Issue #786) // i64 to match the `_last_checkpoint` file schema total_file_actions: i64, // i64 to match the `_last_checkpoint` file schema @@ -140,6 +142,7 @@ impl CheckpointVisitor<'_> { /// Returns Some(Ok(())) if the row contains a valid file action to be included in the checkpoint. /// Returns None if the row doesn't contain a file action or should be skipped. /// Returns Some(Err(...)) if there was an error processing the action. + /// /// Note: This function handles both add and remove actions, applying deduplication logic and /// tombstone expiration rules as needed. fn check_file_action<'a>( @@ -198,10 +201,10 @@ impl CheckpointVisitor<'_> { return None; } - // Valid, non-duplicate protocol action + // Valid, non-duplicate protocol action to be included self.seen_protocol = true; self.total_non_file_actions += 1; - Some(Ok(())) // Include this action + Some(Ok(())) } /// Processes a potential metadata action to determine if it should be included in the checkpoint. /// @@ -225,10 +228,10 @@ impl CheckpointVisitor<'_> { return None; } - // Valid, non-duplicate metadata action + // Valid, non-duplicate metadata action to be included self.seen_metadata = true; self.total_non_file_actions += 1; - Some(Ok(())) // Include this action + Some(Ok(())) } /// Processes a potential txn action to determine if it should be included in the checkpoint. /// @@ -253,15 +256,15 @@ impl CheckpointVisitor<'_> { return None; } - // Valid, non-duplicate txn action + // Valid, non-duplicate txn action to be included self.total_non_file_actions += 1; - Some(Ok(())) // Include this action + Some(Ok(())) } /// Determines if a row in the batch should be included in the checkpoint by checking - /// if it contains any valid action type. + /// if it contains any valid action type for the checkpoint. /// - /// Note:This method checks each action type in sequence and prioritizes file actions as + /// Note: This method checks each action type in sequence, and prioritizes file actions as /// they appear most frequently, followed by transaction, protocol, and metadata actions. pub(crate) fn is_valid_action<'a>( &mut self, @@ -269,8 +272,6 @@ impl CheckpointVisitor<'_> { getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { // Try each action type in sequence, stopping at the first match. - // We check file actions first as they appear most frequently in the log, - // followed by txn, protocol, and metadata actions in descending order of frequency. let is_valid = self .check_file_action(i, getters) .or_else(|| self.check_txn_action(i, getters[11])) @@ -360,7 +361,6 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; - // Combined results from both file and non-file actions // Row 0 is an add action (included) // Row 1 is a remove action (included) // Row 2 is a commit info action (excluded) @@ -371,11 +371,8 @@ mod tests { // Row 7 is a txn action (included) let expected = vec![true, true, false, true, true, false, false, true]; - // Verify file action results assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); - - // Verify non-file action results assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -487,7 +484,8 @@ mod tests { assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); // The action should NOT be added to the seen_file_keys set as it's a checkpoint batch - // and actions in checkpoint batches do not conflict with + // and actions in checkpoint batches do not conflict with each other. + // This is a key difference from log batches, where actions can conflict. assert!(seen_file_keys.is_empty()); Ok(()) } @@ -496,11 +494,11 @@ mod tests { fn test_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV + // Same path but different DV, should be included r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry + // Duplicate of first entry, should be excluded r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Conflicting remove action with DV + // Conflicting remove action with DV, should be excluded r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, ] .into(); From 87b17d4fd70ab7e7bb26c4c9470ce282af8bbe00 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 12:29:37 -0700 Subject: [PATCH 081/176] polish docs --- kernel/src/checkpoint/mod.rs | 47 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 39322cf2d8..e0df5205cd 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,27 +1,36 @@ //! # Delta Kernel Checkpoint API //! -//! This module implements the API for writing single-file checkpoints in Delta tables. +//! This module implements the API for writing checkpoints in delta tables. //! Checkpoints provide a compact summary of the table state, enabling faster recovery by -//! avoiding full log replay. This API supports multiple checkpoint types: +//! avoiding full log replay. This API supports three checkpoint types: //! //! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support -//! the v2Checkpoints feature. -//! 2. **Single-file Classic-named V2 Checkpoint** – for backwards compatibility when the -//! v2Checkpoints feature is enabled. -//! 3. **Single-file UUID-named V2 Checkpoint** – the recommended option for small to medium -//! tables with v2Checkpoints support. +//! the `v2Checkpoints` reader/writer feature. +//! 2. **Single-file Classic-named V2 Checkpoint** – ensures backwards compatibility by +//! allowing legacy readers to recognize the checkpoint file, read the protocol action, and +//! fail gracefully. +//! 3. **Single-file UUID-named V2 Checkpoint** – the default and preferred option for small to +//! medium tables with `v2Checkpoints` reader/writer feature enabled. //! -//! TODO!(seb): API WIP -//! The API is designed using a builder pattern via the `CheckpointBuilder`, which performs -//! table feature detection and configuration validation before constructing a `CheckpointWriter`. +//! TODO!(seb): API is a WIP +//! The API follows a builder pattern using `CheckpointBuilder`, which performs tbale feature +//! detection and configuration validation. Depending on table features and builder options: //! -//! The `CheckpointWriter` then orchestrates the process of: -//! - Replaying Delta log actions (via the `checkpoint/log_replay.rs` module) to filter, deduplicate, -//! and select the actions that represent the table's current state. -//! - Writing the consolidated checkpoint data to a single file. -//! - Finalizing the checkpoint by generating a `_last_checkpoint` file with metadata. +//! - Without `v2Checkpoints`: produces a **Classic-named V1** checkpoint. +//! - With `v2Checkpoints`: produces a **UUID-named V2** checkpoint. +//! - With `v2Checkpoints` + `.classic_naming()`: produces a **Classic-named V2** checkpoint. //! -//! ## Example +//! The builder returns the `CheckpointWriter` which is responsible for: +//! - Producing the correct set of actions to be written to the checkpoint file when +//! `.get_checkpoint_info()` is called. +//! - Writing the _last_checkpoint file when `.finalize_checkpoint()` is called. +//! +//! Note: +//! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future +//! multi-file support, but the current implementation only supports single-file checkpoints. +//! - Multi-file V1 checkpoints are DEPRECATED. +//! +//! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! //! ```ignore //! let path = "./tests/data/app-txn-no-checkpoint"; @@ -32,7 +41,7 @@ //! let builder = table.checkpoint(&engine, Some(2))?; //! //! // Optionally configure the builder (e.g., force classic naming) -//! let writer = builder.with_classic_naming(true); +//! let writer = builder.with_classic_naming(); //! //! // Build the checkpoint writer //! let mut writer = builder.build(&engine)?; @@ -40,7 +49,9 @@ //! // Retrieve checkpoint data (ensuring single consumption) //! let checkpoint_data = writer.get_checkpoint_info()?; //! -//! // Write checkpoint data to file and collect metadata before finalizing +//! /* Write checkpoint data to file and collect metadata before finalizing */ +//! +//! // Write the _last_checkpoint file //! writer.finalize_checkpoint(&engine, &checkpoint_metadata)?; //! ``` //! From d8df2ea07a47ec4f268d5103fbdde255c4c0bd9f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 12:32:52 -0700 Subject: [PATCH 082/176] notes --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index e0df5205cd..b17ba7df72 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -25,7 +25,7 @@ //! `.get_checkpoint_info()` is called. //! - Writing the _last_checkpoint file when `.finalize_checkpoint()` is called. //! -//! Note: +//! Notes: //! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future //! multi-file support, but the current implementation only supports single-file checkpoints. //! - Multi-file V1 checkpoints are DEPRECATED. From 7f49ccdf4a896a6a03197bdc524863d8042e1b61 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 12:46:30 -0700 Subject: [PATCH 083/176] fix indentation --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index b17ba7df72..aa6fc48432 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -22,7 +22,7 @@ //! //! The builder returns the `CheckpointWriter` which is responsible for: //! - Producing the correct set of actions to be written to the checkpoint file when -//! `.get_checkpoint_info()` is called. +//! `.get_checkpoint_info()` is called. //! - Writing the _last_checkpoint file when `.finalize_checkpoint()` is called. //! //! Notes: From c9f6edd610856bf6ffef4dfd478b6abc04d60f43 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 12:52:39 -0700 Subject: [PATCH 084/176] bool flags --- kernel/src/checkpoint/log_replay.rs | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 2b1a8a2dfe..3b36351021 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,7 +1,7 @@ //! This module implements log replay functionality specifically for checkpoint writes in delta tables. //! //! The primary goal is to process Delta log actions in reverse chronological order (from most recent to -//! least recent) to produce the minimal set of actions required to reconstruct the table state in a V1 checkpoint. +//! least recent) to produce the minimal set of actions required to reconstruct the table state in a checkpoint. //! //! ## Key Responsibilities //! - Filtering: Only the most recent protocol and metadata actions are retained, and for each transaction @@ -10,8 +10,8 @@ //! duplicate or obsolete actions (including remove actions) are ignored. //! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. //! -//! The module defines the [`V1CheckpointLogReplayProccessor`] which implements the LogReplayProcessor trait, -//! as well as a [`V1CheckpointVisitor`] to traverse and process batches of log actions. +//! The module defines the [`CheckpointLogReplayProcessor`] which implements the LogReplayProcessor trait, +//! as well as a [`CheckpointVisitor`] to traverse and process batches of log actions. //! //! The processing result is encapsulated in [`CheckpointData`], which includes the log data accompanied with //! a selection vector indicating which rows should be included in the checkpoint file. @@ -30,11 +30,12 @@ use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; -/// `CheckpointData` contains a batch of filtered actions for checkpoint creation. -/// This structure holds a single batch of engine data along with a selection vector -/// that marks which rows should be included in the V1 checkpoint file. /// TODO!(seb): change to type CheckpointData = FilteredEngineData, when introduced -pub struct CheckpointData { +/// +/// [`CheckpointData`] contains a batch of filtered actions for checkpoint creation. +/// This structure holds a single batch of engine data along with a selection vector +/// that marks which rows should be included in the checkpoint file. +pub(crate) struct CheckpointData { /// The original engine data containing the actions #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented data: Box, @@ -49,13 +50,13 @@ impl HasSelectionVector for CheckpointData { } } -/// The [`V1CheckpointLogReplayProccessor`] is an implementation of the [`LogReplayProcessor`] +/// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. /// /// It processes each action batch via the `process_actions_batch` method, using the -/// [`V1CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that +/// [`CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that /// contains only the actions required for the checkpoint. -pub(crate) struct V1CheckpointLogReplayProccessor { +pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, @@ -79,7 +80,7 @@ pub(crate) struct V1CheckpointLogReplayProccessor { minimum_file_retention_timestamp: i64, } -impl LogReplayProcessor for V1CheckpointLogReplayProccessor { +impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as CheckpointData type Output = CheckpointData; @@ -102,8 +103,7 @@ impl LogReplayProcessor for V1CheckpointLogReplayProccessor { batch: Box, is_log_batch: bool, ) -> DeltaResult { - // Initialize selection vector with all rows un-selected - let selection_vector = vec![false; batch.len()]; + let selection_vector = vec![true; batch.len()]; assert_eq!( selection_vector.len(), batch.len(), @@ -111,7 +111,7 @@ impl LogReplayProcessor for V1CheckpointLogReplayProccessor { ); // Create the checkpoint visitor to process actions and update selection vector - let mut visitor = V1CheckpointVisitor::new( + let mut visitor = CheckpointVisitor::new( &mut self.seen_file_keys, is_log_batch, selection_vector, @@ -143,7 +143,7 @@ impl LogReplayProcessor for V1CheckpointLogReplayProccessor { } } -impl V1CheckpointLogReplayProccessor { +impl CheckpointLogReplayProcessor { pub(crate) fn new( total_actions_counter: Arc, total_add_actions_counter: Arc, @@ -176,12 +176,12 @@ pub(crate) fn checkpoint_actions_iter( total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { - let log_scanner = V1CheckpointLogReplayProccessor::new( + let log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, ); - V1CheckpointLogReplayProccessor::apply_to_iterator(log_scanner, action_iter) + CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } /// A visitor that filters actions for inclusion in a V1 spec checkpoint file. From e520d1f9e05000ded4d32c0e598bc9ec10c6cad9 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 14:13:17 -0700 Subject: [PATCH 085/176] remove atomic counters --- kernel/src/checkpoint/log_replay.rs | 104 ++++++++++++---------------- kernel/src/log_replay.rs | 4 +- kernel/src/scan/log_replay.rs | 8 +-- 3 files changed, 50 insertions(+), 66 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 3b36351021..88a5844922 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -19,22 +19,22 @@ //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to //! the `scan/log_replay.rs` module. use std::collections::HashSet; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{ FileActionDeduplicator, FileActionKey, HasSelectionVector, LogReplayProcessor, }; +use crate::scan::data_skipping::DataSkippingFilter; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; -/// TODO!(seb): change to type CheckpointData = FilteredEngineData, when introduced +/// TODO!(seb): Change this to `type CheckpointData = FilteredEngineData` once available. /// -/// [`CheckpointData`] contains a batch of filtered actions for checkpoint creation. -/// This structure holds a single batch of engine data along with a selection vector -/// that marks which rows should be included in the checkpoint file. +/// [`CheckpointData`] represents a batch of actions filtered for checkpoint creation. +/// It wraps a single engine data batch and a corresponding selection vector indicating +/// which rows should be written to the checkpoint file. pub(crate) struct CheckpointData { /// The original engine data containing the actions #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented @@ -54,49 +54,40 @@ impl HasSelectionVector for CheckpointData { /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. /// /// It processes each action batch via the `process_actions_batch` method, using the -/// [`CheckpointVisitor`] to convert each batch into a [`CheckpointData`] instance that -/// contains only the actions required for the checkpoint. +/// [`CheckpointVisitor`] to convert each [`EngineData`] batch into a [`CheckpointData`] +/// instance that reflect only the necessary actions for the checkpoint. pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - /// Counter for the total number of actions processed during log replay. - total_actions: Arc, - + total_actions: i64, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, - + total_add_actions: i64, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, - /// Indicates whether a metadata action has been seen in the log. seen_metadata: bool, - /// Set of transaction app IDs that have been processed to avoid duplicates. seen_txns: HashSet, - /// Minimum timestamp for file retention, used for filtering expired tombstones. minimum_file_retention_timestamp: i64, } impl LogReplayProcessor for CheckpointLogReplayProcessor { - // Define the processing result type as CheckpointData type Output = CheckpointData; - /// This function processes batches of actions in reverse chronological order - /// (from most recent to least recent) and performs the necessary filtering - /// to ensure the checkpoint contains only the actions needed to reconstruct - /// the complete state of the table. + /// This function is applied to each batch of actions read from the log during + /// log replay in reverse chronological order (from most recent to least recent), + /// and performs the necessary filtering and deduplication to produce the minimal + /// set of actions to be written to the checkpoint file. /// /// # Filtering Rules /// - /// The following rules apply when filtering actions: - /// /// 1. Only the most recent protocol and metadata actions are included /// 2. For each app ID, only the most recent transaction action is included /// 3. Add and remove actions are deduplicated based on path and unique ID - /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded + /// 4. Remove tombstones older than `minimum_file_retention_timestamp` are excluded /// 5. Sidecar, commitInfo, and CDC actions are excluded fn process_actions_batch( &mut self, @@ -120,17 +111,11 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { self.seen_metadata, &mut self.seen_txns, ); - - // Process actions and let visitor update selection vector visitor.visit_rows_of(batch.as_ref())?; - // Update shared counters with file action counts from this batch - self.total_actions.fetch_add( - visitor.total_file_actions + visitor.total_non_file_actions, - Ordering::SeqCst, - ); - self.total_add_actions - .fetch_add(visitor.total_add_actions, Ordering::SeqCst); + // Update counters + self.total_actions += visitor.total_file_actions + visitor.total_non_file_actions; + self.total_add_actions += visitor.total_add_actions; // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -141,12 +126,17 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { selection_vector: visitor.selection_vector, }) } + + /// Data skipping is not applicable for checkpoint log replay. + fn data_skipping_filter(&self) -> Option<&DataSkippingFilter> { + None + } } impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: i64, + total_add_actions_counter: i64, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -164,24 +154,24 @@ impl CheckpointLogReplayProcessor { /// Given an iterator of (engine_data, bool) tuples, returns an iterator of /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be written to the V1 checkpoint file in order to capture the table version's complete state. -/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch -/// is a log or checkpoint batch. +/// Non-selected rows _must_ be ignored. The boolean flag tied to each actions batch indicates +/// whether the batch is a commit batch (true) or a checkpoint batch (false). /// -/// Note: The iterator of (engine_data, bool) tuples 'action_iter' parameter must be sorted by the -/// order of the actions in the log from most recent to least recent. +/// Note: The 'action_iter' parameter is an iterator of (engine_data, bool) tuples that _must_ be +/// sorted by the order of the actions in the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: i64, + total_add_actions_counter: i64, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { - let log_scanner = CheckpointLogReplayProcessor::new( + CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, - ); - CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) + ) + .process_actions_iter(action_iter) } /// A visitor that filters actions for inclusion in a V1 spec checkpoint file. @@ -493,12 +483,11 @@ impl RowVisitor for CheckpointVisitor<'_> { #[cfg(test)] mod tests { - use std::collections::HashSet; - + use super::*; use crate::arrow::array::StringArray; use crate::utils::test_utils::{action_batch, parse_json_batch}; - - use super::*; + use itertools::Itertools; + use std::collections::HashSet; #[test] fn test_checkpoint_visitor() -> DeltaResult<()> { @@ -764,8 +753,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + let total_actions_counter = 0; + let total_add_actions_counter = 0; // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -774,7 +763,6 @@ mod tests { r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, ].into(); - // Create second batch with some duplicates and new files let json_strings2: StringArray = vec![ // Protocol and metadata should be skipped as duplicates @@ -786,28 +774,25 @@ mod tests { r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# ].into(); - // Create third batch with all duplicate actions. // The entire batch should be skippped as there are no selected actions to write from this batch. let json_strings3: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, ].into(); - let input_batches = vec![ Ok((parse_json_batch(json_strings1), true)), Ok((parse_json_batch(json_strings2), true)), Ok((parse_json_batch(json_strings3), true)), ]; - // Run the iterator let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), - total_actions_counter.clone(), - total_add_actions_counter.clone(), + total_actions_counter, + total_add_actions_counter, 0, ) - .collect::, _>>()?; + .try_collect()?; // Expect two batches in results (third batch should be filtered out)" assert_eq!(results.len(), 2); @@ -827,10 +812,9 @@ mod tests { ); // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); - + assert_eq!(total_actions_counter, 6); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + assert_eq!(total_add_actions_counter, 3); Ok(()) } diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index abb451c01c..67ad75ee0e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -247,7 +247,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Note: Since log replay is stateful, processing may update internal processor state (e.g., deduplication sets). fn process_actions_batch( &mut self, - actions_batch: &dyn EngineData, + actions_batch: Box, is_log_batch: bool, ) -> DeltaResult; @@ -264,7 +264,7 @@ pub(crate) trait LogReplayProcessor: Sized { action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - self.process_actions_batch(batch.as_ref(), is_log_batch) + self.process_actions_batch(batch, is_log_batch) }) .filter(|res| { // TODO: Leverage .is_none_or() when msrv = 1.82 diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 9d0fdb7e15..29fdc2222f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -321,13 +321,13 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn process_actions_batch( &mut self, - actions_batch: &dyn EngineData, + actions_batch: Box, is_log_batch: bool, ) -> DeltaResult { // Build an initial selection vector for the batch which has had the data skipping filter // applied. The selection vector is further updated by the deduplication visitor to remove // rows that are not valid adds. - let selection_vector = self.build_selection_vector(actions_batch)?; + let selection_vector = self.build_selection_vector(actions_batch.as_ref())?; assert_eq!(selection_vector.len(), actions_batch.len()); let mut visitor = AddRemoveDedupVisitor::new( @@ -338,10 +338,10 @@ impl LogReplayProcessor for ScanLogReplayProcessor { self.partition_filter.clone(), is_log_batch, ); - visitor.visit_rows_of(actions_batch)?; + visitor.visit_rows_of(actions_batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = self.add_transform.evaluate(actions_batch)?; + let result = self.add_transform.evaluate(actions_batch.as_ref())?; Ok(( result, visitor.selection_vector, From f31e51d0d28eeb0e5a1b866b10930628fbba896d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 15:15:44 -0700 Subject: [PATCH 086/176] box counters --- kernel/src/checkpoint/log_replay.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 88a5844922..e2647cbdd6 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -61,9 +61,9 @@ pub(crate) struct CheckpointLogReplayProcessor { /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: i64, + total_actions: Box, /// Counter for the total number of add actions processed during log replay. - total_add_actions: i64, + total_add_actions: Box, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, /// Indicates whether a metadata action has been seen in the log. @@ -114,8 +114,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { visitor.visit_rows_of(batch.as_ref())?; // Update counters - self.total_actions += visitor.total_file_actions + visitor.total_non_file_actions; - self.total_add_actions += visitor.total_add_actions; + *self.total_actions += visitor.total_file_actions + visitor.total_non_file_actions; + *self.total_add_actions += visitor.total_add_actions; // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -135,8 +135,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: i64, - total_add_actions_counter: i64, + total_actions_counter: Box, + total_add_actions_counter: Box, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -162,8 +162,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: i64, - total_add_actions_counter: i64, + total_actions_counter: Box, // Boxed to avoid lifetime complications + total_add_actions_counter: Box, // Boxed to avoid lifetime complications minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { CheckpointLogReplayProcessor::new( @@ -753,8 +753,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = 0; - let total_add_actions_counter = 0; + let total_actions_counter = Box::new(0); + let total_add_actions_counter = Box::new(0); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -788,8 +788,8 @@ mod tests { let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), - total_actions_counter, - total_add_actions_counter, + total_actions_counter.clone(), + total_add_actions_counter.clone(), 0, ) .try_collect()?; @@ -812,9 +812,9 @@ mod tests { ); // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions_counter, 6); + assert_eq!(*total_actions_counter, 6); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions_counter, 3); + assert_eq!(*total_add_actions_counter, 3); Ok(()) } From 79d6ff86857745fad4ed8ba6e9de2b28eac81b81 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 17:12:06 -0700 Subject: [PATCH 087/176] review --- kernel/src/checkpoint/log_replay.rs | 172 ++++++++++++++-------------- 1 file changed, 84 insertions(+), 88 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 2dc3ebea72..f0ba1147d7 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,7 +1,7 @@ //! This module implements log replay functionality specifically for checkpoint writes in delta tables. //! //! The primary goal is to process Delta log actions in reverse chronological order (from most recent to -//! least recent) to produce the minimal set of actions required to reconstruct the table state in a V1 checkpoint. +//! oldest) to produce the minimal set of actions required to reconstruct the table state in a checkpoint. //! //! ## Key Responsibilities //! - Filtering: Only the most recent protocol and metadata actions are retained, and for each transaction @@ -52,7 +52,7 @@ use crate::{DeltaResult, Error}; /// The resulting filtered set of actions represents the minimal set needed to reconstruct /// the latest valid state of the table at the checkpointed version. pub(crate) struct CheckpointVisitor<'seen> { - // Desduplicates file actions + // Deduplicates file actions deduplicator: FileActionDeduplicator<'seen>, // Tracks which rows to include in the final output selection_vector: Vec, @@ -82,6 +82,7 @@ impl CheckpointVisitor<'_> { const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters + const REMOVE_DELETION_TIMESTAMP_INDEX: usize = 5; // Position of "remove.deletionTimestamp" in getters const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns pub(crate) fn new<'seen>( @@ -117,10 +118,10 @@ impl CheckpointVisitor<'_> { /// Determines if a remove action tombstone has expired and should be excluded from the checkpoint. /// /// A remove action includes a timestamp indicating when the deletion occurred. Physical files - /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to - /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until - /// it expires, which happens when the deletion timestamp is less than or equal to the - /// minimum file retention timestamp. + /// are deleted lazily after a user-defined expiration time. Remove actions are kept to allow + /// concurrent readers to read snapshots at older versions. A remove action remains as a tombstone + /// in a checkpoint file until it expires, which happens when the deletion timestamp is less than + /// or equal to the minimum file retention timestamp. /// /// Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action /// will be excluded from the checkpoint file as it will be treated as expired. @@ -139,9 +140,9 @@ impl CheckpointVisitor<'_> { /// Processes a potential file action to determine if it should be included in the checkpoint. /// - /// Returns Some(Ok(())) if the row contains a valid file action to be included in the checkpoint. - /// Returns None if the row doesn't contain a file action or should be skipped. - /// Returns Some(Err(...)) if there was an error processing the action. + /// Returns Ok(Some(())) if the row contains a valid file action to be included in the checkpoint. + /// Returns Ok(None) if the row doesn't contain a file action or should be skipped. + /// Returns Err(...) if there was an error processing the action. /// /// Note: This function handles both add and remove actions, applying deduplication logic and /// tombstone expiration rules as needed. @@ -149,26 +150,23 @@ impl CheckpointVisitor<'_> { &mut self, i: usize, getters: &[&'a dyn GetData<'a>], - ) -> Option> { + ) -> DeltaResult> { // Extract the file action and handle errors immediately - let (file_key, is_add) = match self.deduplicator.extract_file_action(i, getters, false) { - Ok(Some(action)) => action, - Ok(None) => return None, // If no file action is found, skip this row - Err(e) => return Some(Err(e)), + let (file_key, is_add) = match self.deduplicator.extract_file_action(i, getters, false)? { + Some(action) => action, + None => return Ok(None), // If no file action is found, skip this row }; // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { - return None; // Skip duplicates + return Ok(None); // Skip duplicates } // For remove actions, check if it's an expired tombstone - if !is_add { - match self.is_expired_tombstone(i, getters[5]) { - Ok(true) => return None, // Skip expired tombstones - Ok(false) => {} // Not expired, continue - Err(e) => return Some(Err(e)), // Error checking expiration - } + if !is_add + && self.is_expired_tombstone(i, getters[Self::REMOVE_DELETION_TIMESTAMP_INDEX])? + { + return Ok(None); // Skip expired remove tombstones } // Valid, non-duplicate file action @@ -176,89 +174,88 @@ impl CheckpointVisitor<'_> { self.total_add_actions += 1; } self.total_file_actions += 1; - Some(Ok(())) // Include this action + Ok(Some(())) // Include this action } /// Processes a potential protocol action to determine if it should be included in the checkpoint. /// - /// Returns Some(Ok(())) if the row contains a valid protocol action. - /// Returns None if the row doesn't contain a protocol action or is a duplicate. - /// Returns Some(Err(...)) if there was an error processing the action. + /// Returns Ok(Some(())) if the row contains a valid protocol action. + /// Returns Ok(None) if the row doesn't contain a protocol action or is a duplicate. + /// Returns Err(...) if there was an error processing the action. fn check_protocol_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> Option> { - // minReaderVersion is a required field, so we check for its presence to determine if this is a protocol action. - match getter.get_int(i, "protocol.minReaderVersion") { - Ok(Some(_)) => (), // It is a protocol action - Ok(None) => return None, // Not a protocol action - Err(e) => return Some(Err(e)), - }; - + ) -> DeltaResult> { // Skip duplicates if self.seen_protocol { - return None; + return Ok(None); } + // minReaderVersion is a required field, so we check for its presence to determine if this is a protocol action. + match getter.get_int(i, "protocol.minReaderVersion")? { + Some(_) => (), // It is a protocol action + None => return Ok(None), // Not a protocol action + }; + // Valid, non-duplicate protocol action to be included self.seen_protocol = true; self.total_non_file_actions += 1; - Some(Ok(())) + Ok(Some(())) } + /// Processes a potential metadata action to determine if it should be included in the checkpoint. /// - /// Returns Some(Ok(())) if the row contains a valid metadata action. - /// Returns None if the row doesn't contain a metadata action or is a duplicate. - /// Returns Some(Err(...)) if there was an error processing the action. + /// Returns Ok(Some(())) if the row contains a valid metadata action. + /// Returns Ok(None) if the row doesn't contain a metadata action or is a duplicate. + /// Returns Err(...) if there was an error processing the action. fn check_metadata_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> Option> { - // id is a required field, so we check for its presence to determine if this is a metadata action. - match getter.get_str(i, "metaData.id") { - Ok(Some(_)) => (), // It is a metadata action - Ok(None) => return None, // Not a metadata action - Err(e) => return Some(Err(e)), - }; - + ) -> DeltaResult> { // Skip duplicates if self.seen_metadata { - return None; + return Ok(None); } + // id is a required field, so we check for its presence to determine if this is a metadata action. + match getter.get_str(i, "metaData.id")? { + Some(_) => (), // It is a metadata action + None => return Ok(None), // Not a metadata action + }; + // Valid, non-duplicate metadata action to be included self.seen_metadata = true; self.total_non_file_actions += 1; - Some(Ok(())) + Ok(Some(())) } + /// Processes a potential txn action to determine if it should be included in the checkpoint. /// - /// Returns Some(Ok(())) if the row contains a valid txn action. - /// Returns None if the row doesn't contain a txn action or is a duplicate. - /// Returns Some(Err(...)) if there was an error processing the action. + /// Returns Ok(Some(())) if the row contains a valid txn action. + /// Returns Ok(None) if the row doesn't contain a txn action or is a duplicate. + /// Returns Err(...) if there was an error processing the action. fn check_txn_action<'a>( &mut self, i: usize, getter: &'a dyn GetData<'a>, - ) -> Option> { + ) -> DeltaResult> { // Check for txn field - let app_id = match getter.get_str(i, "txn.appId") { - Ok(Some(id)) => id, - Ok(None) => return None, // Not a txn action - Err(e) => return Some(Err(e)), + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(None), // Not a txn action }; // If the app ID already exists in the set, the insertion will return false, // indicating that this is a duplicate. if !self.seen_txns.insert(app_id.to_string()) { - return None; + return Ok(None); } // Valid, non-duplicate txn action to be included self.total_non_file_actions += 1; - Some(Ok(())) + Ok(Some(())) } /// Determines if a row in the batch should be included in the checkpoint by checking @@ -266,21 +263,21 @@ impl CheckpointVisitor<'_> { /// /// Note: This method checks each action type in sequence, and prioritizes file actions as /// they appear most frequently, followed by transaction, protocol, and metadata actions. + /// + /// Returns Ok(true) if the row should be included in the checkpoint. + /// Returns Ok(false) if the row should be skipped. + /// Returns Err(...) if any validation or extraction failed. pub(crate) fn is_valid_action<'a>( &mut self, i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Try each action type in sequence, stopping at the first match. - let is_valid = self - .check_file_action(i, getters) - .or_else(|| self.check_txn_action(i, getters[11])) - .or_else(|| self.check_protocol_action(i, getters[10])) - .or_else(|| self.check_metadata_action(i, getters[9])) - .transpose()? // Swap the Result outside and return if Err - .is_some(); // If we got Some(Ok(())), it's a valid action - - Ok(is_valid) + Ok(self + .check_file_action(i, getters)? + .or(self.check_txn_action(i, getters[11])?) + .or(self.check_protocol_action(i, getters[10])?) + .or(self.check_metadata_action(i, getters[9])?) + .is_some()) } } @@ -361,15 +358,16 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; - // Row 0 is an add action (included) - // Row 1 is a remove action (included) - // Row 2 is a commit info action (excluded) - // Row 3 is a protocol action (included) - // Row 4 is a metadata action (included) - // Row 5 is a cdc action (excluded) - // Row 6 is a sidecar action (excluded) - // Row 7 is a txn action (included) - let expected = vec![true, true, false, true, true, false, false, true]; + let expected = vec![ + true, // Row 0 is an add action (included) + true, // Row 1 is a remove action (included) + false, // Row 2 is a commit info action (excluded) + true, // Row 3 is a protocol action (included) + true, // Row 4 is a metadata action (included) + false, // Row 5 is a cdc action (excluded) + false, // Row 6 is a sidecar action (excluded) + true, // Row 7 is a txn action (included) + ]; assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); @@ -493,14 +491,13 @@ mod tests { #[test] fn test_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV, should be included + // Add action for file1 with deletion vector r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry, should be excluded - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Conflicting remove action with DV, should be excluded - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] + // Remove action for file1 with a different deletion vector + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Add action for file1 with the same deletion vector as the remove action above (excluded) + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] .into(); let batch = parse_json_batch(json_strings); @@ -509,7 +506,7 @@ mod tests { let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, - vec![true; 4], + vec![true; 3], 0, false, false, @@ -518,11 +515,10 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; - // Only the first two should be included since they have different (path, DvID) keys - let expected = vec![true, true, false, false]; + let expected = vec![true, true, false]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); - assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); Ok(()) From a3cf0f223edaaa7e165935131341b12c95f157d2 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 18:07:35 -0700 Subject: [PATCH 088/176] revert --- kernel/src/checkpoint/log_replay.rs | 38 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e2647cbdd6..5567bd7cea 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -19,7 +19,8 @@ //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to //! the `scan/log_replay.rs` module. use std::collections::HashSet; -use std::sync::LazyLock; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{ @@ -60,10 +61,14 @@ pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, + + // Arc is necessary because the resulting iterator from process_actions_iter() + // is marked with 'Send + 'static', which requires all captured state to be thread-safe. /// Counter for the total number of actions processed during log replay. - total_actions: Box, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Box, + total_add_actions: Arc, + /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, /// Indicates whether a metadata action has been seen in the log. @@ -113,9 +118,14 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Update counters - *self.total_actions += visitor.total_file_actions + visitor.total_non_file_actions; - *self.total_add_actions += visitor.total_add_actions; + // We only require eventual consistency for the counters to read the final values + // after all actions have been processed, so we can use relaxed ordering. + self.total_actions.fetch_add( + visitor.total_file_actions + visitor.total_non_file_actions, + Ordering::Relaxed, + ); + self.total_add_actions + .fetch_add(visitor.total_add_actions, Ordering::Relaxed); // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -135,8 +145,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Box, - total_add_actions_counter: Box, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -162,8 +172,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Box, // Boxed to avoid lifetime complications - total_add_actions_counter: Box, // Boxed to avoid lifetime complications + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { CheckpointLogReplayProcessor::new( @@ -753,8 +763,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Box::new(0); - let total_add_actions_counter = Box::new(0); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -812,9 +822,9 @@ mod tests { ); // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(*total_actions_counter, 6); + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(*total_add_actions_counter, 3); + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); Ok(()) } From 326bea65a77cfaec2c4175acf65f0f8c803a8310 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 18:43:52 -0700 Subject: [PATCH 089/176] move logic to CheckpointWriter --- kernel/src/checkpoint/mod.rs | 223 ++++++++++++++++------------------- 1 file changed, 99 insertions(+), 124 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 3b456d768d..2c330b7d3a 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -73,9 +73,9 @@ use crate::{ DeltaResult, Engine, EngineData, Error, }; -pub(crate) mod log_replay; +mod log_replay; -/// This schema contains all the actions that we care to extract from the log +/// This schema contains all the actions that we care to extract from log /// files for the purpose of creating a checkpoint. static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -115,47 +115,24 @@ pub struct SingleFileCheckpointData { /// It manages the one-time consumption of checkpoint data and tracks statistics /// about the actions included in the checkpoint. pub struct CheckpointWriter { - /// Using Option to enforce single consumption at compile time - single_file_checkpoint_data: Option, + // The snapshot from which the checkpoint is created + snapshot: Snapshot, + // Flag indicating if the table supports the `v2Checkpoints` reader/write feature + is_v2_checkpoints_supported: bool, - /// Total actions counter to be written to the last checkpoint file - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + // TODO, i dont think arc is necessary here total_actions_counter: Arc, - - /// Total add actions counter to be written to the last checkpoint file - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented total_add_actions_counter: Arc, - - /// Version of the checkpoint - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented - version: i64, - - /// Number of parts of the checkpoint - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented - parts: i64, - - /// Path to table's log - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented - log_root: Url, } impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters - fn new( - single_file_checkpoint_data: Option, - total_actions_counter: Arc, - total_add_actions_counter: Arc, - version: i64, - parts: i64, - log_root: Url, - ) -> Self { + fn new(snapshot: Snapshot, is_v2_checkpoints_supported: bool) -> Self { Self { - single_file_checkpoint_data, - total_actions_counter, - total_add_actions_counter, - version, - parts, - log_root, + snapshot, + is_v2_checkpoints_supported, + total_actions_counter: Arc::new(AtomicI64::new(0)), + total_add_actions_counter: Arc::new(AtomicI64::new(0)), } } @@ -164,10 +141,48 @@ impl CheckpointWriter { /// This method takes ownership of the checkpoint data, ensuring it can /// only be consumed once. It returns an error if the data has already /// been consumed. - pub fn get_checkpoint_info(&mut self) -> DeltaResult { - self.single_file_checkpoint_data - .take() - .ok_or_else(|| Error::generic("Checkpoint data already consumed")) + pub fn get_checkpoint_info( + &mut self, + engine: &dyn Engine, + ) -> DeltaResult { + // Create counters for tracking actions + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + + // Create iterator over actions for checkpoint data + let checkpoint_data = checkpoint_actions_iter( + self.replay_for_checkpoint_data(engine)?, + total_actions_counter.clone(), + total_add_actions_counter.clone(), + self.deleted_file_retention_timestamp()?, + ); + + // Chain the result of create_checkpoint_metadata_batch to the checkpoint data + let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( + self.snapshot.version() as i64, + engine, + self.is_v2_checkpoints_supported, + )?); + + // Generate checkpoint path based on builder configuration + // Classic naming is required for V1 checkpoints and optional for V2 checkpoints + // let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { + // ParsedLogPath::new_classic_parquet_checkpoint( + // self.snapshot.table_root(), + // self.snapshot.version(), + // )? + // } else { + // ParsedLogPath::new_uuid_parquet_checkpoint( + // self.snapshot.table_root(), + // self.snapshot.version(), + // )? + // }; + + // Create the checkpoint data object + Ok(SingleFileCheckpointData { + path: Url::parse("todo://checkpoint_path").unwrap(), // TODO: Replace with actual path + data: Box::new(chained), + }) } /// Finalizes the checkpoint writing process @@ -188,6 +203,47 @@ impl CheckpointWriter { ) -> DeltaResult<()> { todo!("Implement finalize_checkpoint"); } + + /// Calculates the cutoff timestamp for deleted file cleanup. + /// + /// This function determines the minimum timestamp before which deleted files + /// will be permanently removed during VACUUM operations, based on the table's + /// deleted_file_retention_duration property. + /// + /// Returns the cutoff timestamp in milliseconds since epoch, matching + /// the remove action's deletion_timestamp format for comparison. + /// + /// The default retention period is 7 days, matching delta-spark's behavior. + fn deleted_file_retention_timestamp(&self) -> DeltaResult { + let retention_duration = self + .snapshot + .table_properties() + .deleted_file_retention_duration; + + deleted_file_retention_timestamp_with_time( + retention_duration, + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, + ) + } + + /// Prepares the iterator over actions for checkpoint creation + /// + /// This method is factored out to facilitate testing and returns an iterator + /// over all actions to be included in the checkpoint. + fn replay_for_checkpoint_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult, bool)>> + Send> { + let read_schema = get_checkpoint_read_schema(); + self.snapshot.log_segment().read_actions( + engine, + read_schema.clone(), + read_schema.clone(), + None, + ) + } } /// Builder for configuring and creating CheckpointWriter instances @@ -226,98 +282,17 @@ impl CheckpointBuilder { /// This method validates the configuration against table features and creates /// a [`CheckpointWriter`] for the appropriate checkpoint type. It performs protocol /// table feature checks to determine if v2Checkpoints are supported. - pub fn build(self, engine: &dyn Engine) -> DeltaResult { - let v2_checkpoints_supported = self + pub fn build(self) -> DeltaResult { + let is_v2_checkpoints_supported = self .snapshot .table_configuration() .is_v2_checkpoint_supported(); - // Create counters for tracking actions - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); - - // Create iterator over actions for checkpoint data - let checkpoint_data = checkpoint_actions_iter( - self.replay_for_checkpoint_data(engine)?, - total_actions_counter.clone(), - total_add_actions_counter.clone(), - self.deleted_file_retention_timestamp()?, - ); - - // Chain the result of create_checkpoint_metadata_batch to the checkpoint data - let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( - self.snapshot.version() as i64, - engine, - v2_checkpoints_supported, - )?); - - // Generate checkpoint path based on builder configuration - // Classic naming is required for V1 checkpoints and optional for V2 checkpoints - // let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { - // ParsedLogPath::new_classic_parquet_checkpoint( - // self.snapshot.table_root(), - // self.snapshot.version(), - // )? - // } else { - // ParsedLogPath::new_uuid_parquet_checkpoint( - // self.snapshot.table_root(), - // self.snapshot.version(), - // )? - // }; - Ok(CheckpointWriter::new( - Some(SingleFileCheckpointData { - data: Box::new(chained), - path: Url::parse("memory://test-table/_delta_log/checkpoint.parquet").unwrap(), - }), - total_actions_counter, - total_add_actions_counter, - self.snapshot.version() as i64, - 1, - self.snapshot.log_segment().log_root.clone(), + self.snapshot, + is_v2_checkpoints_supported, )) } - - /// Prepares the iterator over actions for checkpoint creation - /// - /// This method is factored out to facilitate testing and returns an iterator - /// over all actions to be included in the checkpoint. - fn replay_for_checkpoint_data( - &self, - engine: &dyn Engine, - ) -> DeltaResult, bool)>> + Send> { - let read_schema = get_checkpoint_read_schema(); - self.snapshot.log_segment().read_actions( - engine, - read_schema.clone(), - read_schema.clone(), - None, - ) - } - - /// Calculates the cutoff timestamp for deleted file cleanup. - /// - /// This function determines the minimum timestamp before which deleted files - /// will be permanently removed during VACUUM operations, based on the table's - /// deleted_file_retention_duration property. - /// - /// Returns the cutoff timestamp in milliseconds since epoch, matching - /// the remove action's deletion_timestamp format for comparison. - /// - /// The default retention period is 7 days, matching delta-spark's behavior. - pub(crate) fn deleted_file_retention_timestamp(&self) -> DeltaResult { - let retention_duration = self - .snapshot - .table_properties() - .deleted_file_retention_duration; - - deleted_file_retention_timestamp_with_time( - retention_duration, - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, - ) - } } /// Internal implementation with injectable time parameter for testing From 4416968208669712d5fdeced4e7384906bb99b96 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 19:17:28 -0700 Subject: [PATCH 090/176] rc --- kernel/src/checkpoint/log_replay.rs | 50 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 5567bd7cea..8078935282 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -18,9 +18,10 @@ //! //! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to //! the `scan/log_replay.rs` module. +use std::cell::RefCell; use std::collections::HashSet; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::{Arc, LazyLock}; +use std::rc::Rc; +use std::sync::LazyLock; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{ @@ -62,12 +63,13 @@ pub(crate) struct CheckpointLogReplayProcessor { /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - // Arc is necessary because the resulting iterator from process_actions_iter() - // is marked with 'Send + 'static', which requires all captured state to be thread-safe. - /// Counter for the total number of actions processed during log replay. - total_actions: Arc, - /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + // Rc> provides shared mutability for our counters, allowing both the + // iterator to update the values during processing and the caller to observe the final + // counts afterward. Note that this approach is not thread-safe and only works in + // single-threaded contexts, which means the iterator cannot be sent across thread + // boundaries (no Send trait). + total_actions: Rc>, + total_add_actions: Rc>, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -118,14 +120,10 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // We only require eventual consistency for the counters to read the final values - // after all actions have been processed, so we can use relaxed ordering. - self.total_actions.fetch_add( - visitor.total_file_actions + visitor.total_non_file_actions, - Ordering::Relaxed, - ); - self.total_add_actions - .fetch_add(visitor.total_add_actions, Ordering::Relaxed); + // Update the total actions and add actions counters + *self.total_actions.borrow_mut() += + visitor.total_file_actions + visitor.total_non_file_actions; + *self.total_add_actions.borrow_mut() += visitor.total_add_actions; // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -145,8 +143,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Rc>, + total_add_actions_counter: Rc>, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -171,11 +169,11 @@ impl CheckpointLogReplayProcessor { /// sorted by the order of the actions in the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( - action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + action_iter: impl Iterator, bool)>>, + total_actions_counter: Rc>, + total_add_actions_counter: Rc>, minimum_file_retention_timestamp: i64, -) -> impl Iterator> + Send + 'static { +) -> impl Iterator> { CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, @@ -763,8 +761,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + let total_actions_counter = Rc::new(RefCell::new(0)); + let total_add_actions_counter = Rc::new(RefCell::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -822,9 +820,9 @@ mod tests { ); // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); + assert_eq!(*total_actions_counter.borrow(), 6); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + assert_eq!(*total_add_actions_counter.borrow(), 3); Ok(()) } From 4ceaa500764d352637d802f035858aaa6b465aae Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 6 Apr 2025 23:48:59 -0700 Subject: [PATCH 091/176] update --- kernel/src/checkpoint/log_replay.rs | 6 +- kernel/src/checkpoint/mod.rs | 264 ++++++++++++++++------------ kernel/src/lib.rs | 2 +- 3 files changed, 157 insertions(+), 115 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 8078935282..f9fc357fb8 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -37,12 +37,12 @@ use crate::{DeltaResult, EngineData, Error}; /// [`CheckpointData`] represents a batch of actions filtered for checkpoint creation. /// It wraps a single engine data batch and a corresponding selection vector indicating /// which rows should be written to the checkpoint file. -pub(crate) struct CheckpointData { +pub struct CheckpointData { /// The original engine data containing the actions #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented - data: Box, + pub(crate) data: Box, /// Boolean vector indicating which rows should be included in the checkpoint - selection_vector: Vec, + pub(crate) selection_vector: Vec, } impl HasSelectionVector for CheckpointData { diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 9e13181fa4..d0a0535416 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -14,17 +14,36 @@ //! //! ## Architecture //! - -//! The API follows a builder pattern using `CheckpointBuilder`, which performs table feature -//! detection and configuration validation. Depending on table features and builder options: -//! - Without `v2Checkpoints`: produces a **Classic-named V1** checkpoint. -//! - With `v2Checkpoints`: produces a **UUID-named V2** checkpoint. -//! - With `v2Checkpoints` + `.classic_naming()`: produces a **Classic-named V2** checkpoint. +//! ### [`CheckpointBuilder`] +//! The entry point for checkpoint creation with the following methods: +//! - `new(snapshot: Snapshot) -> Self` - Creates a new builder for the given table snapshot +//! - `with_classic_naming() -> Self` - Configures the builder to use classic naming +//! - `build() -> DeltaResult` - Creates the checkpoint writer +//! +//! ### [`CheckpointWriter`] +//! Handles the actual checkpoint generation with the following methods: +//! - `get_checkpoint_info(engine: &dyn Engine) -> DeltaResult` - +//! Retrieves checkpoint data and path +//! - `finalize_checkpoint(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - +//! Writes the _last_checkpoint file +//! +//! ## Checkpoint Type Selection //! -//! The builder returns the `CheckpointWriter` which is responsible for: -//! - Producing the correct set of actions to be written to the checkpoint file when -//! `.get_checkpoint_info()` is called. -//! - Writing the _last_checkpoint file when `.finalize_checkpoint()` is called. +//! The checkpoint type is determined by two factors: +//! 1. Whether the table supports the `v2Checkpoints` reader/writer feature +//! 2. Whether classic naming is configured on the builder +//! +//! ```text +//! +------------------+---------------------------+-------------------------------+ +//! | Table Feature | Builder Configuration | Resulting Checkpoint Type | +//! +==================+===========================+===============================+ +//! | No v2Checkpoints | Any | Single-file Classic-named V1 | +//! +------------------+---------------------------+-------------------------------+ +//! | v2Checkpoints | with_classic_naming(false)| Single-file UUID-named V2 | +//! +------------------+---------------------------+-------------------------------+ +//! | v2Checkpoints | with_classic_naming(true) | Single-file Classic-named V2 | +//! +------------------+---------------------------+-------------------------------+ +//! ``` //! //! Notes: //! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future @@ -61,29 +80,29 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. - -use log_replay::{checkpoint_actions_iter, CheckpointData}; -use std::{ - sync::{atomic::AtomicI64, Arc, LazyLock}, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; -use url::Url; - -use crate::actions::schemas::GetStructField; -use crate::schema::{SchemaRef, StructType}; use crate::{ actions::{ - Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, - PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + schemas::{GetStructField, ToSchema}, + Add, CheckpointMetadata, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, + METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }, + expressions::Scalar, + schema::{SchemaRef, StructType}, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, + DeltaResult, Engine, EngineData, Error, ExpressionHandlerExtension, +}; +use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::{ + cell::RefCell, + rc::Rc, + sync::{Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; +use url::Url; mod log_replay; -/// This schema contains all the actions that we care to extract from log -/// files for the purpose of creating a checkpoint. +/// Schema for extracting relevant actions from log files during checkpoint creation static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ Option::::get_struct_field(ADD_NAME), @@ -96,40 +115,40 @@ static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { .into() }); -/// This schema is used when reading actions from the Delta log -/// to ensure we capture all necessary action types. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -#[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] +/// Returns the schema for reading Delta log actions during checkpoint creation fn get_checkpoint_read_schema() -> &'static SchemaRef { &CHECKPOINT_READ_SCHEMA } -/// Contains the path and data for a single-file checkpoint. -/// -/// This struct holds all the necessary information for writing a checkpoint file, -/// including the destination path and the iterator over checkpoint actions. +/// Contains the path and data for a single-file checkpoint pub struct SingleFileCheckpointData { - /// The target URL where the checkpoint file will be written + /// Target URL where the checkpoint file will be written pub path: Url, /// Iterator over checkpoint actions to be written to the file pub data: Box>>, } -/// Writer for creating checkpoint files in Delta tables. +/// Manages the checkpoint writing process for Delta tables /// -/// The CheckpointWriter orchestrates the process of writing checkpoint data to storage. -/// It manages the one-time consumption of checkpoint data and tracks statistics -/// about the actions included in the checkpoint. +/// The [`CheckpointWriter`] orchestrates creating checkpoint data and finalizing +/// the checkpoint file. It tracks statistics about included actions and +/// ensures checkpoint data is consumed only once. pub struct CheckpointWriter { - // The snapshot from which the checkpoint is created + /// The snapshot from which the checkpoint is created snapshot: Snapshot, - // Flag indicating if the table supports the `v2Checkpoints` reader/write feature + + /// Whether the table supports the `v2Checkpoints` feature is_v2_checkpoints_supported: bool, - // TODO, i dont think arc is necessary here - total_actions_counter: Arc, - total_add_actions_counter: Arc, + /// Counter for total actions included in the checkpoint + total_actions_counter: Rc>, + + /// Counter for Add actions included in the checkpoint + total_add_actions_counter: Rc>, + + /// Flag to track if checkpoint data has been consumed + data_consumed: bool, } impl CheckpointWriter { @@ -138,70 +157,81 @@ impl CheckpointWriter { Self { snapshot, is_v2_checkpoints_supported, - total_actions_counter: Arc::new(AtomicI64::new(0)), - total_add_actions_counter: Arc::new(AtomicI64::new(0)), + total_actions_counter: Rc::new(RefCell::::new(0.into())), + total_add_actions_counter: Rc::new(RefCell::::new(0.into())), + data_consumed: false, } } /// Retrieves the checkpoint data and path information /// - /// This method takes ownership of the checkpoint data, ensuring it can - /// only be consumed once. It returns an error if the data has already - /// been consumed. + /// This method is the core of the checkpoint generation process. It: + /// + /// 1. Ensures checkpoint data is consumed only once via `data_consumed` flag + /// 2. Reads actions from the log segment using the checkpoint schema + /// 3. Filters and deduplicates actions for the checkpoint + /// 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint + /// (i.e., if `v2Checkpoints` feature is supported by table) + /// 5. Generates the appropriate checkpoint path + /// + /// The returned data should be written to persistent storage by the caller + /// before calling `finalize_checkpoint()` otherwise data loss may occur. + /// + /// # Returns + /// A [`SingleFileCheckpointData`] containing the checkpoint path and action iterator pub fn get_checkpoint_info( &mut self, engine: &dyn Engine, ) -> DeltaResult { - // Create counters for tracking actions - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + if self.data_consumed { + return Err(Error::generic("Checkpoint data has already been consumed")); + } // Create iterator over actions for checkpoint data let checkpoint_data = checkpoint_actions_iter( self.replay_for_checkpoint_data(engine)?, - total_actions_counter.clone(), - total_add_actions_counter.clone(), + self.total_actions_counter.clone(), + self.total_add_actions_counter.clone(), self.deleted_file_retention_timestamp()?, ); - // Chain the result of create_checkpoint_metadata_batch to the checkpoint data + // Chain the checkpoint metadata action if using V2 checkpoints let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( self.snapshot.version() as i64, engine, self.is_v2_checkpoints_supported, )?); - // Generate checkpoint path based on builder configuration - // Classic naming is required for V1 checkpoints and optional for V2 checkpoints - // let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { - // ParsedLogPath::new_classic_parquet_checkpoint( - // self.snapshot.table_root(), - // self.snapshot.version(), - // )? - // } else { - // ParsedLogPath::new_uuid_parquet_checkpoint( - // self.snapshot.table_root(), - // self.snapshot.version(), - // )? - // }; - - // Create the checkpoint data object + // Generate the appropriate checkpoint path + // let checkpoint_path = self.generate_checkpoint_path()?; + + self.data_consumed = true; + Ok(SingleFileCheckpointData { - path: Url::parse("todo://checkpoint_path").unwrap(), // TODO: Replace with actual path + path: Url::parse("todo!")?, data: Box::new(chained), }) } - /// Finalizes the checkpoint writing process + /// Finalizes the checkpoint writing process by creating the _last_checkpoint file + /// + /// The `LastCheckpointInfo` (`_last_checkpoint`) file is a metadata file that contains + /// information about the last checkpoint created for the table. It is used as a hint + /// for the engine to quickly locate the last checkpoint and avoid full log replay when + /// reading the table. /// - /// This method should be only called AFTER writing all checkpoint data to - /// ensure proper completion of the checkpoint operation. This method - /// generates the `_last_checkpoint` file with metadata about the checkpoint. + /// # Workflow + /// 0. IMPORTANT: This method must only be called AFTER successfully writing + /// all checkpoint data to storage. Failure to do so may result in + /// data loss. + /// 1. Extracts size information from the provided metadata + /// 2. Combines with additional metadata collected during checkpoint creation + /// 3. Writes the _last_checkpoint file to the log /// - /// The metadata parameter is a single-row EngineData batch containing - /// {size_in_bytes: i64} for the checkpoint file. This method will extend - /// the EngineData batch with the remaining fields for the `_last_checkpoint` - /// file. + /// # Parameters + /// - `engine`: The engine used for writing the _last_checkpoint file + /// - `metadata`: A single-row [`EngineData`] batch containing: + /// - `size_in_bytes` (i64): The size of the written checkpoint file #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented fn finalize_checkpoint( self, @@ -235,10 +265,16 @@ impl CheckpointWriter { ) } - /// Prepares the iterator over actions for checkpoint creation + /// Retrieves an iterator over all actions to be included in the checkpoint /// - /// This method is factored out to facilitate testing and returns an iterator - /// over all actions to be included in the checkpoint. + /// This method reads the relevant actions from the table's log segment using + /// the checkpoint schema, which filters for action types needed in checkpoints. + /// + /// The returned iterator yields tuples where: + /// - The first element is data in engine format + /// - The second element is a flag that indicates the action's source: + /// - `true` if the action came from a commit file + /// - `false` if the action came from a previous checkpoint file fn replay_for_checkpoint_data( &self, engine: &dyn Engine, @@ -279,8 +315,8 @@ impl CheckpointBuilder { /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. /// - For V1 checkpoints, this method is a no-op. /// - For V2 checkpoints, the default is UUID naming unless this method is called. - pub fn with_classic_naming(mut self, with_classic_naming: bool) -> Self { - self.with_classic_naming = with_classic_naming; + pub fn with_classic_naming(mut self) -> Self { + self.with_classic_naming = true; self } @@ -325,37 +361,43 @@ fn deleted_file_retention_timestamp_with_time( // Simple subtraction - will produce negative values if retention > now Ok(now_ms - retention_ms) } - -/// Create a batch with a single row containing the [`CheckpointMetadata`] action -/// for the V2 spec checkpoint. +/// Creates the checkpoint metadata action for V2 checkpoints. /// -/// This method calls the create_one method on the expression handler to create -/// a single-row batch with the checkpoint metadata action. The method returns: -/// - None if the checkpoint is not a V2 checkpoint -/// - Some(Ok(batch)) if the batch was successfully created +/// For V2 checkpoints, this function generates a special [`CheckpointMetadata`] action +/// that must be included in the V2 spec checkpoint file. This action contains metadata +/// about the checkpoint, particularly its version. For V1 checkpoints, this function +/// returns `None`, as the V1 schema does not include this action type. +/// +/// # Implementation Details +/// +/// The function creates a single-row [`EngineData`] batch containing only the +/// version field of the [`CheckpointMetadata`] action. Future implementations will +/// include additional metadata fields such as tags when map support is added. +/// +/// The resulting [`CheckpointData`] includes a selection vector with a single `true` +/// value, indicating this action should always be included in the checkpoint. fn create_checkpoint_metadata_batch( - _version: i64, - _engine: &dyn Engine, - _is_v2_checkpoint: bool, + version: i64, + engine: &dyn Engine, + is_v2_checkpoint: bool, ) -> DeltaResult>> { - todo!("Implement create_checkpoint_metadata_batch"); - // if is_v2_checkpoint { - // let values: &[Scalar] = &[version.into()]; - // let checkpoint_metadata_batch = engine.get_expression_handler().create_one( - // // TODO: Include checkpointMetadata.tags when maps are supported - // Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?), - // &values, - // )?; - - // let result = CheckpointData { - // data: checkpoint_metadata_batch, - // selection_vector: vec![true], - // }; - - // Ok(Some(Ok(result))) - // } else { - // Ok(None) - // } + if is_v2_checkpoint { + let values: &[Scalar] = &[version.into()]; + let checkpoint_metadata_batch = engine.get_expression_handler().create_one( + // TODO: Include checkpointMetadata.tags when maps are supported + Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?), + &values, + )?; + + let result = CheckpointData { + data: checkpoint_metadata_batch, + selection_vector: vec![true], + }; + + Ok(Some(Ok(result))) + } else { + Ok(None) + } } #[cfg(test)] diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 9851aeb03b..8a16a1edc5 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -386,7 +386,7 @@ trait ExpressionHandlerExtension: ExpressionHandler { } // Auto-implement the extension trait for all ExpressionHandlers -impl ExpressionHandlerExtension for T {} +impl ExpressionHandlerExtension for T {} /// Provides file system related functionalities to Delta Kernel. /// From 20fe7fefdb87821f282b464537f30f9e1bc4423f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 10:44:31 -0700 Subject: [PATCH 092/176] unignore --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index aa6fc48432..af389f91d0 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -32,7 +32,7 @@ //! //! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! -//! ```ignore +//! ```ignore (TODO!(seb) un-ignore) //! let path = "./tests/data/app-txn-no-checkpoint"; //! let engine = Arc::new(SyncEngine::new()); //! let table = Table::try_from_uri(path)?; From 29489d7f14986bae95e593e649b95d3af9efee36 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 12:05:37 -0700 Subject: [PATCH 093/176] fix docs --- kernel/src/checkpoint/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index af389f91d0..90540351d0 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -32,7 +32,8 @@ //! //! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! -//! ```ignore (TODO!(seb) un-ignore) +//! TODO(seb): unignore example +//! ```ignore //! let path = "./tests/data/app-txn-no-checkpoint"; //! let engine = Arc::new(SyncEngine::new()); //! let table = Table::try_from_uri(path)?; From 5ccde93ef889cbab67318db0311a4c0aa1215b01 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 12:29:40 -0700 Subject: [PATCH 094/176] oops --- kernel/src/log_replay.rs | 6 +++--- kernel/src/scan/log_replay.rs | 7 ------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index abb451c01c..cc760a1527 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -239,7 +239,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Processes a batch of actions and returns the filtered results. /// /// # Arguments - /// - `actions_batch` - A reference to an [`EngineData`] instance representing a batch of actions. + /// - `actions_batch` - A boxed [`EngineData`] instance representing a batch of actions. /// - `is_log_batch` - `true` if the batch originates from a commit log, `false` if from a checkpoint. /// /// Returns a [`DeltaResult`] containing the processor’s output, which includes only selected actions. @@ -247,7 +247,7 @@ pub(crate) trait LogReplayProcessor: Sized { /// Note: Since log replay is stateful, processing may update internal processor state (e.g., deduplication sets). fn process_actions_batch( &mut self, - actions_batch: &dyn EngineData, + actions_batch: Box, is_log_batch: bool, ) -> DeltaResult; @@ -264,7 +264,7 @@ pub(crate) trait LogReplayProcessor: Sized { action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - self.process_actions_batch(batch.as_ref(), is_log_batch) + self.process_actions_batch(batch, is_log_batch) }) .filter(|res| { // TODO: Leverage .is_none_or() when msrv = 1.82 diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a1faabc068..9712b5dad4 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -339,10 +339,6 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayProcessor for ScanLogReplayProcessor { - type Output = ScanData; - - fn process_actions_batch( impl LogReplayProcessor for ScanLogReplayProcessor { type Output = ScanData; @@ -358,13 +354,10 @@ impl LogReplayProcessor for ScanLogReplayProcessor { assert_eq!(selection_vector.len(), actions_batch.len()); let mut visitor = AddRemoveDedupVisitor::new( - &mut self.seen_file_keys, &mut self.seen_file_keys, selection_vector, self.logical_schema.clone(), self.transform.clone(), - self.logical_schema.clone(), - self.transform.clone(), self.partition_filter.clone(), is_log_batch, ); From 00c834b1b80bb0ba75ed64eb1e1353c72d4262eb Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 13:21:35 -0700 Subject: [PATCH 095/176] docs --- kernel/src/checkpoint/log_replay.rs | 40 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 8078935282..176b2b029a 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,23 +1,25 @@ -//! This module implements log replay functionality specifically for checkpoint writes in delta tables. +//! [`CheckpointLogReplayProcessor`] performs log replay specifically for checkpoint creation in delta tables. //! -//! The primary goal is to process Delta log actions in reverse chronological order (from most recent to -//! least recent) to produce the minimal set of actions required to reconstruct the table state in a checkpoint. +//! During checkpoint creation, the processor reads batches of log actions (in reverse chronological order) +//! and performs the following steps: //! -//! ## Key Responsibilities -//! - Filtering: Only the most recent protocol and metadata actions are retained, and for each transaction -//! (identified by its app ID), only the latest action is kept. -//! - Deduplication: File actions are deduplicated based on file path and deletion vector unique ID so that -//! duplicate or obsolete actions (including remove actions) are ignored. -//! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. +//! - Protocol and Metadata Filtering: Ensures that only the most recent protocol and metadata actions +//! are retained +//! - Transaction Deduplication: For each transaction (identified by app ID), only the latest action +//! is preserved to maintain a consistent transaction history. +//! - File Action Deduplication: Leverages the [`FileActionDeduplicator`] mechanism to ensure that +//! for each unique file (identified by its path and deletion vector unique ID), only the most +//! recent valid action is included. +//! - Tombstone Retention Management: Excludes file removal tombstones that are older than the +//! configured `minimum_file_retention_timestamp`, reducing checkpoint size without compromising +//! table consistency. +//! - Action Type Filtering: Excludes other action types such as commitInfo, and CDC actions that +//! aren't required for reconstructing table state. //! -//! The module defines the [`CheckpointLogReplayProcessor`] which implements the LogReplayProcessor trait, -//! as well as a [`CheckpointVisitor`] to traverse and process batches of log actions. -//! -//! The processing result is encapsulated in [`CheckpointData`], which includes the log data accompanied with -//! a selection vector indicating which rows should be included in the checkpoint file. -//! -//! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to -//! the `scan/log_replay.rs` module. +//! As an implementation of [`LogReplayProcessor`], [`CheckpointLogReplayProcessor`] provides the +//! `process_actions_batch` method, which applies these steps to each batch of log actions and +//! produces a [`CheckpointData`] result. This result encapsulates both the original batch data +//! and a selection vector indicating which rows should be included in the checkpoint file. use std::cell::RefCell; use std::collections::HashSet; use std::rc::Rc; @@ -56,7 +58,7 @@ impl HasSelectionVector for CheckpointData { /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. /// /// It processes each action batch via the `process_actions_batch` method, using the -/// [`CheckpointVisitor`] to convert each [`EngineData`] batch into a [`CheckpointData`] +/// [`CheckpointVisitor`] to map each [`EngineData`] batch into a [`CheckpointData`] /// instance that reflect only the necessary actions for the checkpoint. pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. @@ -167,7 +169,7 @@ impl CheckpointLogReplayProcessor { /// /// Note: The 'action_iter' parameter is an iterator of (engine_data, bool) tuples that _must_ be /// sorted by the order of the actions in the log from most recent to least recent. -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +#[allow(unused)] // TODO: Remove once API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>>, total_actions_counter: Rc>, From 3c11320e65a794860ffaccf5a4c2bfe70c6ede89 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 13:39:05 -0700 Subject: [PATCH 096/176] clean up doc & test --- kernel/src/checkpoint/log_replay.rs | 36 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 176b2b029a..2160f866ef 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -757,11 +757,12 @@ mod tests { Ok(()) } - /// Tests the end-to-end processing of multiple batches with various action types. - /// This tests the integration of the visitors with the main iterator function. - /// More granular testing is performed in the visitor tests. + /// Tests the [`CheckpointLogReplayProcessor`] by applying the processor across + /// multiple batches of actions. This test ensures that the processor correctly saves state + /// in order to deduplicate actions across batches. More granular tests for the + /// [`CheckpointVisitor`] are in the above `test_checkpoint_visitor` tests. #[test] - fn test_v1_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { + fn test_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters let total_actions_counter = Rc::new(RefCell::new(0)); let total_add_actions_counter = Rc::new(RefCell::new(0)); @@ -770,28 +771,31 @@ mod tests { let json_strings1: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, ].into(); // Create second batch with some duplicates and new files let json_strings2: StringArray = vec![ - // Protocol and metadata should be skipped as duplicates + // Protocol, metadata, txn should be skipped as duplicates r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - // New files + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + // New file r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, // Duplicate file should be skipped r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# + // Unique transaction (appId) should be included + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"# ].into(); // Create third batch with all duplicate actions. - // The entire batch should be skippped as there are no selected actions to write from this batch. + // The *entire* batch should be skippped as there are no selected actions to write from this batch. let json_strings3: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, ].into(); let input_batches = vec![ - Ok((parse_json_batch(json_strings1), true)), + Ok((parse_json_batch(json_strings1), true)), // true = commit batch Ok((parse_json_batch(json_strings2), true)), Ok((parse_json_batch(json_strings3), true)), ]; @@ -804,25 +808,23 @@ mod tests { ) .try_collect()?; - // Expect two batches in results (third batch should be filtered out)" + // Expect two batches in results (third batch should be filtered out) assert_eq!(results.len(), 2); // First batch should have all rows selected let checkpoint_data = &results[0]; assert_eq!( checkpoint_data.selection_vector, - vec![true, true, true, true] + vec![true, true, true, true, true] ); - - // Second batch should have only new file and transaction selected + // Second batch should have only new file and unique transaction selected let checkpoint_data = &results[1]; assert_eq!( checkpoint_data.selection_vector, - vec![false, false, true, false, true] + vec![false, false, false, true, false, true] ); - - // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(*total_actions_counter.borrow(), 6); + // 6 total actions (5 from batch1 + 2 from batch2 + 0 from batch3) + assert_eq!(*total_actions_counter.borrow(), 7); // 3 add actions (2 from batch1 + 1 from batch2) assert_eq!(*total_add_actions_counter.borrow(), 3); From 4f61757ed89c141b197c28cdf05685ced722d9f7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 13:47:17 -0700 Subject: [PATCH 097/176] clean up docs --- kernel/src/checkpoint/log_replay.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 2160f866ef..ff1d28418f 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -16,10 +16,17 @@ //! - Action Type Filtering: Excludes other action types such as commitInfo, and CDC actions that //! aren't required for reconstructing table state. //! +//! The [`CheckpointVisitor`] implements the visitor pattern to efficiently apply these filtering +//! rules to each action in the batch, determining which should be included in the checkpoint file. +//! It handles deduplication of file actions, expiration of remove tombstones, and filtering of +//! non-file actions (protocol, metadata, transaction) while excluding unnecessary action types. +//! //! As an implementation of [`LogReplayProcessor`], [`CheckpointLogReplayProcessor`] provides the //! `process_actions_batch` method, which applies these steps to each batch of log actions and //! produces a [`CheckpointData`] result. This result encapsulates both the original batch data //! and a selection vector indicating which rows should be included in the checkpoint file. +//! The [`CheckpointVisitor`] is applied within the `process_actions_batch` method to determine +//! which rows to include by filtering protocol, metadata, transaction, and file actions. use std::cell::RefCell; use std::collections::HashSet; use std::rc::Rc; From fdd4f68913c0eb8e6ef89ebd81456c4fa9b9de05 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 13:48:14 -0700 Subject: [PATCH 098/176] update docs --- kernel/src/checkpoint/log_replay.rs | 42 +++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index f0ba1147d7..ae632a4474 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,24 +1,32 @@ -//! This module implements log replay functionality specifically for checkpoint writes in delta tables. +//! `CheckpointLogReplayProcessor` performs log replay specifically for checkpoint creation in delta tables. //! -//! The primary goal is to process Delta log actions in reverse chronological order (from most recent to -//! oldest) to produce the minimal set of actions required to reconstruct the table state in a checkpoint. +//! During checkpoint creation, the processor reads batches of log actions (in reverse chronological order) +//! and performs the following steps: //! -//! ## Key Responsibilities -//! - Filtering: Only the most recent protocol and metadata actions are retained, and for each transaction -//! (identified by its app ID), only the latest action is kept. -//! - Deduplication: File actions are deduplicated based on file path and deletion vector unique ID so that -//! duplicate or obsolete actions (including remove actions) are ignored. -//! - Retention Filtering: Tombstones older than the configured `minimum_file_retention_timestamp` are excluded. +//! - Protocol and Metadata Filtering: Ensures that only the most recent protocol and metadata actions +//! are retained +//! - Transaction Deduplication: For each transaction (identified by app ID), only the latest action +//! is preserved to maintain a consistent transaction history. +//! - File Action Deduplication: Leverages the [`FileActionDeduplicator`] mechanism to ensure that +//! for each unique file (identified by its path and deletion vector unique ID), only the most +//! recent valid action is included. +//! - Tombstone Retention Management: Excludes file removal tombstones that are older than the +//! configured `minimum_file_retention_timestamp`, reducing checkpoint size without compromising +//! table consistency. +//! - Action Type Filtering: Excludes other action types such as commitInfo, and CDC actions that +//! aren't required for reconstructing table state. //! -//! TODO: `CheckpointLogReplayProcessor` struct & `CheckpointData` type -//! The module defines the CheckpointLogReplayProcessor which implements the `LogReplayProcessor` trait, -//! as well as a [`CheckpointVisitor`] to traverse and process batches of log actions. +//! The [`CheckpointVisitor`] implements the visitor pattern to efficiently apply these filtering +//! rules to each action in the batch, determining which should be included in the checkpoint file. +//! It handles deduplication of file actions, expiration of remove tombstones, and filtering of +//! non-file actions (protocol, metadata, transaction) while excluding unnecessary action types. //! -//! The processing result is encapsulated in `CheckpointData`, which includes the log data accompanied with -//! a selection vector indicating which rows should be included in the checkpoint file. -//! -//! For log replay functionality used during table scans (i.e. for reading checkpoints and commit logs), refer to -//! the `scan/log_replay.rs` module. +//! As an implementation of `LogReplayProcessor`, `CheckpointLogReplayProcessor` provides the +//! `process_actions_batch` method, which applies these steps to each batch of log actions and +//! produces a `CheckpointData` result. This result encapsulates both the original batch data +//! and a selection vector indicating which rows should be included in the checkpoint file. +//! The [`CheckpointVisitor`] is applied within the `process_actions_batch` method to determine +//! which rows to include by filtering protocol, metadata, transaction, and file actions. use std::collections::HashSet; use std::sync::LazyLock; From 72bb446328ee3f84fdf7ead51d8bf352ba8b0f97 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 13:58:56 -0700 Subject: [PATCH 099/176] merge fix --- kernel/src/checkpoint/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index fb42d833d2..d051f08679 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -90,7 +90,7 @@ use crate::{ expressions::Scalar, schema::{SchemaRef, StructType}, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, ExpressionHandlerExtension, + DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, }; use log_replay::{checkpoint_actions_iter, CheckpointData}; use std::{ @@ -384,7 +384,7 @@ fn create_checkpoint_metadata_batch( ) -> DeltaResult>> { if is_v2_checkpoint { let values: &[Scalar] = &[version.into()]; - let checkpoint_metadata_batch = engine.get_expression_handler().create_one( + let checkpoint_metadata_batch = engine.evaluation_handler().create_one( // TODO: Include checkpointMetadata.tags when maps are supported Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?), &values, From e2ceee38fda5e23f6ba38a96b2406924ce0dd968 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 14:26:12 -0700 Subject: [PATCH 100/176] docs --- kernel/src/checkpoint/mod.rs | 57 ++++++++++++++++-- kernel/src/table_configuration.rs | 4 +- .../_delta_log/.00000000000000000002.json.crc | Bin 0 -> 20 bytes 3 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index d051f08679..751fd79398 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -135,19 +135,31 @@ pub struct SingleFileCheckpointData { /// The [`CheckpointWriter`] orchestrates creating checkpoint data and finalizing /// the checkpoint file. It tracks statistics about included actions and /// ensures checkpoint data is consumed only once. +/// +/// # Usage Flow +/// 1. Create via `CheckpointBuilder::build()` +/// 2. Call `get_checkpoint_info()` to obtain the [`SingleFileCheckpointData`] +/// containing the path and action iterator for the checkpoint +/// 3. Write the checkpoint data to storage (implementation-specific) +/// 4. Call `finalize_checkpoint()` to create the _last_checkpoint file +/// +/// # Internal Process +/// 1. Reads relevant actions from the log segment using the checkpoint read schema +/// 2. Applies selection and deduplication logic with the `CheckpointLogReplayProcessor` +/// 3. Tracks counts of included actions for to be written to the _last_checkpoint file +/// 5. Chains the [`CheckpointMetadata`] action to the actions iterator (for V2 checkpoints) pub struct CheckpointWriter { /// The snapshot from which the checkpoint is created snapshot: Snapshot, - /// Whether the table supports the `v2Checkpoints` feature is_v2_checkpoints_supported: bool, - + /// Note: Rc> provides shared mutability for our counters, allowing the + /// returned actions iterator from `.get_checkpoint_info()` to update the counters, + /// and the `finalize_checkpoint()` method to read them... /// Counter for total actions included in the checkpoint total_actions_counter: Rc>, - /// Counter for Add actions included in the checkpoint total_add_actions_counter: Rc>, - /// Flag to track if checkpoint data has been consumed data_consumed: bool, } @@ -295,6 +307,39 @@ impl CheckpointWriter { /// The CheckpointBuilder provides an interface for configuring checkpoint /// generation. It handles table feature detection and enforces compatibility /// between configuration options and table features. +/// +/// # Usage Flow +/// 1. Create a builder via `Table::checkpoint()` +/// 2. Optionally configure with `with_classic_naming()` +/// 3. Call `build()` to create a CheckpointWriter +/// +/// # Checkpoint Format Selection Logic +/// - For tables without v2Checkpoints support: Always uses Single-file Classic-named V1 +/// - For tables with v2Checkpoints support: +/// - With classic naming = false (default): Single-file UUID-named V2 +/// - With classic naming = true: Single-file Classic-named V2 +/// +/// # Checkpoint Naming Conventions +/// +/// ## UUID-named V2 Checkpoints +/// These follow the V2 spec using file name pattern: `n.checkpoint.u.{json/parquet}`, where: +/// - `n` is the snapshot version (zero-padded to 20 digits) +/// - `u` is a UUID +/// e.g. 00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json +/// +/// ## Classic-named Checkpoints +/// A classic checkpoint for version `n` of the table consists of a file named +/// `n.checkpoint.parquet` where `n` is zero-padded to have length 20. These could +/// follow either V1 spec or V2 spec depending on the table's support for the +/// `v2Checkpoints` feature. +/// e.g. 00000000000000000010.checkpoint.parquet +/// +/// # Example +/// ```ignore +/// let table = Table::try_from_uri(path)?; +/// let builder = table.checkpoint(&engine, Some(version))?; +/// let writer = builder.with_classic_naming().build()?; +/// ``` pub struct CheckpointBuilder { /// The table snapshot from which to create the checkpoint snapshot: Snapshot, @@ -313,9 +358,9 @@ impl CheckpointBuilder { /// Configures the builder to use the classic naming scheme /// - /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. + /// Classic naming is optional for V2 checkpoints, but the only option for V1 checkpoints. /// - For V1 checkpoints, this method is a no-op. - /// - For V2 checkpoints, the default is UUID naming unless this method is called. + /// - For V2 checkpoints, the default is UUID-naming unless this method is called. pub fn with_classic_naming(mut self) -> Self { self.with_classic_naming = true; self diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index 07fe745c7e..49d5ecfbda 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -249,11 +249,11 @@ impl TableConfiguration { pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { let read_supported = self .protocol() - .has_reader_feature(&ReaderFeatures::V2Checkpoint) + .has_reader_feature(&ReaderFeature::V2Checkpoint) && self.protocol.min_reader_version() == 3; let write_supported = self .protocol() - .has_writer_feature(&WriterFeatures::V2Checkpoint) + .has_writer_feature(&WriterFeature::V2Checkpoint) && self.protocol.min_writer_version() == 7; read_supported && write_supported } diff --git a/kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..f1a6ea85db288e6a905667cead26a14f9cd38b6a GIT binary patch literal 20 bcmYc;N@ieSU}Es(x>#r#q1~;dcw!v@GG_(Q literal 0 HcmV?d00001 From 1a5fcb454aa4d61dc0c9f364aa83ae598618cad7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 17:52:21 -0700 Subject: [PATCH 101/176] remove checkpoint builder --- kernel/src/checkpoint/mod.rs | 253 +++++++++++++++-------------------- kernel/src/table.rs | 33 ++--- 2 files changed, 120 insertions(+), 166 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 751fd79398..70b2fc1535 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -2,50 +2,44 @@ //! //! This module implements the API for writing checkpoints in delta tables. //! Checkpoints provide a compact summary of the table state, enabling faster recovery by -//! avoiding full log replay. This API supports three checkpoint types: +//! avoiding full log replay. This API supports two checkpoint types: //! -//! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support -//! the `v2Checkpoints` reader/writer feature. -//! 2. **Single-file Classic-named V2 Checkpoint** – ensures backwards compatibility by -//! allowing legacy readers to recognize the checkpoint file, read the protocol action, and -//! fail gracefully. -//! 3. **Single-file UUID-named V2 Checkpoint** – the default and preferred option for small to -//! medium tables with `v2Checkpoints` reader/writer feature enabled. +//! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support the +//! `v2Checkpoints` reader/writer feature. These checkpoints follow the V1 specification and do not +//! include a CheckpointMetadata action. +//! 2. **Single-file Classic-named V2 Checkpoint** – for tables supporting the `v2Checkpoints` feature. +//! These checkpoints follow the V2 specification and include a CheckpointMetadata action, while +//! maintaining backwards compatibility by using classic naming that legacy readers can recognize. //! -//! ## Architecture -//! -//! ### [`CheckpointBuilder`] -//! The entry point for checkpoint creation with the following methods: -//! - `new(snapshot: Snapshot) -> Self` - Creates a new builder for the given table snapshot -//! - `with_classic_naming() -> Self` - Configures the builder to use classic naming -//! - `build() -> DeltaResult` - Creates the checkpoint writer +//! For more information on the V1/V2 specifications, see the following protocol section: +//! https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-specs //! //! ### [`CheckpointWriter`] -//! Handles the actual checkpoint generation with the following methods: +//! Handles the actual checkpoint data generation and writing process. It is created via the +//! [`Table::checkpoint()`] method and provides the following methods: +//! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot //! - `get_checkpoint_info(engine: &dyn Engine) -> DeltaResult` - -//! Retrieves checkpoint data and path +//! Returns the checkpoint data and path information //! - `finalize_checkpoint(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - -//! Writes the _last_checkpoint file +//! Writes the _last_checkpoint file after the checkpoint data has been written //! //! ## Checkpoint Type Selection //! -//! The checkpoint type is determined by two factors: -//! 1. Whether the table supports the `v2Checkpoints` reader/writer feature -//! 2. Whether classic naming is configured on the builder +//! The checkpoint type is determined by whether the table supports the `v2Checkpoints` reader/writer feature: //! //! ```text -//! +------------------+---------------------------+-------------------------------+ -//! | Table Feature | Builder Configuration | Resulting Checkpoint Type | -//! +==================+===========================+===============================+ -//! | No v2Checkpoints | Any | Single-file Classic-named V1 | -//! +------------------+---------------------------+-------------------------------+ -//! | v2Checkpoints | with_classic_naming(false)| Single-file UUID-named V2 | -//! +------------------+---------------------------+-------------------------------+ -//! | v2Checkpoints | with_classic_naming(true) | Single-file Classic-named V2 | -//! +------------------+---------------------------+-------------------------------+ +//! +------------------+-------------------------------+ +//! | Table Feature | Resulting Checkpoint Type | +//! +==================+===============================+ +//! | No v2Checkpoints | Single-file Classic-named V1 | +//! +------------------+-------------------------------+ +//! | v2Checkpoints | Single-file Classic-named V2 | +//! +------------------+-------------------------------+ //! ``` //! //! Notes: +//! - Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be +//! implemented in the future. The current implementation only supports classic-named V2 checkpoints. //! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future //! multi-file support, but the current implementation only supports single-file checkpoints. //! - Multi-file V1 checkpoints are DEPRECATED. @@ -58,16 +52,10 @@ //! let engine = Arc::new(SyncEngine::new()); //! let table = Table::try_from_uri(path)?; //! -//! // Create a checkpoint builder for the table at a specific version -//! let builder = table.checkpoint(&engine, Some(2))?; -//! -//! // Optionally configure the builder (e.g., force classic naming) -//! let writer = builder.with_classic_naming(); +//! // Create a checkpoint writer for the table at a specific version +//! let mut writer = table.checkpoint(&engine, Some(2))?; //! -//! // Build the checkpoint writer -//! let mut writer = builder.build(&engine)?; -//! -//! // Retrieve checkpoint data (ensuring single consumption) +//! // Retrieve checkpoint data //! let checkpoint_data = writer.get_checkpoint_info()?; //! //! /* Write checkpoint data to file and collect metadata about the write */ @@ -83,12 +71,12 @@ //! on how log replay is used to filter and deduplicate actions for checkpoint creation. use crate::{ actions::{ - schemas::{GetStructField, ToSchema}, - Add, CheckpointMetadata, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, - METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, + ADD_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }, expressions::Scalar, - schema::{SchemaRef, StructType}, + path::ParsedLogPath, + schema::{DataType, SchemaRef, StructField, StructType}, snapshot::Snapshot, DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, }; @@ -137,7 +125,7 @@ pub struct SingleFileCheckpointData { /// ensures checkpoint data is consumed only once. /// /// # Usage Flow -/// 1. Create via `CheckpointBuilder::build()` +/// 1. Create via `Table::checkpoint()` /// 2. Call `get_checkpoint_info()` to obtain the [`SingleFileCheckpointData`] /// containing the path and action iterator for the checkpoint /// 3. Write the checkpoint data to storage (implementation-specific) @@ -151,8 +139,6 @@ pub struct SingleFileCheckpointData { pub struct CheckpointWriter { /// The snapshot from which the checkpoint is created snapshot: Snapshot, - /// Whether the table supports the `v2Checkpoints` feature - is_v2_checkpoints_supported: bool, /// Note: Rc> provides shared mutability for our counters, allowing the /// returned actions iterator from `.get_checkpoint_info()` to update the counters, /// and the `finalize_checkpoint()` method to read them... @@ -166,10 +152,9 @@ pub struct CheckpointWriter { impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters - fn new(snapshot: Snapshot, is_v2_checkpoints_supported: bool) -> Self { + pub fn new(snapshot: Snapshot) -> Self { Self { snapshot, - is_v2_checkpoints_supported, total_actions_counter: Rc::new(RefCell::::new(0.into())), total_add_actions_counter: Rc::new(RefCell::::new(0.into())), data_consumed: false, @@ -181,7 +166,7 @@ impl CheckpointWriter { /// This method is the core of the checkpoint generation process. It: /// /// 1. Ensures checkpoint data is consumed only once via `data_consumed` flag - /// 2. Reads actions from the log segment using the checkpoint schema + /// 2. Reads actions from the log segment using the checkpoint read schema /// 3. Filters and deduplicates actions for the checkpoint /// 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint /// (i.e., if `v2Checkpoints` feature is supported by table) @@ -199,6 +184,10 @@ impl CheckpointWriter { if self.data_consumed { return Err(Error::generic("Checkpoint data has already been consumed")); } + let is_v2_checkpoints_supported = self + .snapshot + .table_configuration() + .is_v2_checkpoint_supported(); // Create iterator over actions for checkpoint data let checkpoint_data = checkpoint_actions_iter( @@ -212,16 +201,18 @@ impl CheckpointWriter { let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( self.snapshot.version() as i64, engine, - self.is_v2_checkpoints_supported, + is_v2_checkpoints_supported, )?); - // Generate the appropriate checkpoint path - // let checkpoint_path = self.generate_checkpoint_path()?; + let checkpoint_path = ParsedLogPath::new_classic_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + )?; self.data_consumed = true; Ok(SingleFileCheckpointData { - path: Url::parse("todo!")?, + path: checkpoint_path.location, data: Box::new(chained), }) } @@ -302,88 +293,6 @@ impl CheckpointWriter { } } -/// Builder for configuring and creating CheckpointWriter instances -/// -/// The CheckpointBuilder provides an interface for configuring checkpoint -/// generation. It handles table feature detection and enforces compatibility -/// between configuration options and table features. -/// -/// # Usage Flow -/// 1. Create a builder via `Table::checkpoint()` -/// 2. Optionally configure with `with_classic_naming()` -/// 3. Call `build()` to create a CheckpointWriter -/// -/// # Checkpoint Format Selection Logic -/// - For tables without v2Checkpoints support: Always uses Single-file Classic-named V1 -/// - For tables with v2Checkpoints support: -/// - With classic naming = false (default): Single-file UUID-named V2 -/// - With classic naming = true: Single-file Classic-named V2 -/// -/// # Checkpoint Naming Conventions -/// -/// ## UUID-named V2 Checkpoints -/// These follow the V2 spec using file name pattern: `n.checkpoint.u.{json/parquet}`, where: -/// - `n` is the snapshot version (zero-padded to 20 digits) -/// - `u` is a UUID -/// e.g. 00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json -/// -/// ## Classic-named Checkpoints -/// A classic checkpoint for version `n` of the table consists of a file named -/// `n.checkpoint.parquet` where `n` is zero-padded to have length 20. These could -/// follow either V1 spec or V2 spec depending on the table's support for the -/// `v2Checkpoints` feature. -/// e.g. 00000000000000000010.checkpoint.parquet -/// -/// # Example -/// ```ignore -/// let table = Table::try_from_uri(path)?; -/// let builder = table.checkpoint(&engine, Some(version))?; -/// let writer = builder.with_classic_naming().build()?; -/// ``` -pub struct CheckpointBuilder { - /// The table snapshot from which to create the checkpoint - snapshot: Snapshot, - - /// Whether to use classic naming for the checkpoint file - with_classic_naming: bool, -} - -impl CheckpointBuilder { - pub(crate) fn new(snapshot: Snapshot) -> Self { - Self { - snapshot, - with_classic_naming: false, - } - } - - /// Configures the builder to use the classic naming scheme - /// - /// Classic naming is optional for V2 checkpoints, but the only option for V1 checkpoints. - /// - For V1 checkpoints, this method is a no-op. - /// - For V2 checkpoints, the default is UUID-naming unless this method is called. - pub fn with_classic_naming(mut self) -> Self { - self.with_classic_naming = true; - self - } - - /// Builds a [`CheckpointWriter`] based on the builder configuration. - /// - /// This method validates the configuration against table features and creates - /// a [`CheckpointWriter`] for the appropriate checkpoint type. It performs protocol - /// table feature checks to determine if v2Checkpoints are supported. - pub fn build(self) -> DeltaResult { - let is_v2_checkpoints_supported = self - .snapshot - .table_configuration() - .is_v2_checkpoint_supported(); - - Ok(CheckpointWriter::new( - self.snapshot, - is_v2_checkpoints_supported, - )) - } -} - /// Internal implementation with injectable time parameter for testing fn deleted_file_retention_timestamp_with_time( retention_duration: Option, @@ -417,7 +326,7 @@ fn deleted_file_retention_timestamp_with_time( /// # Implementation Details /// /// The function creates a single-row [`EngineData`] batch containing only the -/// version field of the [`CheckpointMetadata`] action. Future implementations will +/// version field of the `CheckpointMetadata` action. Future implementations will /// include additional metadata fields such as tags when map support is added. /// /// The resulting [`CheckpointData`] includes a selection vector with a single `true` @@ -429,11 +338,12 @@ fn create_checkpoint_metadata_batch( ) -> DeltaResult>> { if is_v2_checkpoint { let values: &[Scalar] = &[version.into()]; - let checkpoint_metadata_batch = engine.evaluation_handler().create_one( - // TODO: Include checkpointMetadata.tags when maps are supported - Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?), - &values, - )?; + let schema = Arc::new(StructType::new([StructField::not_null( + "checkpointMetadata", + DataType::struct_type([StructField::not_null("version", DataType::LONG)]), + )])); + + let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; let result = CheckpointData { data: checkpoint_metadata_batch, @@ -449,9 +359,14 @@ fn create_checkpoint_metadata_batch( #[cfg(test)] mod unit_tests { use super::*; - + use crate::engine::{arrow_data::ArrowEngineData, sync::SyncEngine}; + use arrow_53::{array::RecordBatch, datatypes::Field}; + use delta_kernel::arrow::array::create_array; use std::time::Duration; + use crate::arrow::array::{ArrayRef, StructArray}; + use crate::arrow::datatypes::{DataType, Schema}; + #[test] fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { let now = Duration::from_secs(1000).as_millis() as i64; @@ -476,4 +391,58 @@ mod unit_tests { Ok(()) } + + #[test] + fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> DeltaResult<()> { + let engine = SyncEngine::new(); + let version = 10; + // Test with is_v2_checkpoint = true + let result = create_checkpoint_metadata_batch(version, &engine, true)?; + assert!(result.is_some()); + let checkpoint_data = result.unwrap()?; + + // Check selection vector has one true value + assert_eq!(checkpoint_data.selection_vector, vec![true]); + + // Verify the underlying EngineData contains the expected CheckpointMetadata action + let record_batch = checkpoint_data + .data + .any_ref() + .downcast_ref::() + .unwrap() + .record_batch(); + + // Build the expected RecordBatch + // Note: The schema is a struct with a single field "checkpointMetadata" of type struct + // containing a single field "version" of type long + let expected_schema = Arc::new(Schema::new(vec![Field::new( + "checkpointMetadata", + DataType::Struct(vec![Field::new("version", DataType::Int64, false)].into()), + false, + )])); + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![( + Arc::new(Field::new("version", DataType::Int64, false)), + create_array!(Int64, [version]) as ArrayRef, + )]))], + ) + .unwrap(); + + assert_eq!(*record_batch, expected); + + Ok(()) + } + + #[test] + fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_not_supported() -> DeltaResult<()> + { + let engine = SyncEngine::new(); + // Test with is_v2_checkpoint = false + let result = create_checkpoint_metadata_batch(10, &engine, false)?; + + // Check that the result is None for V1 checkpoints + assert!(result.is_none()); + Ok(()) + } } diff --git a/kernel/src/table.rs b/kernel/src/table.rs index f8ebb4e8ac..ba7ed80547 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use url::Url; -use crate::checkpoint::CheckpointBuilder; +use crate::checkpoint::CheckpointWriter; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -99,35 +99,20 @@ impl Table { ) } - /// Creates a [`CheckpointBuilder`] for generating table checkpoints. + /// Creates a [`CheckpointWriter`] for generating table checkpoints at the specified version. /// - /// Checkpoints are compact representations of the table state that improve reading performance - /// by providing a consolidated view without requiring full log replay. + /// The checkpoint type is automatically determined based on the table's feature support: + /// - Tables supporting `v2Checkpoints` feature -> Creates a Classic-named V2 checkpoint + /// - Tables not supporting `v2Checkpoints` feature -> Creates a Classic-named V1 checkpoint /// - /// # Checkpoint Types - /// - /// The type of checkpoint created depends on table features and builder configuration: - /// - /// 1. Classic V1 Checkpoint: Created automatically for tables without v2Checkpoints feature support. - /// - Uses classic naming format (`.checkpoint.parquet`) - /// - Created regardless of `with_classic_naming` setting - /// - /// 2. Classic V2 Checkpoint* Created when tables support v2Checkpoints feature AND - /// `with_classic_naming(true)` is specified. - /// - Uses classic naming format (`.checkpoint.parquet`) - /// - Includes additional V2 metadata - /// - /// 3. **UUID V2 Checkpoint**: Created when tables support v2Checkpoints feature AND - /// `with_classic_naming(false)` is used (default). - /// - Uses UUID naming format (`..checkpoint.parquet`) - /// - Includes additional V2 metadata - /// - Recommended for most tables that support v2Checkpoints + /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types + /// and the overall checkpoint process. pub fn checkpoint( &self, engine: &dyn Engine, version: Option, - ) -> DeltaResult { - Ok(CheckpointBuilder::new(self.snapshot(engine, version)?)) + ) -> DeltaResult { + Ok(CheckpointWriter::new(self.snapshot(engine, version)?)) } /// Create a new write transaction for this table. From f1774920b1c716b46b36a1119770a85a4a8b3c41 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 18:17:09 -0700 Subject: [PATCH 102/176] docs --- kernel/src/checkpoint/mod.rs | 53 ++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 70b2fc1535..d2f9a44aeb 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -12,7 +12,7 @@ //! maintaining backwards compatibility by using classic naming that legacy readers can recognize. //! //! For more information on the V1/V2 specifications, see the following protocol section: -//! https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-specs +//! //! //! ### [`CheckpointWriter`] //! Handles the actual checkpoint data generation and writing process. It is created via the @@ -110,7 +110,8 @@ fn get_checkpoint_read_schema() -> &'static SchemaRef { } /// Contains the path and data for a single-file checkpoint -pub struct SingleFileCheckpointData { +#[allow(unused)] // TODO(seb): Make pub for roll-out +pub(crate) struct SingleFileCheckpointData { /// Target URL where the checkpoint file will be written pub path: Url, @@ -136,7 +137,8 @@ pub struct SingleFileCheckpointData { /// 2. Applies selection and deduplication logic with the `CheckpointLogReplayProcessor` /// 3. Tracks counts of included actions for to be written to the _last_checkpoint file /// 5. Chains the [`CheckpointMetadata`] action to the actions iterator (for V2 checkpoints) -pub struct CheckpointWriter { +#[allow(unused)] // TODO(seb): Make pub for roll-out +pub(crate) struct CheckpointWriter { /// The snapshot from which the checkpoint is created snapshot: Snapshot, /// Note: Rc> provides shared mutability for our counters, allowing the @@ -152,7 +154,7 @@ pub struct CheckpointWriter { impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters - pub fn new(snapshot: Snapshot) -> Self { + pub(crate) fn new(snapshot: Snapshot) -> Self { Self { snapshot, total_actions_counter: Rc::new(RefCell::::new(0.into())), @@ -177,7 +179,8 @@ impl CheckpointWriter { /// /// # Returns /// A [`SingleFileCheckpointData`] containing the checkpoint path and action iterator - pub fn get_checkpoint_info( + #[allow(unused)] // TODO(seb): Make pub for roll-out + pub(crate) fn get_checkpoint_info( &mut self, engine: &dyn Engine, ) -> DeltaResult { @@ -236,7 +239,7 @@ impl CheckpointWriter { /// - `engine`: The engine used for writing the _last_checkpoint file /// - `metadata`: A single-row [`EngineData`] batch containing: /// - `size_in_bytes` (i64): The size of the written checkpoint file - #[allow(dead_code)] // TODO: Remove when finalize_checkpoint is implemented + #[allow(unused)] // TODO(seb): Make pub for roll-out fn finalize_checkpoint( self, _engine: &dyn Engine, @@ -336,24 +339,26 @@ fn create_checkpoint_metadata_batch( engine: &dyn Engine, is_v2_checkpoint: bool, ) -> DeltaResult>> { - if is_v2_checkpoint { - let values: &[Scalar] = &[version.into()]; - let schema = Arc::new(StructType::new([StructField::not_null( - "checkpointMetadata", - DataType::struct_type([StructField::not_null("version", DataType::LONG)]), - )])); - - let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; - - let result = CheckpointData { - data: checkpoint_metadata_batch, - selection_vector: vec![true], - }; - - Ok(Some(Ok(result))) - } else { - Ok(None) + if !is_v2_checkpoint { + return Ok(None); } + let values: &[Scalar] = &[version.into()]; + // Create the nested schema structure for `CheckpointMetadata` + // Note: We cannot use `CheckpointMetadata::to_schema()` as it would include + // the 'tags' field which we're not supporting yet due to the lack of map support. + let schema = Arc::new(StructType::new([StructField::not_null( + "checkpointMetadata", + DataType::struct_type([StructField::not_null("version", DataType::LONG)]), + )])); + + let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; + + let result = CheckpointData { + data: checkpoint_metadata_batch, + selection_vector: vec![true], // Always include this action + }; + + Ok(Some(Ok(result))) } #[cfg(test)] @@ -441,7 +446,7 @@ mod unit_tests { // Test with is_v2_checkpoint = false let result = create_checkpoint_metadata_batch(10, &engine, false)?; - // Check that the result is None for V1 checkpoints + // No checkpoint metadata action should be created for V1 checkpoints assert!(result.is_none()); Ok(()) } From 04d418ec5a473348e71656e612ef844c983b0583 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 18:17:15 -0700 Subject: [PATCH 103/176] docs --- kernel/src/table.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/table.rs b/kernel/src/table.rs index ba7ed80547..3e48f5f661 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -107,7 +107,8 @@ impl Table { /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. - pub fn checkpoint( + #[allow(unused)] // TODO(seb) Make pub for roll-out + pub(crate) fn checkpoint( &self, engine: &dyn Engine, version: Option, From d3a97a77acfd945d01e85bb4516a4121f8c064b8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 18:19:13 -0700 Subject: [PATCH 104/176] priv --- kernel/src/checkpoint/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 72b8070f2e..220af3d55c 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -48,9 +48,9 @@ use crate::{DeltaResult, EngineData, Error}; /// [`CheckpointData`] represents a batch of actions filtered for checkpoint creation. /// It wraps a single engine data batch and a corresponding selection vector indicating /// which rows should be written to the checkpoint file. -pub struct CheckpointData { +#[allow(unused)] // TODO(seb): Make pub for roll-out +pub(crate) struct CheckpointData { /// The original engine data containing the actions - #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) data: Box, /// Boolean vector indicating which rows should be included in the checkpoint pub(crate) selection_vector: Vec, From 1adb10050db292a1673b6adeef39ca95702f3ad8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 7 Apr 2025 20:16:46 -0700 Subject: [PATCH 105/176] tests and docs --- kernel/src/checkpoint/mod.rs | 130 +++++++++----- kernel/src/checkpoint/tests.rs | 315 +++++++++++++++++++++++++++++++++ 2 files changed, 396 insertions(+), 49 deletions(-) create mode 100644 kernel/src/checkpoint/tests.rs diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index d2f9a44aeb..becf6fa2bf 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -16,7 +16,7 @@ //! //! ### [`CheckpointWriter`] //! Handles the actual checkpoint data generation and writing process. It is created via the -//! [`Table::checkpoint()`] method and provides the following methods: +//! [`Table::checkpoint()`] method and provides the following APIs: //! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot //! - `get_checkpoint_info(engine: &dyn Engine) -> DeltaResult` - //! Returns the checkpoint data and path information @@ -90,6 +90,8 @@ use std::{ use url::Url; mod log_replay; +#[cfg(test)] +mod tests; /// Schema for extracting relevant actions from log files during checkpoint creation static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { @@ -140,16 +142,16 @@ pub(crate) struct SingleFileCheckpointData { #[allow(unused)] // TODO(seb): Make pub for roll-out pub(crate) struct CheckpointWriter { /// The snapshot from which the checkpoint is created - snapshot: Snapshot, + pub(crate) snapshot: Snapshot, /// Note: Rc> provides shared mutability for our counters, allowing the /// returned actions iterator from `.get_checkpoint_info()` to update the counters, /// and the `finalize_checkpoint()` method to read them... /// Counter for total actions included in the checkpoint - total_actions_counter: Rc>, + pub(crate) total_actions_counter: Rc>, /// Counter for Add actions included in the checkpoint - total_add_actions_counter: Rc>, + pub(crate) total_add_actions_counter: Rc>, /// Flag to track if checkpoint data has been consumed - data_consumed: bool, + pub(crate) data_consumed: bool, } impl CheckpointWriter { @@ -201,7 +203,7 @@ impl CheckpointWriter { ); // Chain the checkpoint metadata action if using V2 checkpoints - let chained = checkpoint_data.chain(create_checkpoint_metadata_batch( + let chained = checkpoint_data.chain(self.create_checkpoint_metadata_batch( self.snapshot.version() as i64, engine, is_v2_checkpoints_supported, @@ -248,6 +250,57 @@ impl CheckpointWriter { todo!("Implement finalize_checkpoint"); } + /// Creates the checkpoint metadata action for V2 checkpoints. + /// + /// For V2 checkpoints, this function generates a special [`CheckpointMetadata`] action + /// that must be included in the V2 spec checkpoint file. This action contains metadata + /// about the checkpoint, particularly its version. For V1 checkpoints, this function + /// returns `None`, as the V1 schema does not include this action type. + /// + /// # Implementation Details + /// + /// The function creates a single-row [`EngineData`] batch containing only the + /// version field of the `CheckpointMetadata` action. Future implementations will + /// include additional metadata fields such as tags when map support is added. + /// + /// The resulting [`CheckpointData`] includes a selection vector with a single `true` + /// value, indicating this action should always be included in the checkpoint. + fn create_checkpoint_metadata_batch( + &self, + version: i64, + engine: &dyn Engine, + is_v2_checkpoint: bool, + ) -> DeltaResult>> { + if !is_v2_checkpoint { + return Ok(None); + } + let values: &[Scalar] = &[version.into()]; + // Create the nested schema structure for `CheckpointMetadata` + // Note: We cannot use `CheckpointMetadata::to_schema()` as it would include + // the 'tags' field which we're not supporting yet due to the lack of map support. + let schema = Arc::new(StructType::new([StructField::not_null( + "checkpointMetadata", + DataType::struct_type([StructField::not_null("version", DataType::LONG)]), + )])); + + let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; + + let result = CheckpointData { + data: checkpoint_metadata_batch, + selection_vector: vec![true], // Always include this action + }; + + // Safe to mutably borrow counter here as the iterator has not yet been returned from + // `get_checkpoint_info()`. The iterator is the only other consumer of the counter. + let mut counter_ref = self + .total_actions_counter + .try_borrow_mut() + .map_err(|e| Error::generic(format!("Failed to borrow mutably: {}", e)))?; + *counter_ref += 1; + + Ok(Some(Ok(result))) + } + /// Calculates the cutoff timestamp for deleted file cleanup. /// /// This function determines the minimum timestamp before which deleted files @@ -319,54 +372,15 @@ fn deleted_file_retention_timestamp_with_time( // Simple subtraction - will produce negative values if retention > now Ok(now_ms - retention_ms) } -/// Creates the checkpoint metadata action for V2 checkpoints. -/// -/// For V2 checkpoints, this function generates a special [`CheckpointMetadata`] action -/// that must be included in the V2 spec checkpoint file. This action contains metadata -/// about the checkpoint, particularly its version. For V1 checkpoints, this function -/// returns `None`, as the V1 schema does not include this action type. -/// -/// # Implementation Details -/// -/// The function creates a single-row [`EngineData`] batch containing only the -/// version field of the `CheckpointMetadata` action. Future implementations will -/// include additional metadata fields such as tags when map support is added. -/// -/// The resulting [`CheckpointData`] includes a selection vector with a single `true` -/// value, indicating this action should always be included in the checkpoint. -fn create_checkpoint_metadata_batch( - version: i64, - engine: &dyn Engine, - is_v2_checkpoint: bool, -) -> DeltaResult>> { - if !is_v2_checkpoint { - return Ok(None); - } - let values: &[Scalar] = &[version.into()]; - // Create the nested schema structure for `CheckpointMetadata` - // Note: We cannot use `CheckpointMetadata::to_schema()` as it would include - // the 'tags' field which we're not supporting yet due to the lack of map support. - let schema = Arc::new(StructType::new([StructField::not_null( - "checkpointMetadata", - DataType::struct_type([StructField::not_null("version", DataType::LONG)]), - )])); - - let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; - - let result = CheckpointData { - data: checkpoint_metadata_batch, - selection_vector: vec![true], // Always include this action - }; - - Ok(Some(Ok(result))) -} #[cfg(test)] mod unit_tests { use super::*; use crate::engine::{arrow_data::ArrowEngineData, sync::SyncEngine}; + use crate::Table; use arrow_53::{array::RecordBatch, datatypes::Field}; use delta_kernel::arrow::array::create_array; + use std::path::PathBuf; use std::time::Duration; use crate::arrow::array::{ArrayRef, StructArray}; @@ -397,12 +411,21 @@ mod unit_tests { Ok(()) } + fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let table = Table::new(url); + table.snapshot(engine, None) + } + #[test] fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> DeltaResult<()> { let engine = SyncEngine::new(); let version = 10; + let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); + // Test with is_v2_checkpoint = true - let result = create_checkpoint_metadata_batch(version, &engine, true)?; + let result = writer.create_checkpoint_metadata_batch(version, &engine, true)?; assert!(result.is_some()); let checkpoint_data = result.unwrap()?; @@ -436,6 +459,9 @@ mod unit_tests { assert_eq!(*record_batch, expected); + // Verify counter was incremented + assert_eq!(*writer.total_actions_counter.borrow(), 1); + Ok(()) } @@ -443,11 +469,17 @@ mod unit_tests { fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_not_supported() -> DeltaResult<()> { let engine = SyncEngine::new(); + let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); + // Test with is_v2_checkpoint = false - let result = create_checkpoint_metadata_batch(10, &engine, false)?; + let result = writer.create_checkpoint_metadata_batch(10, &engine, false)?; // No checkpoint metadata action should be created for V1 checkpoints assert!(result.is_none()); + + // Verify counter was not incremented + assert_eq!(*writer.total_actions_counter.borrow(), 0); + Ok(()) } } diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs new file mode 100644 index 0000000000..52ac6d4faa --- /dev/null +++ b/kernel/src/checkpoint/tests.rs @@ -0,0 +1,315 @@ +use object_store::{memory::InMemory, path::Path, ObjectStore}; +use std::{i64::MAX, sync::Arc}; +use test_utils::delta_path_for_version; +use url::Url; + +use crate::{ + actions::{Add, Metadata, Protocol, Remove}, + engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + utils::test_utils::Action, + DeltaResult, Table, +}; + +/// TODO(seb): Merge copies and move to `test_utils` +/// Create an in-memory store and return the store and the URL for the store's _delta_log directory. +fn new_in_memory_store() -> (Arc, Url) { + ( + Arc::new(InMemory::new()), + Url::parse("memory:///") + .unwrap() + .join("_delta_log/") + .unwrap(), + ) +} + +/// TODO(seb): Merge copies and move to `test_utils` +/// Writes all actions to a _delta_log json commit file in the store. +/// This function formats the provided filename into the _delta_log directory. +fn write_commit_to_store( + store: &Arc, + actions: Vec, + version: u64, +) -> DeltaResult<()> { + let json_lines: Vec = actions + .into_iter() + .map(|action| serde_json::to_string(&action).expect("action to string")) + .collect(); + let content = json_lines.join("\n"); + + let commit_path = format!("_delta_log/{}", delta_path_for_version(version, "json")); + + tokio::runtime::Runtime::new() + .expect("create tokio runtime") + .block_on(async { store.put(&Path::from(commit_path), content.into()).await })?; + + Ok(()) +} + +/// Tests the `checkpoint()` API with: +/// - A table that does not support v2Checkpoint +/// - No version specified (latest version is used) +#[test] +fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit: adds `fake_path_1` + write_commit_to_store( + &store, + vec![Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + })], + 0, + )?; + + // 2nd commit: adds `fake_path_2` & removes `fake_path_1` + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "fake_path_2".into(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "fake_path_1".into(), + data_change: true, + deletion_timestamp: Some(MAX), // Ensure the remove action is not expired + ..Default::default() + }), + ], + 1, + )?; + + // 3rd commit: metadata & protocol actions + // Protocol action does not include the v2Checkpoint reader/writer feature. + write_commit_to_store( + &store, + vec![ + Action::Metadata(Metadata { + id: "fake_path_1".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + ], + 2, + )?; + + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let mut writer = table.checkpoint(&engine, None)?; + let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let mut data_iter = checkpoint_data.data; + + // Verify the checkpoint file path is the latest version by default. + assert_eq!( + checkpoint_data.path, + Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? + ); + + // Recall that the batches of actions are returned in reverse order, with the + // most recent actions first. + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The second batch should only include the add action as the remove action is expired. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The third batch should not be included as the selection vector does not + // contain any true values, as the add action is removed in a following commit. + assert!(data_iter.next().is_none()); + + // Verify the collected metadata + // 2 actions (metadata, protocol) + 1 add action + 1 remove action (last action is reconciled) + assert_eq!(*writer.total_actions_counter.borrow(), 4); + // 1 add action + assert_eq!(*writer.total_add_actions_counter.borrow(), 1); + + Ok(()) +} + +/// Tests the `checkpoint()` API with: +/// - A table that does not support v2Checkpoint +/// - A specific version specified (version 0) +#[test] +fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit (version 0) - metadata and protocol actions + // Protocol action does not include the v2Checkpoint reader/writer feature. + write_commit_to_store( + &store, + vec![ + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + Action::Metadata(Metadata { + id: "test-table-v0".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + ], + 0, + )?; + + // 2nd commit (version 1) - add and remove actions + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "file1.parquet".into(), + data_change: true, + ..Default::default() + }), + Action::Add(Add { + path: "file2.parquet".into(), + data_change: true, + ..Default::default() + }), + ], + 1, + )?; + + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + // Specify version 0 for checkpoint + let mut writer = table.checkpoint(&engine, Some(0))?; + let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let mut data_iter = checkpoint_data.data; + + // Verify the checkpoint file path is the specified version. + assert_eq!( + checkpoint_data.path, + Url::parse("memory:///_delta_log/00000000000000000000.checkpoint.parquet")? + ); + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // No more data should exist because we only requested version 0 + assert!(data_iter.next().is_none()); + + // Verify the collected metadata + // 2 actions (metadata and protocol) + 2 add actions + assert_eq!(*writer.total_actions_counter.borrow(), 2); + // 2 add actions + assert_eq!(*writer.total_add_actions_counter.borrow(), 0); + + Ok(()) +} + +/// Tests the `checkpoint()` API with: +/// - A table that does supports v2Checkpoint +/// - No version specified (latest version is used) +#[test] +fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit: adds `fake_path_2` & removes `fake_path_1` + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "fake_path_2".into(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "fake_path_1".into(), + data_change: true, + deletion_timestamp: Some(MAX), // Ensure the remove action is not expired + ..Default::default() + }), + ], + 0, + )?; + + // 2nd commit: metadata & protocol actions + // Protocol action includes the v2Checkpoint reader/writer feature. + write_commit_to_store( + &store, + vec![ + Action::Metadata(Metadata { + id: "fake_path_1".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + Action::Protocol(Protocol::try_new(3, 7, vec!["v2Checkpoint"].into(), vec!["v2Checkpoint"].into())?), + ], + 1, + )?; + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let mut writer = table.checkpoint(&engine, None)?; + let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let mut data_iter = checkpoint_data.data; + + // Verify the checkpoint file path is the latest version by default. + assert_eq!( + checkpoint_data.path, + Url::parse("memory:///_delta_log/00000000000000000001.checkpoint.parquet")? + ); + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The second batch should be the add action as the remove action is expired. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The third batch should be the CheckpointMetaData action. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true]); + + // No more data should exist + assert!(data_iter.next().is_none()); + + // Verify the collected metadata + // 3 actions (metadata, protocol, and checkpointMetadata) + 1 add action + 1 remove action + assert_eq!(*writer.total_actions_counter.borrow(), 5); + // 2 add actions + assert_eq!(*writer.total_add_actions_counter.borrow(), 1); + + Ok(()) +} + +/// Tests the `checkpoint()` API with: +/// - a version that does not exist in the log +#[test] +fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit (version 0) - metadata and protocol actions + // Protocol action does not include the v2Checkpoint reader/writer feature. + write_commit_to_store( + &store, + vec![ + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + Action::Metadata(Metadata { + id: "test-table-v0".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + ], + 0, + )?; + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let result = table.checkpoint(&engine, Some(999)); + + // Should fail with an appropriate error + // Returns error: "LogSegment end version 0 not the same as the specified end version 999" + // TODO(seb): Update the error message to be tailored to the checkpoint creation + assert!(result.is_err()); + + Ok(()) +} From 9834c6d5c09bf80ba1c881c705c62729265e6a8e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 8 Apr 2025 10:09:01 -0700 Subject: [PATCH 106/176] fix builds --- kernel/src/checkpoint/tests.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 52ac6d4faa..8cf4dfa742 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,5 +1,5 @@ use object_store::{memory::InMemory, path::Path, ObjectStore}; -use std::{i64::MAX, sync::Arc}; +use std::sync::Arc; use test_utils::delta_path_for_version; use url::Url; @@ -76,7 +76,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { Action::Remove(Remove { path: "fake_path_1".into(), data_change: true, - deletion_timestamp: Some(MAX), // Ensure the remove action is not expired + deletion_timestamp: Some(i64::MAX), // Ensure the remove action is not expired ..Default::default() }), ], @@ -224,7 +224,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { Action::Remove(Remove { path: "fake_path_1".into(), data_change: true, - deletion_timestamp: Some(MAX), // Ensure the remove action is not expired + deletion_timestamp: Some(i64::MAX), // Ensure the remove action is not expired ..Default::default() }), ], From 2aec9c38f6e8e528133df9753de9ae36d55571f6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 8 Apr 2025 10:11:39 -0700 Subject: [PATCH 107/176] remove mod docs in this PR --- kernel/src/checkpoint/mod.rs | 56 +----------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 90540351d0..e18479696d 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -4,59 +4,5 @@ //! Checkpoints provide a compact summary of the table state, enabling faster recovery by //! avoiding full log replay. This API supports three checkpoint types: //! -//! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support -//! the `v2Checkpoints` reader/writer feature. -//! 2. **Single-file Classic-named V2 Checkpoint** – ensures backwards compatibility by -//! allowing legacy readers to recognize the checkpoint file, read the protocol action, and -//! fail gracefully. -//! 3. **Single-file UUID-named V2 Checkpoint** – the default and preferred option for small to -//! medium tables with `v2Checkpoints` reader/writer feature enabled. -//! -//! TODO!(seb): API is a WIP -//! The API follows a builder pattern using `CheckpointBuilder`, which performs tbale feature -//! detection and configuration validation. Depending on table features and builder options: -//! -//! - Without `v2Checkpoints`: produces a **Classic-named V1** checkpoint. -//! - With `v2Checkpoints`: produces a **UUID-named V2** checkpoint. -//! - With `v2Checkpoints` + `.classic_naming()`: produces a **Classic-named V2** checkpoint. -//! -//! The builder returns the `CheckpointWriter` which is responsible for: -//! - Producing the correct set of actions to be written to the checkpoint file when -//! `.get_checkpoint_info()` is called. -//! - Writing the _last_checkpoint file when `.finalize_checkpoint()` is called. -//! -//! Notes: -//! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future -//! multi-file support, but the current implementation only supports single-file checkpoints. -//! - Multi-file V1 checkpoints are DEPRECATED. -//! -//! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) -//! -//! TODO(seb): unignore example -//! ```ignore -//! let path = "./tests/data/app-txn-no-checkpoint"; -//! let engine = Arc::new(SyncEngine::new()); -//! let table = Table::try_from_uri(path)?; -//! -//! // Create a checkpoint builder for the table at a specific version -//! let builder = table.checkpoint(&engine, Some(2))?; -//! -//! // Optionally configure the builder (e.g., force classic naming) -//! let writer = builder.with_classic_naming(); -//! -//! // Build the checkpoint writer -//! let mut writer = builder.build(&engine)?; -//! -//! // Retrieve checkpoint data (ensuring single consumption) -//! let checkpoint_data = writer.get_checkpoint_info()?; -//! -//! /* Write checkpoint data to file and collect metadata before finalizing */ -//! -//! // Write the _last_checkpoint file -//! writer.finalize_checkpoint(&engine, &checkpoint_metadata)?; -//! ``` -//! -//! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full -//! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details -//! on how log replay is used to filter and deduplicate actions for checkpoint creation. +//! TODO!(seb): Include docs when implemented mod log_replay; From 2e2062f86fc23e6ceb1664bd24eefad5ef89e254 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 8 Apr 2025 10:22:20 -0700 Subject: [PATCH 108/176] update docs --- kernel/src/checkpoint/log_replay.rs | 30 ++++++++++++++--------------- kernel/src/scan/log_replay.rs | 13 ++++++------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index ae632a4474..26e06ff78c 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -85,8 +85,7 @@ pub(crate) struct CheckpointVisitor<'seen> { #[allow(unused)] impl CheckpointVisitor<'_> { // These index positions correspond to the order of columns defined in - // `selected_column_names_and_types()`, and are used to extract file key information - // for deduplication purposes + // `selected_column_names_and_types()` const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters @@ -125,14 +124,14 @@ impl CheckpointVisitor<'_> { /// Determines if a remove action tombstone has expired and should be excluded from the checkpoint. /// - /// A remove action includes a timestamp indicating when the deletion occurred. Physical files - /// are deleted lazily after a user-defined expiration time. Remove actions are kept to allow - /// concurrent readers to read snapshots at older versions. A remove action remains as a tombstone - /// in a checkpoint file until it expires, which happens when the deletion timestamp is less than - /// or equal to the minimum file retention timestamp. + /// A remove action includes a deletion_timestamp indicating when the deletion occurred. Physical + /// files are deleted lazily after a user-defined expiration time. Remove actions are kept to allow + /// concurrent readers to read snapshots at older versions. /// - /// Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action - /// will be excluded from the checkpoint file as it will be treated as expired. + /// Tombstone expiration rules: + /// - If deletion_timestamp <= minimum_file_retention_timestamp: Expired (exclude) + /// - If deletion_timestamp > minimum_file_retention_timestamp: Valid (include) + /// - If deletion_timestamp is missing: Defaults to 0, treated as expired (exclude) fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { // Ideally this should never be zero, but we are following the same behavior as Delta // Spark and the Java Kernel. @@ -266,11 +265,12 @@ impl CheckpointVisitor<'_> { Ok(Some(())) } - /// Determines if a row in the batch should be included in the checkpoint by checking - /// if it contains any valid action type for the checkpoint. + /// Determines if a row in the batch should be included in the checkpoint. /// - /// Note: This method checks each action type in sequence, and prioritizes file actions as - /// they appear most frequently, followed by transaction, protocol, and metadata actions. + /// This method efficiently checks each action type using short-circuit evaluation + /// through the `.or()` chain. As soon as any check returns `Some(())`, the remaining + /// checks are skipped. Actions are checked in order of expected frequency (file actions first) + /// to optimize performance in typical workloads. /// /// Returns Ok(true) if the row should be included in the checkpoint. /// Returns Ok(false) if the row should be skipped. @@ -332,9 +332,7 @@ impl RowVisitor for CheckpointVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_action(i, getters)?; - } + self.selection_vector[i] = self.is_valid_action(i, getters)?; } Ok(()) } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 82d6abf4e7..e8b4e2705e 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -87,13 +87,12 @@ struct AddRemoveDedupVisitor<'seen> { impl AddRemoveDedupVisitor<'_> { // These index positions correspond to the order of columns defined in - // `selected_column_names_and_types()`, and are used to extract file key information - // for deduplication purposes - const ADD_PATH_INDEX: usize = 0; - const ADD_PARTITION_VALUES_INDEX: usize = 1; - const ADD_DV_START_INDEX: usize = 2; - const REMOVE_PATH_INDEX: usize = 5; - const REMOVE_DV_START_INDEX: usize = 6; + // `selected_column_names_and_types()` + const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters + const ADD_PARTITION_VALUES_INDEX: usize = 1; // Position of "add.partitionValues" in getters + const ADD_DV_START_INDEX: usize = 2; // Start position of add deletion vector columns + const REMOVE_PATH_INDEX: usize = 5; // Position of "remove.path" in getters + const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns fn new( seen: &mut HashSet, From aed3ab6cc4076590b0f827fd6f2242db9dc9b639 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 8 Apr 2025 10:38:16 -0700 Subject: [PATCH 109/176] remove file --- .../_delta_log/.00000000000000000002.json.crc | Bin 20 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc diff --git a/kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/v2-checkpoint-with-sidecars/_delta_log/.00000000000000000002.json.crc deleted file mode 100644 index f1a6ea85db288e6a905667cead26a14f9cd38b6a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}Es(x>#r#q1~;dcw!v@GG_(Q From fcb289d6b9dad7a5f983809c8326d9cf4935cff4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 8 Apr 2025 11:02:29 -0700 Subject: [PATCH 110/176] docs --- kernel/src/checkpoint/log_replay.rs | 52 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 0be94b289b..d0a29f1248 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,34 +1,32 @@ -//! [`CheckpointLogReplayProcessor`] performs log replay specifically for checkpoint creation in delta tables. +//! The [`CheckpointLogReplayProcessor`] implements specialized log replay logic for creating +//! checkpoint files. It processes log files in reverse chronological order (newest to oldest) +//! and selects only the minimal set of actions needed to represent the table state at a given version. //! -//! During checkpoint creation, the processor reads batches of log actions (in reverse chronological order) -//! and performs the following steps: -//! During checkpoint creation, the processor reads batches of log actions (in reverse chronological order) -//! and performs the following steps: +//! ## Filtering Process //! -//! - Protocol and Metadata Filtering: Ensures that only the most recent protocol and metadata actions -//! are retained -//! - Transaction Deduplication: For each transaction (identified by app ID), only the latest action -//! is preserved to maintain a consistent transaction history. -//! - File Action Deduplication: Leverages the [`FileActionDeduplicator`] mechanism to ensure that -//! for each unique file (identified by its path and deletion vector unique ID), only the most -//! recent valid action is included. -//! - Tombstone Retention Management: Excludes file removal tombstones that are older than the -//! configured `minimum_file_retention_timestamp`, reducing checkpoint size without compromising -//! table consistency. -//! - Action Type Filtering: Excludes other action types such as commitInfo, and CDC actions that -//! aren't required for reconstructing table state. +//! For checkpoint creation, this processor applies several filtering and deduplication +//! steps to each batch of log actions: //! -//! The [`CheckpointVisitor`] implements the visitor pattern to efficiently apply these filtering -//! rules to each action in the batch, determining which should be included in the checkpoint file. -//! It handles deduplication of file actions, expiration of remove tombstones, and filtering of -//! non-file actions (protocol, metadata, transaction) while excluding unnecessary action types. +//! 1. **Protocol and Metadata**: Retains only the latest protocol and metadata actions. +//! 2. **Transactions**: Keeps the most recent action for each unique transaction (by app ID). +//! 3. **File Actions**: Deduplicates file actions (add/remove) by path and deletion vector ID, +//! keeping only the latest valid action. +//! 4. **Tombstones**: Excludes expired remove actions older than `minimum_file_retention_timestamp`. +//! 5. **Action Types**: Filters out irrelevant action types such as commitInfo, CDC, and sidecar actions. //! -//! As an implementation of [`LogReplayProcessor`], [`CheckpointLogReplayProcessor`] provides the -//! `process_actions_batch` method, which applies these steps to each batch of log actions and -//! produces a [`CheckpointData`] result. This result encapsulates both the original batch data -//! and a selection vector indicating which rows should be included in the checkpoint file. -//! The [`CheckpointVisitor`] is applied within the `process_actions_batch` method to determine -//! which rows to include by filtering protocol, metadata, transaction, and file actions. +//! ## Architecture +//! +//! - [`CheckpointVisitor`]: Implements [`RowVisitor`] to examine each action in a batch and +//! determine if it should be included in the checkpoint. It maintains state for deduplication +//! across multiple actions in a batch and efficiently handles all filtering rules. +//! +//! - [`CheckpointLogReplayProcessor`]: Implements the [`LogReplayProcessor`] trait and orchestrates +//! the overall process. For each batch of log actions, it: +//! 1. Creates a visitor with the current deduplication state +//! 2. Applies the visitor to filter actions in the batch +//! 3. Updates counters and state for cross-batch deduplication +//! 4. Produces a [`CheckpointData`] result which includes a selection vector indicating which +//! actions should be included in the checkpoint file use std::cell::RefCell; use std::collections::HashSet; use std::rc::Rc; From 4d2029e39d4f875b0fb2ce8a7c4490bcfa484699 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 09:36:58 -0700 Subject: [PATCH 111/176] docs --- kernel/src/checkpoint/log_replay.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index d0a29f1248..1ea6437ff1 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -1,18 +1,19 @@ //! The [`CheckpointLogReplayProcessor`] implements specialized log replay logic for creating //! checkpoint files. It processes log files in reverse chronological order (newest to oldest) -//! and selects only the minimal set of actions needed to represent the table state at a given version. +//! and selects the set of actions to include in a checkpoint for a specific version. //! -//! ## Filtering Process +//! ## Actions Included for Checkpointing //! //! For checkpoint creation, this processor applies several filtering and deduplication //! steps to each batch of log actions: //! -//! 1. **Protocol and Metadata**: Retains only the latest protocol and metadata actions. -//! 2. **Transactions**: Keeps the most recent action for each unique transaction (by app ID). -//! 3. **File Actions**: Deduplicates file actions (add/remove) by path and deletion vector ID, -//! keeping only the latest valid action. -//! 4. **Tombstones**: Excludes expired remove actions older than `minimum_file_retention_timestamp`. -//! 5. **Action Types**: Filters out irrelevant action types such as commitInfo, CDC, and sidecar actions. +//! 1. **Protocol and Metadata**: Retains exactly one of each - keeping only the latest protocol +//! and metadata actions. +//! 2. **Txn Actions**: Keeps exactly one `txn` action for each unique app ID, always selecting +//! the latest one encountered. +//! 3. **File Actions**: Resolves file actions to produce the latest state of the table, keeping +//! the most recent valid add actions and unexpired remove actions (tombstones) that are newer +//! than `minimum_file_retention_timestamp`. //! //! ## Architecture //! @@ -41,7 +42,7 @@ use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; -/// TODO!(seb): Change this to `type CheckpointData = FilteredEngineData` once available. +/// TODO!(seb): Replace `CheckpointData` with `FilteredEngineData` when available /// /// [`CheckpointData`] represents a batch of actions filtered for checkpoint creation. /// It wraps a single engine data batch and a corresponding selection vector indicating From e0d81abe1fda7d5f4623acf37350114fbcbd823e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 10:07:17 -0700 Subject: [PATCH 112/176] docs --- kernel/src/checkpoint/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 1ea6437ff1..7d96ab8d86 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -66,8 +66,8 @@ impl HasSelectionVector for CheckpointData { /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. /// /// It processes each action batch via the `process_actions_batch` method, using the -/// [`CheckpointVisitor`] to map each [`EngineData`] batch into a [`CheckpointData`] -/// instance that reflect only the necessary actions for the checkpoint. +/// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions +/// should be included in the checkpoint. pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. From b4e28eec8a5c40d231b76c3af19226622e0bcb81 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 10:38:25 -0700 Subject: [PATCH 113/176] review --- kernel/src/checkpoint/log_replay.rs | 6 +-- kernel/src/checkpoint/mod.rs | 60 +++++++++++++++-------------- kernel/src/lib.rs | 2 +- kernel/src/table.rs | 6 +-- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 220af3d55c..7a2b918f16 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -49,11 +49,11 @@ use crate::{DeltaResult, EngineData, Error}; /// It wraps a single engine data batch and a corresponding selection vector indicating /// which rows should be written to the checkpoint file. #[allow(unused)] // TODO(seb): Make pub for roll-out -pub(crate) struct CheckpointData { +pub struct CheckpointData { /// The original engine data containing the actions - pub(crate) data: Box, + pub data: Box, /// Boolean vector indicating which rows should be included in the checkpoint - pub(crate) selection_vector: Vec, + pub selection_vector: Vec, } impl HasSelectionVector for CheckpointData { diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index becf6fa2bf..70df1f2471 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -69,6 +69,12 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. +use std::{ + cell::RefCell, + rc::Rc, + sync::{Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; use crate::{ actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, @@ -81,20 +87,15 @@ use crate::{ DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, }; use log_replay::{checkpoint_actions_iter, CheckpointData}; -use std::{ - cell::RefCell, - rc::Rc, - sync::{Arc, LazyLock}, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; use url::Url; + mod log_replay; #[cfg(test)] mod tests; -/// Schema for extracting relevant actions from log files during checkpoint creation -static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { +/// Schema for extracting relevant actions from log files for checkpoint creation +static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ Option::::get_struct_field(ADD_NAME), Option::::get_struct_field(REMOVE_NAME), @@ -106,43 +107,44 @@ static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { .into() }); -/// Returns the schema for reading Delta log actions during checkpoint creation -fn get_checkpoint_read_schema() -> &'static SchemaRef { - &CHECKPOINT_READ_SCHEMA +/// Returns the schema for reading Delta log actions for checkpoint creation +fn get_checkpoint_actions_schema() -> &'static SchemaRef { + &CHECKPOINT_ACTIONS_SCHEMA } -/// Contains the path and data for a single-file checkpoint -#[allow(unused)] // TODO(seb): Make pub for roll-out -pub(crate) struct SingleFileCheckpointData { - /// Target URL where the checkpoint file will be written +/// Represents a single-file checkpoint, including the data to write and the target path. +/// +/// TODO(seb): Rename to `CheckpointData` once `FilteredEngineData` is introduced. +pub struct SingleFileCheckpointData { + /// The URL where the checkpoint file should be written. pub path: Url, - /// Iterator over checkpoint actions to be written to the file + /// An iterator over the checkpoint data to be written to the file. pub data: Box>>, } + /// Manages the checkpoint writing process for Delta tables /// /// The [`CheckpointWriter`] orchestrates creating checkpoint data and finalizing /// the checkpoint file. It tracks statistics about included actions and /// ensures checkpoint data is consumed only once. /// -/// # Usage Flow -/// 1. Create via `Table::checkpoint()` -/// 2. Call `get_checkpoint_info()` to obtain the [`SingleFileCheckpointData`] -/// containing the path and action iterator for the checkpoint +/// # Usage +/// 1. Create via [`Table::checkpoint()`] +/// 2. Call [`CheckpointWriter::get_checkpoint_info()`] to obtain [`SingleFileCheckpointData`], +/// containing the checkpoint path and data iterator /// 3. Write the checkpoint data to storage (implementation-specific) -/// 4. Call `finalize_checkpoint()` to create the _last_checkpoint file +/// 4. Call [`CheckpointWriter::finalize_checkpoint()`] to create the _last_checkpoint file /// /// # Internal Process /// 1. Reads relevant actions from the log segment using the checkpoint read schema /// 2. Applies selection and deduplication logic with the `CheckpointLogReplayProcessor` /// 3. Tracks counts of included actions for to be written to the _last_checkpoint file /// 5. Chains the [`CheckpointMetadata`] action to the actions iterator (for V2 checkpoints) -#[allow(unused)] // TODO(seb): Make pub for roll-out -pub(crate) struct CheckpointWriter { - /// The snapshot from which the checkpoint is created - pub(crate) snapshot: Snapshot, +pub struct CheckpointWriter { + /// Reference to the snapshot of the table being checkpointed + pub(crate) snapshot: Arc, /// Note: Rc> provides shared mutability for our counters, allowing the /// returned actions iterator from `.get_checkpoint_info()` to update the counters, /// and the `finalize_checkpoint()` method to read them... @@ -156,7 +158,7 @@ pub(crate) struct CheckpointWriter { impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters - pub(crate) fn new(snapshot: Snapshot) -> Self { + pub(crate) fn new(snapshot: Arc) -> Self { Self { snapshot, total_actions_counter: Rc::new(RefCell::::new(0.into())), @@ -339,7 +341,7 @@ impl CheckpointWriter { &self, engine: &dyn Engine, ) -> DeltaResult, bool)>> + Send> { - let read_schema = get_checkpoint_read_schema(); + let read_schema = get_checkpoint_actions_schema(); self.snapshot.log_segment().read_actions( engine, read_schema.clone(), @@ -411,11 +413,11 @@ mod unit_tests { Ok(()) } - fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult { + fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); let url = url::Url::from_directory_path(path.unwrap()).unwrap(); let table = Table::new(url); - table.snapshot(engine, None) + Ok(Arc::new(table.snapshot(engine, None)?)) } #[test] diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 9ab79d558f..ff74b81d5f 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,7 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; -mod checkpoint; +pub mod checkpoint; pub mod engine_data; pub mod error; pub mod expressions; diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 3e48f5f661..734f51c4d2 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::ops::Deref; use std::path::PathBuf; +use std::sync::Arc; use url::Url; @@ -107,13 +108,12 @@ impl Table { /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. - #[allow(unused)] // TODO(seb) Make pub for roll-out - pub(crate) fn checkpoint( + pub fn checkpoint( &self, engine: &dyn Engine, version: Option, ) -> DeltaResult { - Ok(CheckpointWriter::new(self.snapshot(engine, version)?)) + Ok(CheckpointWriter::new(Arc::new(self.snapshot(engine, version)?))) } /// Create a new write transaction for this table. From 544c42afeca42696c1e47babfe7bfeed16ef6442 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 12:41:20 -0700 Subject: [PATCH 114/176] partial review --- kernel/src/checkpoint/mod.rs | 81 +++++++++++++--------------------- kernel/src/checkpoint/tests.rs | 6 +-- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 70df1f2471..9709ef48c5 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -18,9 +18,9 @@ //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`Table::checkpoint()`] method and provides the following APIs: //! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot -//! - `get_checkpoint_info(engine: &dyn Engine) -> DeltaResult` - +//! - `checkpoint_data(engine: &dyn Engine) -> DeltaResult` - //! Returns the checkpoint data and path information -//! - `finalize_checkpoint(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - +//! - `finalize(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - //! Writes the _last_checkpoint file after the checkpoint data has been written //! //! ## Checkpoint Type Selection @@ -56,25 +56,19 @@ //! let mut writer = table.checkpoint(&engine, Some(2))?; //! //! // Retrieve checkpoint data -//! let checkpoint_data = writer.get_checkpoint_info()?; +//! let checkpoint_data = writer.checkpoint_data()?; //! //! /* Write checkpoint data to file and collect metadata about the write */ //! /* The implementation of the write is storage-specific and not shown */ //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! //! // Finalize the checkpoint by writing the _last_checkpoint file -//! writer.finalize_checkpoint(&engine, &checkpoint_metadata)?; +//! writer.finalize(&engine, &checkpoint_metadata)?; //! ``` //! //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. -use std::{ - cell::RefCell, - rc::Rc, - sync::{Arc, LazyLock}, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; use crate::{ actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, @@ -87,9 +81,14 @@ use crate::{ DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, }; use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::{ + cell::RefCell, + rc::Rc, + sync::{Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; use url::Url; - mod log_replay; #[cfg(test)] mod tests; @@ -113,7 +112,7 @@ fn get_checkpoint_actions_schema() -> &'static SchemaRef { } /// Represents a single-file checkpoint, including the data to write and the target path. -/// +/// /// TODO(seb): Rename to `CheckpointData` once `FilteredEngineData` is introduced. pub struct SingleFileCheckpointData { /// The URL where the checkpoint file should be written. @@ -123,7 +122,6 @@ pub struct SingleFileCheckpointData { pub data: Box>>, } - /// Manages the checkpoint writing process for Delta tables /// /// The [`CheckpointWriter`] orchestrates creating checkpoint data and finalizing @@ -132,10 +130,10 @@ pub struct SingleFileCheckpointData { /// /// # Usage /// 1. Create via [`Table::checkpoint()`] -/// 2. Call [`CheckpointWriter::get_checkpoint_info()`] to obtain [`SingleFileCheckpointData`], +/// 2. Call [`CheckpointWriter::checkpoint_data()`] to obtain [`SingleFileCheckpointData`], /// containing the checkpoint path and data iterator /// 3. Write the checkpoint data to storage (implementation-specific) -/// 4. Call [`CheckpointWriter::finalize_checkpoint()`] to create the _last_checkpoint file +/// 4. Call [`CheckpointWriter::finalize()`] to create the _last_checkpoint file /// /// # Internal Process /// 1. Reads relevant actions from the log segment using the checkpoint read schema @@ -146,8 +144,8 @@ pub struct CheckpointWriter { /// Reference to the snapshot of the table being checkpointed pub(crate) snapshot: Arc, /// Note: Rc> provides shared mutability for our counters, allowing the - /// returned actions iterator from `.get_checkpoint_info()` to update the counters, - /// and the `finalize_checkpoint()` method to read them... + /// returned actions iterator from `.checkpoint_data()` to update the counters, + /// and the `finalize()` method to read them... /// Counter for total actions included in the checkpoint pub(crate) total_actions_counter: Rc>, /// Counter for Add actions included in the checkpoint @@ -179,26 +177,34 @@ impl CheckpointWriter { /// 5. Generates the appropriate checkpoint path /// /// The returned data should be written to persistent storage by the caller - /// before calling `finalize_checkpoint()` otherwise data loss may occur. + /// before calling `finalize()` otherwise data loss may occur. /// /// # Returns /// A [`SingleFileCheckpointData`] containing the checkpoint path and action iterator - #[allow(unused)] // TODO(seb): Make pub for roll-out - pub(crate) fn get_checkpoint_info( + pub fn checkpoint_data( &mut self, engine: &dyn Engine, ) -> DeltaResult { if self.data_consumed { return Err(Error::generic("Checkpoint data has already been consumed")); } + let is_v2_checkpoints_supported = self .snapshot .table_configuration() .is_v2_checkpoint_supported(); + let read_schema = get_checkpoint_actions_schema(); + let actions = self.snapshot.log_segment().read_actions( + engine, + read_schema.clone(), + read_schema.clone(), + None, + ); + // Create iterator over actions for checkpoint data let checkpoint_data = checkpoint_actions_iter( - self.replay_for_checkpoint_data(engine)?, + actions?, self.total_actions_counter.clone(), self.total_add_actions_counter.clone(), self.deleted_file_retention_timestamp()?, @@ -244,12 +250,8 @@ impl CheckpointWriter { /// - `metadata`: A single-row [`EngineData`] batch containing: /// - `size_in_bytes` (i64): The size of the written checkpoint file #[allow(unused)] // TODO(seb): Make pub for roll-out - fn finalize_checkpoint( - self, - _engine: &dyn Engine, - _metadata: &dyn EngineData, - ) -> DeltaResult<()> { - todo!("Implement finalize_checkpoint"); + fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { + todo!("Implement finalize"); } /// Creates the checkpoint metadata action for V2 checkpoints. @@ -293,7 +295,7 @@ impl CheckpointWriter { }; // Safe to mutably borrow counter here as the iterator has not yet been returned from - // `get_checkpoint_info()`. The iterator is the only other consumer of the counter. + // `checkpoint_data()`. The iterator is the only other consumer of the counter. let mut counter_ref = self .total_actions_counter .try_borrow_mut() @@ -326,29 +328,6 @@ impl CheckpointWriter { .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, ) } - - /// Retrieves an iterator over all actions to be included in the checkpoint - /// - /// This method reads the relevant actions from the table's log segment using - /// the checkpoint schema, which filters for action types needed in checkpoints. - /// - /// The returned iterator yields tuples where: - /// - The first element is data in engine format - /// - The second element is a flag that indicates the action's source: - /// - `true` if the action came from a commit file - /// - `false` if the action came from a previous checkpoint file - fn replay_for_checkpoint_data( - &self, - engine: &dyn Engine, - ) -> DeltaResult, bool)>> + Send> { - let read_schema = get_checkpoint_actions_schema(); - self.snapshot.log_segment().read_actions( - engine, - read_schema.clone(), - read_schema.clone(), - None, - ) - } } /// Internal implementation with injectable time parameter for testing diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 8cf4dfa742..a600442dc9 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -101,7 +101,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); let mut writer = table.checkpoint(&engine, None)?; - let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the latest version by default. @@ -179,7 +179,7 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let table = Table::new(table_root); // Specify version 0 for checkpoint let mut writer = table.checkpoint(&engine, Some(0))?; - let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the specified version. @@ -248,7 +248,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); let mut writer = table.checkpoint(&engine, None)?; - let checkpoint_data = writer.get_checkpoint_info(&engine)?; + let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the latest version by default. From e8d1239c94dd544793dd922aff6a113c486ce405 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 14:10:04 -0700 Subject: [PATCH 115/176] arc atomic --- kernel/src/checkpoint/log_replay.rs | 43 ++++++++++++----------- kernel/src/checkpoint/mod.rs | 54 ++++++++++++++--------------- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 7a2b918f16..cc045c276a 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -29,10 +29,9 @@ //! and a selection vector indicating which rows should be included in the checkpoint file. //! The [`CheckpointVisitor`] is applied within the `process_actions_batch` method to determine //! which rows to include by filtering protocol, metadata, transaction, and file actions. -use std::cell::RefCell; use std::collections::HashSet; -use std::rc::Rc; -use std::sync::LazyLock; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{ @@ -74,13 +73,11 @@ pub(crate) struct CheckpointLogReplayProcessor { /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - // Rc> provides shared mutability for our counters, allowing both the + // Arc provides shared mutability for our counters, allowing both the // iterator to update the values during processing and the caller to observe the final - // counts afterward. Note that this approach is not thread-safe and only works in - // single-threaded contexts, which means the iterator cannot be sent across thread - // boundaries (no Send trait). - total_actions: Rc>, - total_add_actions: Rc>, + // counts afterward. + total_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -131,10 +128,14 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Update the total actions and add actions counters - *self.total_actions.borrow_mut() += - visitor.total_file_actions + visitor.total_non_file_actions; - *self.total_add_actions.borrow_mut() += visitor.total_add_actions; + // Update the total actions and add actions counters. Relaxed ordering is + // sufficient here as we only care about the total count and not the order of updates. + self.total_actions.fetch_add( + visitor.total_file_actions + visitor.total_non_file_actions, + Ordering::Relaxed, + ); + self.total_add_actions + .fetch_add(visitor.total_add_actions, Ordering::Relaxed); // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -154,8 +155,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Rc>, - total_add_actions_counter: Rc>, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -181,8 +182,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>>, - total_actions_counter: Rc>, - total_add_actions_counter: Rc>, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> { CheckpointLogReplayProcessor::new( @@ -769,8 +770,8 @@ mod tests { #[test] fn test_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Rc::new(RefCell::new(0)); - let total_add_actions_counter = Rc::new(RefCell::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -829,9 +830,9 @@ mod tests { vec![false, false, false, true, false, true] ); // 6 total actions (5 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(*total_actions_counter.borrow(), 7); + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 7); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(*total_add_actions_counter.borrow(), 3); + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); Ok(()) } diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 9709ef48c5..efe95eeb7c 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -42,7 +42,7 @@ //! implemented in the future. The current implementation only supports classic-named V2 checkpoints. //! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future //! multi-file support, but the current implementation only supports single-file checkpoints. -//! - Multi-file V1 checkpoints are DEPRECATED. +//! - Multi-file V1 checkpoints are DEPRECATED and UNSAFE. //! //! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! @@ -69,21 +69,20 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. -use crate::{ - actions::{ - schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, - ADD_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, - }, - expressions::Scalar, - path::ParsedLogPath, - schema::{DataType, SchemaRef, StructField, StructType}, - snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, +use crate::actions::{ + schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, + METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; +use crate::expressions::Scalar; +use crate::path::ParsedLogPath; +use crate::schema::{DataType, SchemaRef, StructField, StructType}; +use crate::snapshot::Snapshot; +#[cfg(doc)] +use crate::{actions::CheckpointMetadata, table::Table}; +use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::sync::atomic::AtomicI64; use std::{ - cell::RefCell, - rc::Rc, sync::{Arc, LazyLock}, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -143,13 +142,13 @@ pub struct SingleFileCheckpointData { pub struct CheckpointWriter { /// Reference to the snapshot of the table being checkpointed pub(crate) snapshot: Arc, - /// Note: Rc> provides shared mutability for our counters, allowing the + /// Note: Arc> provides shared mutability for our counters, allowing the /// returned actions iterator from `.checkpoint_data()` to update the counters, /// and the `finalize()` method to read them... /// Counter for total actions included in the checkpoint - pub(crate) total_actions_counter: Rc>, + pub(crate) total_actions_counter: Arc, /// Counter for Add actions included in the checkpoint - pub(crate) total_add_actions_counter: Rc>, + pub(crate) total_add_actions_counter: Arc, /// Flag to track if checkpoint data has been consumed pub(crate) data_consumed: bool, } @@ -159,8 +158,8 @@ impl CheckpointWriter { pub(crate) fn new(snapshot: Arc) -> Self { Self { snapshot, - total_actions_counter: Rc::new(RefCell::::new(0.into())), - total_add_actions_counter: Rc::new(RefCell::::new(0.into())), + total_actions_counter: Arc::new(AtomicI64::new(0)), + total_add_actions_counter: Arc::new(AtomicI64::new(0)), data_consumed: false, } } @@ -249,7 +248,8 @@ impl CheckpointWriter { /// - `engine`: The engine used for writing the _last_checkpoint file /// - `metadata`: A single-row [`EngineData`] batch containing: /// - `size_in_bytes` (i64): The size of the written checkpoint file - #[allow(unused)] // TODO(seb): Make pub for roll-out + #[allow(unused)] + // TODO(seb): Implement finalize & then make pub. Pub(crate) for docs fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { todo!("Implement finalize"); } @@ -294,13 +294,10 @@ impl CheckpointWriter { selection_vector: vec![true], // Always include this action }; - // Safe to mutably borrow counter here as the iterator has not yet been returned from - // `checkpoint_data()`. The iterator is the only other consumer of the counter. - let mut counter_ref = self - .total_actions_counter - .try_borrow_mut() - .map_err(|e| Error::generic(format!("Failed to borrow mutably: {}", e)))?; - *counter_ref += 1; + // Ordering does not matter as there are no other threads modifying this counter + // at this time (since we have not yet returned the iterator which performs the action counting) + self.total_actions_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); Ok(Some(Ok(result))) } @@ -362,6 +359,7 @@ mod unit_tests { use arrow_53::{array::RecordBatch, datatypes::Field}; use delta_kernel::arrow::array::create_array; use std::path::PathBuf; + use std::sync::atomic::Ordering; use std::time::Duration; use crate::arrow::array::{ArrayRef, StructArray}; @@ -441,7 +439,7 @@ mod unit_tests { assert_eq!(*record_batch, expected); // Verify counter was incremented - assert_eq!(*writer.total_actions_counter.borrow(), 1); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 1); Ok(()) } @@ -459,7 +457,7 @@ mod unit_tests { assert!(result.is_none()); // Verify counter was not incremented - assert_eq!(*writer.total_actions_counter.borrow(), 0); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 0); Ok(()) } From e9de5bc27c56399fed5e88c5ad27c2421ce2d2af Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 9 Apr 2025 14:29:17 -0700 Subject: [PATCH 116/176] arc --- kernel/src/checkpoint/log_replay.rs | 45 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 7d96ab8d86..cb6e981d92 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -28,10 +28,9 @@ //! 3. Updates counters and state for cross-batch deduplication //! 4. Produces a [`CheckpointData`] result which includes a selection vector indicating which //! actions should be included in the checkpoint file -use std::cell::RefCell; use std::collections::HashSet; -use std::rc::Rc; -use std::sync::LazyLock; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{ @@ -66,20 +65,18 @@ impl HasSelectionVector for CheckpointData { /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. /// /// It processes each action batch via the `process_actions_batch` method, using the -/// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions +/// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions /// should be included in the checkpoint. pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - // Rc> provides shared mutability for our counters, allowing both the + // Arc provides shared mutability for our counters, allowing both the // iterator to update the values during processing and the caller to observe the final - // counts afterward. Note that this approach is not thread-safe and only works in - // single-threaded contexts, which means the iterator cannot be sent across thread - // boundaries (no Send trait). - total_actions: Rc>, - total_add_actions: Rc>, + // counts afterward. + total_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -130,10 +127,14 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Update the total actions and add actions counters - *self.total_actions.borrow_mut() += - visitor.total_file_actions + visitor.total_non_file_actions; - *self.total_add_actions.borrow_mut() += visitor.total_add_actions; + // Update the total actions and add actions counters. Relaxed ordering is + // sufficient here as we only care about the total count and not the order of updates. + self.total_actions.fetch_add( + visitor.total_file_actions + visitor.total_non_file_actions, + Ordering::Relaxed, + ); + self.total_add_actions + .fetch_add(visitor.total_add_actions, Ordering::Relaxed); // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; @@ -153,8 +154,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Rc>, - total_add_actions_counter: Rc>, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -180,8 +181,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>>, - total_actions_counter: Rc>, - total_add_actions_counter: Rc>, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> { CheckpointLogReplayProcessor::new( @@ -766,8 +767,8 @@ mod tests { #[test] fn test_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Rc::new(RefCell::new(0)); - let total_add_actions_counter = Rc::new(RefCell::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -826,9 +827,9 @@ mod tests { vec![false, false, false, true, false, true] ); // 6 total actions (5 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(*total_actions_counter.borrow(), 7); + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 7); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(*total_add_actions_counter.borrow(), 3); + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); Ok(()) } From 99d31a740dfc9467753d9f89db64d5c56ac65c6e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 10 Apr 2025 11:56:14 -0700 Subject: [PATCH 117/176] .finalize() with tests --- kernel/src/checkpoint/mod.rs | 233 ++++++++++++++++++++++++++++----- kernel/src/checkpoint/tests.rs | 110 ++++++++++++---- 2 files changed, 285 insertions(+), 58 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index efe95eeb7c..c824ac54cd 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -14,7 +14,7 @@ //! For more information on the V1/V2 specifications, see the following protocol section: //! //! -//! ### [`CheckpointWriter`] +//! ## [`CheckpointWriter`] //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`Table::checkpoint()`] method and provides the following APIs: //! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot @@ -24,7 +24,6 @@ //! Writes the _last_checkpoint file after the checkpoint data has been written //! //! ## Checkpoint Type Selection -//! //! The checkpoint type is determined by whether the table supports the `v2Checkpoints` reader/writer feature: //! //! ```text @@ -69,19 +68,20 @@ //! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full //! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details //! on how log replay is used to filter and deduplicate actions for checkpoint creation. +use crate::actions::CHECKPOINT_METADATA_NAME; use crate::actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; -use crate::expressions::Scalar; +use crate::expressions::{column_expr, Scalar}; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::snapshot::Snapshot; #[cfg(doc)] use crate::{actions::CheckpointMetadata, table::Table}; -use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; +use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, Expression}; use log_replay::{checkpoint_actions_iter, CheckpointData}; -use std::sync::atomic::AtomicI64; +use std::sync::atomic::{AtomicI64, Ordering}; use std::{ sync::{Arc, LazyLock}, time::{Duration, SystemTime, UNIX_EPOCH}, @@ -92,6 +92,8 @@ mod log_replay; #[cfg(test)] mod tests; +static LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint.json"; + /// Schema for extracting relevant actions from log files for checkpoint creation static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -229,9 +231,9 @@ impl CheckpointWriter { }) } - /// Finalizes the checkpoint writing process by creating the _last_checkpoint file + /// Finalizes the checkpoint writing process by creating the `_last_checkpoint` file /// - /// The `LastCheckpointInfo` (`_last_checkpoint`) file is a metadata file that contains + /// The [`LastCheckpointHint`] (`_last_checkpoint`) file is a metadata file that contains /// information about the last checkpoint created for the table. It is used as a hint /// for the engine to quickly locate the last checkpoint and avoid full log replay when /// reading the table. @@ -240,18 +242,51 @@ impl CheckpointWriter { /// 0. IMPORTANT: This method must only be called AFTER successfully writing /// all checkpoint data to storage. Failure to do so may result in /// data loss. - /// 1. Extracts size information from the provided metadata - /// 2. Combines with additional metadata collected during checkpoint creation - /// 3. Writes the _last_checkpoint file to the log + /// 1. Validates the schema of the engine-provided metadata + /// 2. Enrich the metadata with the additional fields: + /// - `version`: The version of the checkpoint (snapshot version) + /// - `size`: The number of actions in the checkpoint (total_actions_counter) + /// - `parts`: The number of parts in the checkpoint (always 1) + /// - `sizeInBytes`: The size of the checkpoint file in bytes (from metadata) + /// - `numOfAddFiles`: The number of add files in the checkpoint (total_add_actions_counter) + /// - `checkpointSchema`: (not yet implemented) + /// - `checksum`: (not yet implemented) + /// 3. Write the metadata to the `_last_checkpoint` file /// /// # Parameters /// - `engine`: The engine used for writing the _last_checkpoint file /// - `metadata`: A single-row [`EngineData`] batch containing: - /// - `size_in_bytes` (i64): The size of the written checkpoint file - #[allow(unused)] - // TODO(seb): Implement finalize & then make pub. Pub(crate) for docs - fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { - todo!("Implement finalize"); + /// - `sizeInBytes` (i64): The size of the written checkpoint file + pub fn finalize(self, engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()> { + let version = self.snapshot.version().try_into().map_err(|e| { + Error::generic(format!( + "Failed to convert version from u64 {} to i64: {}", + self.snapshot.version(), + e + )) + })?; + + let checkpoint_metadata = create_last_checkpoint_data( + engine, + metadata, + version, + self.total_actions_counter.load(Ordering::Relaxed), + self.total_add_actions_counter.load(Ordering::Relaxed), + )?; + + let last_checkpoint_path = self + .snapshot + .log_segment() + .log_root + .join(LAST_CHECKPOINT_FILE_NAME)?; + + engine.json_handler().write_json_file( + &last_checkpoint_path, + Box::new(std::iter::once(Ok(checkpoint_metadata))), + true, // overwrite the last checkpoint file + )?; + + Ok(()) } /// Creates the checkpoint metadata action for V2 checkpoints. @@ -264,8 +299,8 @@ impl CheckpointWriter { /// # Implementation Details /// /// The function creates a single-row [`EngineData`] batch containing only the - /// version field of the `CheckpointMetadata` action. Future implementations will - /// include additional metadata fields such as tags when map support is added. + /// version field of the [`CheckpointMetadata`] action. Future implementations will + /// include the additional metadata field `tags` when map support is added. /// /// The resulting [`CheckpointData`] includes a selection vector with a single `true` /// value, indicating this action should always be included in the checkpoint. @@ -279,11 +314,11 @@ impl CheckpointWriter { return Ok(None); } let values: &[Scalar] = &[version.into()]; - // Create the nested schema structure for `CheckpointMetadata` + // Create the nested schema structure for [`CheckpointMetadata`] // Note: We cannot use `CheckpointMetadata::to_schema()` as it would include // the 'tags' field which we're not supporting yet due to the lack of map support. let schema = Arc::new(StructType::new([StructField::not_null( - "checkpointMetadata", + CHECKPOINT_METADATA_NAME, DataType::struct_type([StructField::not_null("version", DataType::LONG)]), )])); @@ -306,10 +341,10 @@ impl CheckpointWriter { /// /// This function determines the minimum timestamp before which deleted files /// will be permanently removed during VACUUM operations, based on the table's - /// deleted_file_retention_duration property. + /// `deleted_file_retention_duration` property. /// /// Returns the cutoff timestamp in milliseconds since epoch, matching - /// the remove action's deletion_timestamp format for comparison. + /// the remove action's `deletion_timestamp` field format for comparison. /// /// The default retention period is 7 days, matching delta-spark's behavior. fn deleted_file_retention_timestamp(&self) -> DeltaResult { @@ -327,7 +362,16 @@ impl CheckpointWriter { } } -/// Internal implementation with injectable time parameter for testing +/// Calculates the timestamp threshold for deleted file retention based on the provided duration. +/// This is factored out to allow testing with an injectable time and duration parameter. +/// +/// # Parameters +/// - `retention_duration`: The duration to retain deleted files. The table property +/// `deleted_file_retention_duration` is passed here. If `None`, defaults to 7 days. +/// - `now_duration`: The current time as a [`Duration`]. This allows for testing with +/// a specific time instead of using `SystemTime::now()`. +/// +/// # Returns: The timestamp in milliseconds since epoch fn deleted_file_retention_timestamp_with_time( retention_duration: Option, now_duration: Duration, @@ -351,11 +395,73 @@ fn deleted_file_retention_timestamp_with_time( Ok(now_ms - retention_ms) } +/// Creates the data as [`EngineData`] to be written to the `_last_checkpoint` file. +/// +/// This method validates the schema of the engine-provided metadata which should +/// contain a single row with the single column `sizeInBytes` (i64). It then transforms the +/// metadata to include the additional fields that are part of the `_last_checkpoint` file schema. +/// The `checkpointSchema` and `checksum` fields are also part of the `_last_checkpoint`` file +/// schema but are not yet implemented. They will be added in future versions. +fn create_last_checkpoint_data( + engine: &dyn Engine, + metadata: &dyn EngineData, + version: i64, + total_actions_counter: i64, + total_add_actions_counter: i64, +) -> DeltaResult> { + // Validate metadata has exactly one row + if metadata.len() != 1 { + return Err(Error::Generic(format!( + "Engine checkpoint metadata should have exactly one row, found {}", + metadata.len() + ))); + } + + // The current checkpoint API only supports single-file checkpoints. + let parts: i64 = 1; // Coerce the type to `i64`` to match the expected schema. + let last_checkpoint_exprs = [ + Expression::literal(version), + Expression::literal(total_actions_counter), + Expression::literal(parts), + column_expr!("sizeInBytes"), + Expression::literal(total_add_actions_counter), + // TODO(seb): Write the `checkpoint_schema` field + // TODO(seb): Write the `checksum` field + ]; + let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); + + // Note: We cannot use `LastCheckpointInfo::to_schema()` as it would include + // the 'checkpoint_schema' field, which is only known at runtime. + let last_checkpoint_schema = Arc::new(StructType::new([ + StructField::not_null("version", DataType::LONG), + StructField::not_null("size", DataType::LONG), + StructField::nullable("parts", DataType::LONG), + StructField::nullable("sizeInBytes", DataType::LONG), + StructField::nullable("numOfAddFiles", DataType::LONG), + ])); + + // The schema of the metadata passed to `.finalize()` should be a single-row, single-column batch + let engine_metadata_schema = Arc::new(StructType::new([StructField::not_null( + "version", + DataType::LONG, + )])); + + let last_checkpoint_metadata_evaluator = engine.evaluation_handler().new_expression_evaluator( + engine_metadata_schema.into(), + last_checkpoint_expr, + last_checkpoint_schema.into(), + ); + + last_checkpoint_metadata_evaluator.evaluate(metadata) +} + #[cfg(test)] mod unit_tests { use super::*; + use crate::arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema}; use crate::engine::{arrow_data::ArrowEngineData, sync::SyncEngine}; use crate::Table; + use arrow_53::array::Int64Array; use arrow_53::{array::RecordBatch, datatypes::Field}; use delta_kernel::arrow::array::create_array; use std::path::PathBuf; @@ -412,12 +518,8 @@ mod unit_tests { assert_eq!(checkpoint_data.selection_vector, vec![true]); // Verify the underlying EngineData contains the expected CheckpointMetadata action - let record_batch = checkpoint_data - .data - .any_ref() - .downcast_ref::() - .unwrap() - .record_batch(); + let arrow_engine_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?; + let record_batch = arrow_engine_data.record_batch(); // Build the expected RecordBatch // Note: The schema is a struct with a single field "checkpointMetadata" of type struct @@ -437,8 +539,6 @@ mod unit_tests { .unwrap(); assert_eq!(*record_batch, expected); - - // Verify counter was incremented assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 1); Ok(()) @@ -455,10 +555,79 @@ mod unit_tests { // No checkpoint metadata action should be created for V1 checkpoints assert!(result.is_none()); - - // Verify counter was not incremented assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 0); Ok(()) } + + #[test] + fn test_create_last_checkpoint_metadata() -> DeltaResult<()> { + // Setup test data + let size_in_bytes: i64 = 1024 * 1024; // 1MB + let version = 10; + let total_actions_counter = 100; + let total_add_actions_counter = 75; + let engine = SyncEngine::new(); + + // Create engine metadata with `size_in_bytes` + let schema = ArrowSchema::new(vec![Field::new("sizeInBytes", ArrowDataType::Int64, false)]); + let size_array = Int64Array::from(vec![size_in_bytes]); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; + let metadata = ArrowEngineData::new(record_batch); + + // Create last checkpoint metadata + let last_checkpoint_batch = create_last_checkpoint_data( + &engine, + &metadata, + version, + total_actions_counter, + total_add_actions_counter, + )?; + + // Verify the underlying EngineData contains the expected LastCheckpointInfo schema and data + let arrow_engine_data = ArrowEngineData::try_from_engine_data(last_checkpoint_batch)?; + let record_batch = arrow_engine_data.record_batch(); + + // Build the expected RecordBatch + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("version", DataType::Int64, false), + Field::new("size", DataType::Int64, false), + Field::new("parts", DataType::Int64, true), + Field::new("sizeInBytes", DataType::Int64, true), + Field::new("numOfAddFiles", DataType::Int64, true), + ])); + let expected = RecordBatch::try_new( + expected_schema, + vec![ + create_array!(Int64, [version]), + create_array!(Int64, [total_actions_counter]), + create_array!(Int64, [1]), + create_array!(Int64, [size_in_bytes]), + create_array!(Int64, [total_add_actions_counter]), + ], + ) + .unwrap(); + + assert_eq!(*record_batch, expected); + Ok(()) + } + + #[test] + fn test_create_last_checkpoint_metadata_with_invalid_batch() -> DeltaResult<()> { + let engine = SyncEngine::new(); + + // Create engine metadata with the wrong schema + let schema = ArrowSchema::new(vec![Field::new("wrongField", ArrowDataType::Int64, false)]); + let size_array = Int64Array::from(vec![0]); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)]) + .expect("Failed to create record batch"); + let metadata = Box::new(ArrowEngineData::new(record_batch)); + + // This should fail because the schema does not match the expected schema + let res = create_last_checkpoint_data(&engine, &*metadata, 0, 0, 0); + + // Verify that an error is returned + assert!(res.is_err()); + Ok(()) + } } diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index a600442dc9..130be9c4a0 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,14 +1,21 @@ -use object_store::{memory::InMemory, path::Path, ObjectStore}; -use std::sync::Arc; -use test_utils::delta_path_for_version; -use url::Url; - use crate::{ actions::{Add, Metadata, Protocol, Remove}, - engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + engine::{ + arrow_data::ArrowEngineData, + default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + }, utils::test_utils::Action, - DeltaResult, Table, + DeltaResult, EngineData, Table, +}; +use arrow_53::{ + array::{Int64Array, RecordBatch}, + datatypes::{DataType, Field, Schema}, }; +use object_store::{memory::InMemory, path::Path, ObjectStore}; +use serde_json::{from_slice, json, Value}; +use std::sync::Arc; +use test_utils::delta_path_for_version; +use url::Url; /// TODO(seb): Merge copies and move to `test_utils` /// Create an in-memory store and return the store and the URL for the store's _delta_log directory. @@ -45,6 +52,45 @@ fn write_commit_to_store( Ok(()) } +/// Creates a metadata batch with size information for checkpoint +fn create_checkpoint_metadata_batch(size_in_bytes: i64) -> DeltaResult { + let schema = Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)]); + let size_array = Int64Array::from(vec![size_in_bytes]); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; + Ok(ArrowEngineData::new(record_batch)) +} + +/// Reads the `_last_checkpoint.json` file from storage +fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { + let path = Path::from("_delta_log/_last_checkpoint.json"); + let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); + let byte_data = rt.block_on(async { + let data = store.get(&path).await?; + data.bytes().await + })?; + Ok(from_slice(&byte_data)?) +} + +/// Helper to verify the contents of the `_last_checkpoint` file +fn assert_last_checkpoint_contents( + store: &Arc, + expected_version: u64, + expected_size: u64, + expected_num_add_files: u64, + expected_size_in_bytes: i64, +) -> DeltaResult<()> { + let last_checkpoint_data = read_last_checkpoint_file(store)?; + let expected_data = json!({ + "version": expected_version, + "size": expected_size, + "parts": 1, + "sizeInBytes": expected_size_in_bytes, + "numOfAddFiles": expected_num_add_files, + }); + assert_eq!(last_checkpoint_data, expected_data); + Ok(()) +} + /// Tests the `checkpoint()` API with: /// - A table that does not support v2Checkpoint /// - No version specified (latest version is used) @@ -110,9 +156,6 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? ); - // Recall that the batches of actions are returned in reverse order, with the - // most recent actions first. - // The first batch should be the metadata and protocol actions. let checkpoint_data = data_iter.next().unwrap()?; assert_eq!(checkpoint_data.selection_vector, [true, true]); @@ -125,11 +168,16 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { // contain any true values, as the add action is removed in a following commit. assert!(data_iter.next().is_none()); - // Verify the collected metadata - // 2 actions (metadata, protocol) + 1 add action + 1 remove action (last action is reconciled) - assert_eq!(*writer.total_actions_counter.borrow(), 4); - // 1 add action - assert_eq!(*writer.total_add_actions_counter.borrow(), 1); + // Finalize and verify checkpoint metadata + let size_in_bytes = 10; + writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + assert_last_checkpoint_contents( + &store, + 2, // version: latest/last version in the log + 4, // size: 1 metadata + 1 protocol + 1 add action + 1 remove action + 1, // numOfAddFiles: from the 2nd commit (fake_path_2) + size_in_bytes, // sizeInBytes: passed to finalize (10) + )?; Ok(()) } @@ -195,11 +243,16 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { // No more data should exist because we only requested version 0 assert!(data_iter.next().is_none()); - // Verify the collected metadata - // 2 actions (metadata and protocol) + 2 add actions - assert_eq!(*writer.total_actions_counter.borrow(), 2); - // 2 add actions - assert_eq!(*writer.total_add_actions_counter.borrow(), 0); + // Finalize and verify + let size_in_bytes = 10; + writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + assert_last_checkpoint_contents( + &store, + 0, // version: specified version (0) + 2, // size: 1 protocol + 1 metadata from version 0 + 0, // numOfAddFiles: no add files in version 0 + size_in_bytes, // sizeInBytes: passed to finalize (10) + )?; Ok(()) } @@ -272,11 +325,16 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { // No more data should exist assert!(data_iter.next().is_none()); - // Verify the collected metadata - // 3 actions (metadata, protocol, and checkpointMetadata) + 1 add action + 1 remove action - assert_eq!(*writer.total_actions_counter.borrow(), 5); - // 2 add actions - assert_eq!(*writer.total_add_actions_counter.borrow(), 1); + // Finalize and verify + let size_in_bytes = 10; + writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + assert_last_checkpoint_contents( + &store, + 1, // version: latest version (1) with v2Checkpoint support + 5, // size: 1 metadata + 1 protocol + 1 add + 1 remove + 1 checkpointMetadata + 1, // numOfAddFiles: 1 add file from version 0 + size_in_bytes, // sizeInBytes: passed to finalize (10) + )?; Ok(()) } @@ -308,7 +366,7 @@ fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { // Should fail with an appropriate error // Returns error: "LogSegment end version 0 not the same as the specified end version 999" - // TODO(seb): Update the error message to be tailored to the checkpoint creation + // TODO(seb): Returned error should be tailored to checkpoint creation assert!(result.is_err()); Ok(()) From ab0a37372ffc97b05d242c09151167bff105d364 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 10 Apr 2025 14:25:36 -0700 Subject: [PATCH 118/176] docs --- kernel/src/checkpoint/log_replay.rs | 81 +++++++++++------------------ kernel/src/engine_data.rs | 8 +++ 2 files changed, 38 insertions(+), 51 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index a5cb4b785a..58c280ba71 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -26,40 +26,17 @@ //! 1. Creates a visitor with the current deduplication state //! 2. Applies the visitor to filter actions in the batch //! 3. Updates counters and state for cross-batch deduplication -//! 4. Produces a [`CheckpointData`] result which includes a selection vector indicating which +//! 4. Produces a [`FilteredEngineData`] result which includes a selection vector indicating which //! actions should be included in the checkpoint file -use std::collections::HashSet; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::{Arc, LazyLock}; - -use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::{ - FileActionDeduplicator, FileActionKey, HasSelectionVector, LogReplayProcessor, -}; +use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::scan::data_skipping::DataSkippingFilter; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; - -/// TODO!(seb): Replace `CheckpointData` with `FilteredEngineData` when available -/// -/// [`CheckpointData`] represents a batch of actions filtered for checkpoint creation. -/// It wraps a single engine data batch and a corresponding selection vector indicating -/// which rows should be written to the checkpoint file. -pub(crate) struct CheckpointData { - /// The original engine data containing the actions - #[allow(dead_code)] // TODO: Remove once checkpoint_v1 API is implemented - data: Box, - /// Boolean vector indicating which rows should be included in the checkpoint - selection_vector: Vec, -} - -impl HasSelectionVector for CheckpointData { - /// Returns true if any row in the selection vector is marked as selected - fn has_selected_rows(&self) -> bool { - self.selection_vector.contains(&true) - } -} +use std::collections::HashSet; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; /// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. @@ -71,13 +48,13 @@ pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - // Arc provides shared mutability for our counters, allowing both the // iterator to update the values during processing and the caller to observe the final - // counts afterward. + // counts afterward. The counters are i64 to match the `_last_checkpoint` file schema. + // Tracks the total number of actions included in the checkpoint file. total_actions: Arc, + // Tracks the total number of add actions included in the checkpoint file. total_add_actions: Arc, - /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, /// Indicates whether a metadata action has been seen in the log. @@ -89,7 +66,7 @@ pub(crate) struct CheckpointLogReplayProcessor { } impl LogReplayProcessor for CheckpointLogReplayProcessor { - type Output = CheckpointData; + type Output = FilteredEngineData; /// This function is applied to each batch of actions read from the log during /// log replay in reverse chronological order (from most recent to least recent), @@ -127,8 +104,9 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Update the total actions and add actions counters. Relaxed ordering is - // sufficient here as we only care about the total count and not the order of updates. + // Update the total actions and add actions counters. Relaxed ordering is sufficient + // here as we only care about the total count when writing the _last_checkpoint file. + // (the ordering is not important for correctness) self.total_actions.fetch_add( visitor.file_actions_count + visitor.non_file_actions_count, Ordering::Relaxed, @@ -140,7 +118,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { self.seen_protocol = visitor.seen_protocol; self.seen_metadata = visitor.seen_metadata; - Ok(CheckpointData { + Ok(FilteredEngineData { data: batch, selection_vector: visitor.selection_vector, }) @@ -154,14 +132,14 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions: Arc, + total_add_actions: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { seen_file_keys: Default::default(), - total_actions: total_actions_counter, - total_add_actions: total_add_actions_counter, + total_actions: total_actions, + total_add_actions: total_add_actions, seen_protocol: false, seen_metadata: false, seen_txns: Default::default(), @@ -181,13 +159,13 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>>, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions: Arc, + total_add_actions: Arc, minimum_file_retention_timestamp: i64, -) -> impl Iterator> { +) -> impl Iterator> { CheckpointLogReplayProcessor::new( - total_actions_counter, - total_add_actions_counter, + total_actions, + total_add_actions, minimum_file_retention_timestamp, ) .process_actions_iter(action_iter) @@ -740,6 +718,7 @@ mod tests { Ok(()) } + /// Tests the [`CheckpointLogReplayProcessor`] by applying the processor across /// multiple batches of actions. This test ensures that the processor correctly saves state /// in order to deduplicate actions across batches. More granular tests for the @@ -747,8 +726,8 @@ mod tests { #[test] fn test_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicI64::new(0)); - let total_add_actions_counter = Arc::new(AtomicI64::new(0)); + let total_actions = Arc::new(AtomicI64::new(0)); + let total_add_actions = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ @@ -785,8 +764,8 @@ mod tests { let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), - total_actions_counter.clone(), - total_add_actions_counter.clone(), + total_actions.clone(), + total_add_actions.clone(), 0, ) .try_collect()?; @@ -807,9 +786,9 @@ mod tests { vec![false, false, false, true, false, true] ); // 6 total actions (5 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions_counter.load(Ordering::Relaxed), 7); + assert_eq!(total_actions.load(Ordering::Relaxed), 7); // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + assert_eq!(total_add_actions.load(Ordering::Relaxed), 3); Ok(()) } diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index 54cce0e260..44ada91e78 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -1,5 +1,6 @@ //! Traits that engines need to implement in order to pass data between themselves and kernel. +use crate::log_replay::HasSelectionVector; use crate::schema::{ColumnName, DataType}; use crate::{AsAny, DeltaResult, Error}; @@ -20,6 +21,13 @@ pub struct FilteredEngineData { pub selection_vector: Vec, } +impl HasSelectionVector for FilteredEngineData { + /// Returns true if any row in the selection vector is marked as selected + fn has_selected_rows(&self) -> bool { + self.selection_vector.contains(&true) + } +} + /// a trait that an engine exposes to give access to a list pub trait EngineList { /// Return the length of the list at the specified row_index in the raw data From 9a9697a8105ff5c44ffca882ab9e8f1f326dd88a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 10 Apr 2025 16:01:15 -0700 Subject: [PATCH 119/176] test coverage --- kernel/src/checkpoint/log_replay.rs | 226 ++++++++++++++++++---------- 1 file changed, 147 insertions(+), 79 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 58c280ba71..2da323f0d5 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -54,7 +54,7 @@ pub(crate) struct CheckpointLogReplayProcessor { // Tracks the total number of actions included in the checkpoint file. total_actions: Arc, // Tracks the total number of add actions included in the checkpoint file. - total_add_actions: Arc, + add_actions_count: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, /// Indicates whether a metadata action has been seen in the log. @@ -111,7 +111,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { visitor.file_actions_count + visitor.non_file_actions_count, Ordering::Relaxed, ); - self.total_add_actions + self.add_actions_count .fetch_add(visitor.add_actions_count, Ordering::Relaxed); // Update protocol and metadata seen flags @@ -133,13 +133,13 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { impl CheckpointLogReplayProcessor { pub(crate) fn new( total_actions: Arc, - total_add_actions: Arc, + add_actions_count: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { seen_file_keys: Default::default(), - total_actions: total_actions, - total_add_actions: total_add_actions, + total_actions, + add_actions_count, seen_protocol: false, seen_metadata: false, seen_txns: Default::default(), @@ -160,12 +160,12 @@ impl CheckpointLogReplayProcessor { pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>>, total_actions: Arc, - total_add_actions: Arc, + add_actions_count: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> { CheckpointLogReplayProcessor::new( total_actions, - total_add_actions, + add_actions_count, minimum_file_retention_timestamp, ) .process_actions_iter(action_iter) @@ -495,6 +495,33 @@ mod tests { use itertools::Itertools; use std::collections::HashSet; + /// Helper function to create test batches from JSON strings + fn create_batch(json_strings: Vec<&str>) -> DeltaResult<(Box, bool)> { + Ok((parse_json_batch(StringArray::from(json_strings)), true)) + } + + /// Helper function which applies the `checkpoint_actions_iter` function to a set of + /// input batches and returns the results. + fn run_checkpoint_test( + input_batches: Vec<(Box, bool)>, + ) -> DeltaResult<(Vec, i64, i64)> { + let total_actions = Arc::new(AtomicI64::new(0)); + let add_actions_count = Arc::new(AtomicI64::new(0)); + let results: Vec<_> = checkpoint_actions_iter( + input_batches.into_iter().map(Ok), + total_actions.clone(), + add_actions_count.clone(), + 0, + ) + .try_collect()?; + + Ok(( + results, + total_actions.load(Ordering::Relaxed), + add_actions_count.load(Ordering::Relaxed), + )) + } + #[test] fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); @@ -610,14 +637,14 @@ mod tests { } #[test] - fn test_checkpoint_visitor_conflicts_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_file_actions_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ // Add action for file1 with deletion vector - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Remove action for file1 with a different deletion vector - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Add action for file1 with the same deletion vector as the remove action above (excluded) - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"ONE","pathOrInlineDv":"dv1","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Remove action for file1 with a different deletion vector + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"TWO","pathOrInlineDv":"dv2","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Remove action for file1 with another different deletion vector + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"THREE","pathOrInlineDv":"dv3","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, ] .into(); let batch = parse_json_batch(json_strings); @@ -636,9 +663,9 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, true, false]; + let expected = vec![true, true, true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.file_actions_count, 2); + assert_eq!(visitor.file_actions_count, 3); assert_eq!(visitor.add_actions_count, 1); assert_eq!(visitor.non_file_actions_count, 0); @@ -719,76 +746,117 @@ mod tests { Ok(()) } - /// Tests the [`CheckpointLogReplayProcessor`] by applying the processor across - /// multiple batches of actions. This test ensures that the processor correctly saves state - /// in order to deduplicate actions across batches. More granular tests for the - /// [`CheckpointVisitor`] are in the above `test_checkpoint_visitor` tests. + /// This test ensures that the processor correctly deduplicates and filters + /// non-file actions (metadata, protocol, txn) across multiple batches. #[test] - fn test_checkpoint_actions_iter_multi_batch_test() -> DeltaResult<()> { - // Setup counters - let total_actions = Arc::new(AtomicI64::new(0)); - let total_add_actions = Arc::new(AtomicI64::new(0)); - - // Create first batch with protocol, metadata, and some files - let json_strings1: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, - r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - ].into(); - // Create second batch with some duplicates and new files - let json_strings2: StringArray = vec![ - // Protocol, metadata, txn should be skipped as duplicates - r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, - r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - // New file - r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, - // Duplicate file should be skipped - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction - // Unique transaction (appId) should be included - r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"# - ].into(); - // Create third batch with all duplicate actions. - // The *entire* batch should be skippped as there are no selected actions to write from this batch. - let json_strings3: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, - ].into(); + fn test_checkpoint_actions_iter_non_file_actions() -> DeltaResult<()> { + // Batch 1: protocol, metadata, and txn actions + let batch1 = vec![ + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ]; + + // Batch 2: duplicate actions, and a new txn action + let batch2 = vec![ + // Duplicates that should be skipped + r#"{"protocol":{"minReaderVersion":2,"minWriterVersion":3}}"#, + r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + // Unique transaction (appId) should be included + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, + ]; + + // Batch 3: a duplicate action (entire batch should be skipped) + let batch3 = vec![r#"{"protocol":{"minReaderVersion":2,"minWriterVersion":3}}"#]; + let input_batches = vec![ - Ok((parse_json_batch(json_strings1), true)), // true = commit batch - Ok((parse_json_batch(json_strings2), true)), - Ok((parse_json_batch(json_strings3), true)), + create_batch(batch1)?, + create_batch(batch2)?, + create_batch(batch3)?, ]; + let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; - let results: Vec<_> = checkpoint_actions_iter( - input_batches.into_iter(), - total_actions.clone(), - total_add_actions.clone(), - 0, - ) - .try_collect()?; + // Verify results + assert_eq!(results.len(), 2, "Expected two batches in results"); + assert_eq!(results[0].selection_vector, vec![true, true, true],); + assert_eq!(results[1].selection_vector, vec![false, false, false, true],); + assert_eq!(total_actions, 4); + assert_eq!(add_actions, 0); - // Expect two batches in results (third batch should be filtered out) - assert_eq!(results.len(), 2); + Ok(()) + } - // First batch should have all rows selected - let checkpoint_data = &results[0]; - assert_eq!( - checkpoint_data.selection_vector, - vec![true, true, true, true, true] - ); - // Second batch should have only new file and unique transaction selected - let checkpoint_data = &results[1]; - assert_eq!( - checkpoint_data.selection_vector, - vec![false, false, false, true, false, true] - ); - // 6 total actions (5 from batch1 + 2 from batch2 + 0 from batch3) - assert_eq!(total_actions.load(Ordering::Relaxed), 7); - // 3 add actions (2 from batch1 + 1 from batch2) - assert_eq!(total_add_actions.load(Ordering::Relaxed), 3); + /// This test ensures that the processor correctly deduplicates and filters + /// file actions (add, remove) across multiple batches. + #[test] + fn test_checkpoint_actions_iter_file_actions() -> DeltaResult<()> { + // Batch 1: add action (file1) - new, should be included + let batch1 = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ]; + + // Batch 2: remove actions - mixed inclusion + let batch2 = vec![ + // Already seen file, should be excluded + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + // New file, should be included + r#"{"remove":{"path":"file2","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ]; + + // Batch 3: add action (file2) - already seen, should be excluded + let batch3 = vec![ + r#"{"add":{"path":"file2","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ]; + + let input_batches = vec![ + create_batch(batch1)?, + create_batch(batch2)?, + create_batch(batch3)?, + ]; + let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; + + // Verify results + assert_eq!(results.len(), 2); // The third batch should be filtered out since there are no selected actions + assert_eq!(results[0].selection_vector, vec![true]); + assert_eq!(results[1].selection_vector, vec![false, true]); + assert_eq!(total_actions, 2); + assert_eq!(add_actions, 1); + + Ok(()) + } + + /// This test ensures that the processor correctly deduplicates and filters + /// file actions (add, remove) with deletion vectors across multiple batches. + #[test] + fn test_checkpoint_actions_iter_file_actions_with_deletion_vectors() -> DeltaResult<()> { + // Batch 1: add actions with deletion vectors + let batch1 = vec![ + // (file1, DV_ONE) New, should be included + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"ONE","pathOrInlineDv":"dv1","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // (file1, DV_TWO) New, should be included + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"TWO","pathOrInlineDv":"dv2","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ]; + + // Batch 2: mixed actions with duplicate and new entries + let batch2 = vec![ + // (file1, DV_ONE): Already seen, should be excluded + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"deletionVector":{"storageType":"ONE","pathOrInlineDv":"dv1","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // (file1, DV_TWO): Already seen, should be excluded + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"TWO","pathOrInlineDv":"dv2","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // New file, should be included + r#"{"remove":{"path":"file2","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ]; + + let input_batches = vec![create_batch(batch1)?, create_batch(batch2)?]; + let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; + + // Verify results + assert_eq!(results.len(), 2); + assert_eq!(results[0].selection_vector, vec![true, true]); + assert_eq!(results[1].selection_vector, vec![false, false, true]); + assert_eq!(total_actions, 3); + assert_eq!(add_actions, 2); Ok(()) } From c7630a3c2e497fcafa2a1d163d632879866dbdf8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 10 Apr 2025 16:19:03 -0700 Subject: [PATCH 120/176] doc --- kernel/src/checkpoint/log_replay.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 2da323f0d5..eeaa278bd7 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -68,18 +68,13 @@ pub(crate) struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { type Output = FilteredEngineData; - /// This function is applied to each batch of actions read from the log during - /// log replay in reverse chronological order (from most recent to least recent), - /// and performs the necessary filtering and deduplication to produce the minimal - /// set of actions to be written to the checkpoint file. + /// Processes a batch of actions read from the log during reverse chronological replay + /// and returns a filtered batch ([`FilteredEngineData`]) to be included in the checkpoint. /// - /// # Filtering Rules - /// - /// 1. Only the most recent protocol and metadata actions are included - /// 2. For each app ID, only the most recent transaction action is included - /// 3. Add and remove actions are deduplicated based on path and unique ID - /// 4. Remove tombstones older than `minimum_file_retention_timestamp` are excluded - /// 5. Sidecar, commitInfo, and CDC actions are excluded + /// This method delegates the filtering logic to the [`CheckpointVisitor`], which implements + /// the deduplication rules described in the module documentation. The method tracks + /// statistics about processed actions (total count, add actions count) and maintains + /// state for cross-batch deduplication. fn process_actions_batch( &mut self, batch: Box, From c58074beb5774064e257780cc9ddc4769c3f66c4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 11:09:46 -0700 Subject: [PATCH 121/176] fix merge --- kernel/src/checkpoint/mod.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index c824ac54cd..fcfc7c447b 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -18,7 +18,7 @@ //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`Table::checkpoint()`] method and provides the following APIs: //! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot -//! - `checkpoint_data(engine: &dyn Engine) -> DeltaResult` - +//! - `checkpoint_data(engine: &dyn Engine) -> DeltaResult` - //! Returns the checkpoint data and path information //! - `finalize(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - //! Writes the _last_checkpoint file after the checkpoint data has been written @@ -73,6 +73,7 @@ use crate::actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; +use crate::engine_data::FilteredEngineData; use crate::expressions::{column_expr, Scalar}; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; @@ -80,7 +81,7 @@ use crate::snapshot::Snapshot; #[cfg(doc)] use crate::{actions::CheckpointMetadata, table::Table}; use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, Expression}; -use log_replay::{checkpoint_actions_iter, CheckpointData}; +use log_replay::checkpoint_actions_iter; use std::sync::atomic::{AtomicI64, Ordering}; use std::{ sync::{Arc, LazyLock}, @@ -113,14 +114,12 @@ fn get_checkpoint_actions_schema() -> &'static SchemaRef { } /// Represents a single-file checkpoint, including the data to write and the target path. -/// -/// TODO(seb): Rename to `CheckpointData` once `FilteredEngineData` is introduced. -pub struct SingleFileCheckpointData { +pub struct CheckpointData { /// The URL where the checkpoint file should be written. pub path: Url, /// An iterator over the checkpoint data to be written to the file. - pub data: Box>>, + pub data: Box>>, } /// Manages the checkpoint writing process for Delta tables @@ -131,7 +130,7 @@ pub struct SingleFileCheckpointData { /// /// # Usage /// 1. Create via [`Table::checkpoint()`] -/// 2. Call [`CheckpointWriter::checkpoint_data()`] to obtain [`SingleFileCheckpointData`], +/// 2. Call [`CheckpointWriter::checkpoint_data()`] to obtain [`CheckpointData`], /// containing the checkpoint path and data iterator /// 3. Write the checkpoint data to storage (implementation-specific) /// 4. Call [`CheckpointWriter::finalize()`] to create the _last_checkpoint file @@ -181,11 +180,8 @@ impl CheckpointWriter { /// before calling `finalize()` otherwise data loss may occur. /// /// # Returns - /// A [`SingleFileCheckpointData`] containing the checkpoint path and action iterator - pub fn checkpoint_data( - &mut self, - engine: &dyn Engine, - ) -> DeltaResult { + /// A [`CheckpointData`] containing the checkpoint path and action iterator + pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { if self.data_consumed { return Err(Error::generic("Checkpoint data has already been consumed")); } @@ -225,7 +221,7 @@ impl CheckpointWriter { self.data_consumed = true; - Ok(SingleFileCheckpointData { + Ok(CheckpointData { path: checkpoint_path.location, data: Box::new(chained), }) @@ -302,14 +298,14 @@ impl CheckpointWriter { /// version field of the [`CheckpointMetadata`] action. Future implementations will /// include the additional metadata field `tags` when map support is added. /// - /// The resulting [`CheckpointData`] includes a selection vector with a single `true` + /// The resulting [`FilteredEngineData`] includes a selection vector with a single `true` /// value, indicating this action should always be included in the checkpoint. fn create_checkpoint_metadata_batch( &self, version: i64, engine: &dyn Engine, is_v2_checkpoint: bool, - ) -> DeltaResult>> { + ) -> DeltaResult>> { if !is_v2_checkpoint { return Ok(None); } @@ -324,7 +320,7 @@ impl CheckpointWriter { let checkpoint_metadata_batch = engine.evaluation_handler().create_one(schema, values)?; - let result = CheckpointData { + let result = FilteredEngineData { data: checkpoint_metadata_batch, selection_vector: vec![true], // Always include this action }; From 78fab5fcbe5b61840fd971b6b10fb8aa9e390899 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 12:02:01 -0700 Subject: [PATCH 122/176] docs --- kernel/src/checkpoint/mod.rs | 190 ++++++++++++++++------------------- 1 file changed, 88 insertions(+), 102 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index fcfc7c447b..060e499729 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -6,42 +6,34 @@ //! //! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support the //! `v2Checkpoints` reader/writer feature. These checkpoints follow the V1 specification and do not -//! include a CheckpointMetadata action. +//! include a [`CheckpointMetadata`] action. //! 2. **Single-file Classic-named V2 Checkpoint** – for tables supporting the `v2Checkpoints` feature. -//! These checkpoints follow the V2 specification and include a CheckpointMetadata action, while -//! maintaining backwards compatibility by using classic naming that legacy readers can recognize. +//! These checkpoints follow the V2 specification and include a [`CheckpointMetadata`] action, while +//! maintaining backwards compatibility by using classic-naming that legacy readers can recognize. //! //! For more information on the V1/V2 specifications, see the following protocol section: //! //! -//! ## [`CheckpointWriter`] -//! Handles the actual checkpoint data generation and writing process. It is created via the -//! [`Table::checkpoint()`] method and provides the following APIs: -//! - `new(snapshot: Snapshot) -> Self` - Creates a new writer for the given table snapshot -//! - `checkpoint_data(engine: &dyn Engine) -> DeltaResult` - -//! Returns the checkpoint data and path information -//! - `finalize(engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()>` - -//! Writes the _last_checkpoint file after the checkpoint data has been written -//! -//! ## Checkpoint Type Selection +//! ## Checkpoint Selection Logic //! The checkpoint type is determined by whether the table supports the `v2Checkpoints` reader/writer feature: //! -//! ```text -//! +------------------+-------------------------------+ //! | Table Feature | Resulting Checkpoint Type | -//! +==================+===============================+ +//! |------------------|-------------------------------| //! | No v2Checkpoints | Single-file Classic-named V1 | -//! +------------------+-------------------------------+ //! | v2Checkpoints | Single-file Classic-named V2 | -//! +------------------+-------------------------------+ -//! ``` //! -//! Notes: -//! - Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be -//! implemented in the future. The current implementation only supports classic-named V2 checkpoints. -//! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future -//! multi-file support, but the current implementation only supports single-file checkpoints. -//! - Multi-file V1 checkpoints are DEPRECATED and UNSAFE. +//! ## Architecture +//! +//! - [`CheckpointWriter`] - Core component that manages checkpoint creation workflow +//! - [`CheckpointData`] - Contains the data to write and destination path information +//! - [`crate::log_replay`] submodule - Handles action filtering and deduplication +//! +//! ## [`CheckpointWriter`] +//! Handles the actual checkpoint data generation and writing process. It is created via the +//! [`crate::table::Table::checkpoint`] method and provides the following APIs: +//! - [`CheckpointWriter::new`] - Creates a new writer for the given table snapshot +//! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information +//! - [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file //! //! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! @@ -57,17 +49,24 @@ //! // Retrieve checkpoint data //! let checkpoint_data = writer.checkpoint_data()?; //! -//! /* Write checkpoint data to file and collect metadata about the write */ -//! /* The implementation of the write is storage-specific and not shown */ +//! // Write checkpoint data to storage (implementation-specific) +//! let metadata = your_storage_implementation.write_checkpoint( +//! &checkpoint_data.path, +//! checkpoint_data.data +//! )?; +//! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! //! // Finalize the checkpoint by writing the _last_checkpoint file -//! writer.finalize(&engine, &checkpoint_metadata)?; +//! writer.finalize(&engine, &metadata)?; //! ``` +//! ## Future extensions +//! - Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be +//! implemented in the future. The current implementation only supports classic-named V2 checkpoints. +//! - Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future +//! multi-file support, but the current implementation only supports single-file checkpoints. //! -//! This module, along with its submodule `checkpoint/log_replay.rs`, provides the full -//! API and implementation for generating checkpoints. See `checkpoint/log_replay.rs` for details -//! on how log replay is used to filter and deduplicate actions for checkpoint creation. +//! Note: Multi-file V1 checkpoints are DEPRECATED and UNSAFE. use crate::actions::CHECKPOINT_METADATA_NAME; use crate::actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, @@ -77,6 +76,8 @@ use crate::engine_data::FilteredEngineData; use crate::expressions::{column_expr, Scalar}; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; +#[cfg(doc)] +use crate::snapshot::LastCheckpointHint; use crate::snapshot::Snapshot; #[cfg(doc)] use crate::{actions::CheckpointMetadata, table::Table}; @@ -88,11 +89,13 @@ use std::{ time::{Duration, SystemTime, UNIX_EPOCH}, }; use url::Url; - mod log_replay; #[cfg(test)] mod tests; +/// Name of the _last_checkpoint file that provides metadata about the last checkpoint +/// created for the table. This file is used as a hint for the engine to quickly locate +/// the last checkpoint and avoid full log replay when reading the table. static LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint.json"; /// Schema for extracting relevant actions from log files for checkpoint creation @@ -122,36 +125,20 @@ pub struct CheckpointData { pub data: Box>>, } -/// Manages the checkpoint writing process for Delta tables +/// Manages the checkpoint writing process for tables /// -/// The [`CheckpointWriter`] orchestrates creating checkpoint data and finalizing -/// the checkpoint file. It tracks statistics about included actions and -/// ensures checkpoint data is consumed only once. -/// -/// # Usage -/// 1. Create via [`Table::checkpoint()`] -/// 2. Call [`CheckpointWriter::checkpoint_data()`] to obtain [`CheckpointData`], -/// containing the checkpoint path and data iterator -/// 3. Write the checkpoint data to storage (implementation-specific) -/// 4. Call [`CheckpointWriter::finalize()`] to create the _last_checkpoint file -/// -/// # Internal Process -/// 1. Reads relevant actions from the log segment using the checkpoint read schema -/// 2. Applies selection and deduplication logic with the `CheckpointLogReplayProcessor` -/// 3. Tracks counts of included actions for to be written to the _last_checkpoint file -/// 5. Chains the [`CheckpointMetadata`] action to the actions iterator (for V2 checkpoints) +/// The [`CheckpointWriter`] orchestrates creating checkpoint data, and finalizing the +/// checkpoint by writing the `_last_checkpoint` file. pub struct CheckpointWriter { /// Reference to the snapshot of the table being checkpointed pub(crate) snapshot: Arc, - /// Note: Arc> provides shared mutability for our counters, allowing the + /// Note: Arc provides shared mutability for our counters, allowing the /// returned actions iterator from `.checkpoint_data()` to update the counters, - /// and the `finalize()` method to read them... + /// and the [`CheckpointWriter`] to read them during `.finalize()` /// Counter for total actions included in the checkpoint pub(crate) total_actions_counter: Arc, /// Counter for Add actions included in the checkpoint - pub(crate) total_add_actions_counter: Arc, - /// Flag to track if checkpoint data has been consumed - pub(crate) data_consumed: bool, + pub(crate) add_actions_counter: Arc, } impl CheckpointWriter { @@ -160,32 +147,26 @@ impl CheckpointWriter { Self { snapshot, total_actions_counter: Arc::new(AtomicI64::new(0)), - total_add_actions_counter: Arc::new(AtomicI64::new(0)), - data_consumed: false, + add_actions_counter: Arc::new(AtomicI64::new(0)), } } /// Retrieves the checkpoint data and path information /// /// This method is the core of the checkpoint generation process. It: - /// - /// 1. Ensures checkpoint data is consumed only once via `data_consumed` flag + /// 1. Determines whether to write a V1 or V2 checkpoint based on the table's + /// `v2Checkpoints` feature support /// 2. Reads actions from the log segment using the checkpoint read schema /// 3. Filters and deduplicates actions for the checkpoint /// 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint /// (i.e., if `v2Checkpoints` feature is supported by table) /// 5. Generates the appropriate checkpoint path /// - /// The returned data should be written to persistent storage by the caller - /// before calling `finalize()` otherwise data loss may occur. + /// # Important: The returned data should be written to persistent storage by the + /// caller before calling `finalize()` otherwise data loss may occur. /// - /// # Returns - /// A [`CheckpointData`] containing the checkpoint path and action iterator + /// # Returns: [`CheckpointData`] containing the checkpoint path and data to write pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { - if self.data_consumed { - return Err(Error::generic("Checkpoint data has already been consumed")); - } - let is_v2_checkpoints_supported = self .snapshot .table_configuration() @@ -203,7 +184,7 @@ impl CheckpointWriter { let checkpoint_data = checkpoint_actions_iter( actions?, self.total_actions_counter.clone(), - self.total_add_actions_counter.clone(), + self.add_actions_counter.clone(), self.deleted_file_retention_timestamp()?, ); @@ -219,8 +200,6 @@ impl CheckpointWriter { self.snapshot.version(), )?; - self.data_consumed = true; - Ok(CheckpointData { path: checkpoint_path.location, data: Box::new(chained), @@ -234,25 +213,16 @@ impl CheckpointWriter { /// for the engine to quickly locate the last checkpoint and avoid full log replay when /// reading the table. /// - /// # Workflow - /// 0. IMPORTANT: This method must only be called AFTER successfully writing - /// all checkpoint data to storage. Failure to do so may result in - /// data loss. - /// 1. Validates the schema of the engine-provided metadata - /// 2. Enrich the metadata with the additional fields: - /// - `version`: The version of the checkpoint (snapshot version) - /// - `size`: The number of actions in the checkpoint (total_actions_counter) - /// - `parts`: The number of parts in the checkpoint (always 1) - /// - `sizeInBytes`: The size of the checkpoint file in bytes (from metadata) - /// - `numOfAddFiles`: The number of add files in the checkpoint (total_add_actions_counter) - /// - `checkpointSchema`: (not yet implemented) - /// - `checksum`: (not yet implemented) - /// 3. Write the metadata to the `_last_checkpoint` file + /// # Important + /// This method must only be called AFTER successfully writing all checkpoint data to storage. + /// Failure to do so may result in data loss. /// /// # Parameters - /// - `engine`: The engine used for writing the _last_checkpoint file - /// - `metadata`: A single-row [`EngineData`] batch containing: + /// - `engine`: The engine used for writing the `_last_checkpoint` file + /// - `metadata`: A single-row, single-column [`EngineData`] batch containing: /// - `sizeInBytes` (i64): The size of the written checkpoint file + /// + /// # Returns: [`Ok()`] if the `_last_checkpoint` file was written successfully pub fn finalize(self, engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()> { let version = self.snapshot.version().try_into().map_err(|e| { Error::generic(format!( @@ -267,7 +237,7 @@ impl CheckpointWriter { metadata, version, self.total_actions_counter.load(Ordering::Relaxed), - self.total_add_actions_counter.load(Ordering::Relaxed), + self.add_actions_counter.load(Ordering::Relaxed), )?; let last_checkpoint_path = self @@ -287,10 +257,10 @@ impl CheckpointWriter { /// Creates the checkpoint metadata action for V2 checkpoints. /// - /// For V2 checkpoints, this function generates a special [`CheckpointMetadata`] action + /// For V2 checkpoints, this function generates the [`CheckpointMetadata`] action /// that must be included in the V2 spec checkpoint file. This action contains metadata /// about the checkpoint, particularly its version. For V1 checkpoints, this function - /// returns `None`, as the V1 schema does not include this action type. + /// returns `None`, as the V1 checkpoint schema does not include this action type. /// /// # Implementation Details /// @@ -298,8 +268,10 @@ impl CheckpointWriter { /// version field of the [`CheckpointMetadata`] action. Future implementations will /// include the additional metadata field `tags` when map support is added. /// - /// The resulting [`FilteredEngineData`] includes a selection vector with a single `true` - /// value, indicating this action should always be included in the checkpoint. + /// # Returns: + /// A [`FilteredEngineData`] batch including the single-row [`EngineData`] batch along with + /// an accompanying selection vector with a single `true` value, indicating the action in + /// batch should be included in the checkpoint. fn create_checkpoint_metadata_batch( &self, version: i64, @@ -310,9 +282,10 @@ impl CheckpointWriter { return Ok(None); } let values: &[Scalar] = &[version.into()]; - // Create the nested schema structure for [`CheckpointMetadata`] + // Note: We cannot use `CheckpointMetadata::to_schema()` as it would include // the 'tags' field which we're not supporting yet due to the lack of map support. + // Schema of the checkpoint metadata action let schema = Arc::new(StructType::new([StructField::not_null( CHECKPOINT_METADATA_NAME, DataType::struct_type([StructField::not_null("version", DataType::LONG)]), @@ -391,19 +364,31 @@ fn deleted_file_retention_timestamp_with_time( Ok(now_ms - retention_ms) } -/// Creates the data as [`EngineData`] to be written to the `_last_checkpoint` file. +/// Creates the data for the `_last_checkpoint` file containing checkpoint metadata +/// +/// # Parameters +/// - `engine`: Engine for data processing +/// - `metadata`: Single-row data containing `sizeInBytes` (i64) +/// - `version`: Table version number +/// - `total_actions_counter`: Total actions count +/// - `total_add_actions_counter`: Add actions count +/// +/// # Returns +/// A new [`EngineData`] batch with the `_last_checkpoint` fields: +/// - `version` (i64, required): Table version number +/// - `size` (i64, required): Total actions count +/// - `parts` (i64, optional): Always 1 for single-file checkpoints +/// - `sizeInBytes` (i64, optional): Size of checkpoint file in bytes +/// - `numOfAddFiles` (i64, optional): Number of Add actions /// -/// This method validates the schema of the engine-provided metadata which should -/// contain a single row with the single column `sizeInBytes` (i64). It then transforms the -/// metadata to include the additional fields that are part of the `_last_checkpoint` file schema. -/// The `checkpointSchema` and `checksum` fields are also part of the `_last_checkpoint`` file -/// schema but are not yet implemented. They will be added in future versions. +/// Note: The fields `checkpointSchema` and `checksum` are not yet included in this +/// implementation. They are marked as TODOs for future development. fn create_last_checkpoint_data( engine: &dyn Engine, metadata: &dyn EngineData, version: i64, total_actions_counter: i64, - total_add_actions_counter: i64, + add_actions_counter: i64, ) -> DeltaResult> { // Validate metadata has exactly one row if metadata.len() != 1 { @@ -420,7 +405,7 @@ fn create_last_checkpoint_data( Expression::literal(total_actions_counter), Expression::literal(parts), column_expr!("sizeInBytes"), - Expression::literal(total_add_actions_counter), + Expression::literal(add_actions_counter), // TODO(seb): Write the `checkpoint_schema` field // TODO(seb): Write the `checksum` field ]; @@ -428,6 +413,7 @@ fn create_last_checkpoint_data( // Note: We cannot use `LastCheckpointInfo::to_schema()` as it would include // the 'checkpoint_schema' field, which is only known at runtime. + // Schema of the `_last_checkpoint` file let last_checkpoint_schema = Arc::new(StructType::new([ StructField::not_null("version", DataType::LONG), StructField::not_null("size", DataType::LONG), @@ -562,7 +548,7 @@ mod unit_tests { let size_in_bytes: i64 = 1024 * 1024; // 1MB let version = 10; let total_actions_counter = 100; - let total_add_actions_counter = 75; + let add_actions_counter = 75; let engine = SyncEngine::new(); // Create engine metadata with `size_in_bytes` @@ -577,7 +563,7 @@ mod unit_tests { &metadata, version, total_actions_counter, - total_add_actions_counter, + add_actions_counter, )?; // Verify the underlying EngineData contains the expected LastCheckpointInfo schema and data @@ -599,7 +585,7 @@ mod unit_tests { create_array!(Int64, [total_actions_counter]), create_array!(Int64, [1]), create_array!(Int64, [size_in_bytes]), - create_array!(Int64, [total_add_actions_counter]), + create_array!(Int64, [add_actions_counter]), ], ) .unwrap(); From 64c720df07223ea953e6809e3ef536d9e4a80e41 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 12:15:18 -0700 Subject: [PATCH 123/176] build & doc fixes --- kernel/src/checkpoint/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 060e499729..90c515f1a2 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -31,7 +31,6 @@ //! ## [`CheckpointWriter`] //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`crate::table::Table::checkpoint`] method and provides the following APIs: -//! - [`CheckpointWriter::new`] - Creates a new writer for the given table snapshot //! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information //! - [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file //! @@ -132,7 +131,7 @@ pub struct CheckpointData { pub struct CheckpointWriter { /// Reference to the snapshot of the table being checkpointed pub(crate) snapshot: Arc, - /// Note: Arc provides shared mutability for our counters, allowing the + /// Note: `Arc` provides shared mutability for our counters, allowing the /// returned actions iterator from `.checkpoint_data()` to update the counters, /// and the [`CheckpointWriter`] to read them during `.finalize()` /// Counter for total actions included in the checkpoint @@ -222,7 +221,7 @@ impl CheckpointWriter { /// - `metadata`: A single-row, single-column [`EngineData`] batch containing: /// - `sizeInBytes` (i64): The size of the written checkpoint file /// - /// # Returns: [`Ok()`] if the `_last_checkpoint` file was written successfully + /// # Returns: [`variant@Ok`] if the `_last_checkpoint` file was written successfully pub fn finalize(self, engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()> { let version = self.snapshot.version().try_into().map_err(|e| { Error::generic(format!( @@ -429,7 +428,7 @@ fn create_last_checkpoint_data( )])); let last_checkpoint_metadata_evaluator = engine.evaluation_handler().new_expression_evaluator( - engine_metadata_schema.into(), + engine_metadata_schema, last_checkpoint_expr, last_checkpoint_schema.into(), ); From 7c90c33b6669c348d0de87087da3e1da5580b18e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 12:21:50 -0700 Subject: [PATCH 124/176] fmt --- kernel/src/table.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 734f51c4d2..20a82c193f 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -113,7 +113,9 @@ impl Table { engine: &dyn Engine, version: Option, ) -> DeltaResult { - Ok(CheckpointWriter::new(Arc::new(self.snapshot(engine, version)?))) + Ok(CheckpointWriter::new(Arc::new( + self.snapshot(engine, version)?, + ))) } /// Create a new write transaction for this table. From 48a015396f58e934c862c0164292f050a977fcba Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 16:19:07 -0700 Subject: [PATCH 125/176] review --- kernel/src/checkpoint/log_replay.rs | 75 +++++++++----------------- kernel/src/log_replay.rs | 84 ++++++++++++++++++----------- kernel/src/scan/log_replay.rs | 2 +- 3 files changed, 81 insertions(+), 80 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index eeaa278bd7..7949254c3e 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -28,6 +28,8 @@ //! 3. Updates counters and state for cross-batch deduplication //! 4. Produces a [`FilteredEngineData`] result which includes a selection vector indicating which //! actions should be included in the checkpoint file +#[cfg(doc)] +use crate::actions::CheckpointMetadata; use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::scan::data_skipping::DataSkippingFilter; @@ -39,11 +41,14 @@ use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::{Arc, LazyLock}; /// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] -/// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. +/// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. This +/// processor is also leveraged when creating a single-file V2 checkpoint as the V2 spec +/// is a superset of the V1 spec, with the addition of a [`CheckpointMetadata`] action. /// /// It processes each action batch via the `process_actions_batch` method, using the /// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions /// should be included in the checkpoint. +#[allow(unused)] // TODO(seb): Remove once checkpoint api is implemented pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. @@ -52,7 +57,7 @@ pub(crate) struct CheckpointLogReplayProcessor { // iterator to update the values during processing and the caller to observe the final // counts afterward. The counters are i64 to match the `_last_checkpoint` file schema. // Tracks the total number of actions included in the checkpoint file. - total_actions: Arc, + actions_count: Arc, // Tracks the total number of add actions included in the checkpoint file. add_actions_count: Arc, /// Indicates whether a protocol action has been seen in the log. @@ -81,11 +86,6 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { is_log_batch: bool, ) -> DeltaResult { let selection_vector = vec![true; batch.len()]; - assert_eq!( - selection_vector.len(), - batch.len(), - "Initial selection vector length does not match actions length" - ); // Create the checkpoint visitor to process actions and update selection vector let mut visitor = CheckpointVisitor::new( @@ -102,7 +102,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { // Update the total actions and add actions counters. Relaxed ordering is sufficient // here as we only care about the total count when writing the _last_checkpoint file. // (the ordering is not important for correctness) - self.total_actions.fetch_add( + self.actions_count.fetch_add( visitor.file_actions_count + visitor.non_file_actions_count, Ordering::Relaxed, ); @@ -119,21 +119,22 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { }) } - /// Data skipping is not applicable for checkpoint log replay. + /// We never do data skipping for checkpoint log replay (entire table state is always reproduced) fn data_skipping_filter(&self) -> Option<&DataSkippingFilter> { None } } impl CheckpointLogReplayProcessor { + #[allow(unused)] // TODO(seb): Remove once checkpoint api is implemented pub(crate) fn new( - total_actions: Arc, + actions_count: Arc, add_actions_count: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { seen_file_keys: Default::default(), - total_actions, + actions_count, add_actions_count, seen_protocol: false, seen_metadata: false, @@ -143,29 +144,6 @@ impl CheckpointLogReplayProcessor { } } -/// Given an iterator of (engine_data, bool) tuples, returns an iterator of -/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ -/// be written to the V1 checkpoint file in order to capture the table version's complete state. -/// Non-selected rows _must_ be ignored. The boolean flag tied to each actions batch indicates -/// whether the batch is a commit batch (true) or a checkpoint batch (false). -/// -/// Note: The 'action_iter' parameter is an iterator of (engine_data, bool) tuples that _must_ be -/// sorted by the order of the actions in the log from most recent to least recent. -#[allow(unused)] // TODO: Remove once API is implemented -pub(crate) fn checkpoint_actions_iter( - action_iter: impl Iterator, bool)>>, - total_actions: Arc, - add_actions_count: Arc, - minimum_file_retention_timestamp: i64, -) -> impl Iterator> { - CheckpointLogReplayProcessor::new( - total_actions, - add_actions_count, - minimum_file_retention_timestamp, - ) - .process_actions_iter(action_iter) -} - /// A visitor that filters actions for inclusion in a V1 spec checkpoint file. /// /// This visitor processes actions in newest-to-oldest order (as they appear in log @@ -487,7 +465,6 @@ mod tests { use super::*; use crate::arrow::array::StringArray; use crate::utils::test_utils::{action_batch, parse_json_batch}; - use itertools::Itertools; use std::collections::HashSet; /// Helper function to create test batches from JSON strings @@ -495,24 +472,24 @@ mod tests { Ok((parse_json_batch(StringArray::from(json_strings)), true)) } - /// Helper function which applies the `checkpoint_actions_iter` function to a set of + /// Helper function which applies the [`CheckpointLogReplayProcessor`] to a set of /// input batches and returns the results. fn run_checkpoint_test( input_batches: Vec<(Box, bool)>, ) -> DeltaResult<(Vec, i64, i64)> { - let total_actions = Arc::new(AtomicI64::new(0)); + let actions_count = Arc::new(AtomicI64::new(0)); let add_actions_count = Arc::new(AtomicI64::new(0)); - let results: Vec<_> = checkpoint_actions_iter( - input_batches.into_iter().map(Ok), - total_actions.clone(), + let results: Vec<_> = CheckpointLogReplayProcessor::new( + actions_count.clone(), add_actions_count.clone(), - 0, + 0, // minimum_file_retention_timestamp ) - .try_collect()?; + .process_actions_iter(input_batches.into_iter().map(Ok)) + .collect::>>()?; Ok(( results, - total_actions.load(Ordering::Relaxed), + actions_count.load(Ordering::Relaxed), add_actions_count.load(Ordering::Relaxed), )) } @@ -770,13 +747,13 @@ mod tests { create_batch(batch2)?, create_batch(batch3)?, ]; - let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; + let (results, actions_count, add_actions) = run_checkpoint_test(input_batches)?; // Verify results assert_eq!(results.len(), 2, "Expected two batches in results"); assert_eq!(results[0].selection_vector, vec![true, true, true],); assert_eq!(results[1].selection_vector, vec![false, false, false, true],); - assert_eq!(total_actions, 4); + assert_eq!(actions_count, 4); assert_eq!(add_actions, 0); Ok(()) @@ -809,13 +786,13 @@ mod tests { create_batch(batch2)?, create_batch(batch3)?, ]; - let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; + let (results, actions_count, add_actions) = run_checkpoint_test(input_batches)?; // Verify results assert_eq!(results.len(), 2); // The third batch should be filtered out since there are no selected actions assert_eq!(results[0].selection_vector, vec![true]); assert_eq!(results[1].selection_vector, vec![false, true]); - assert_eq!(total_actions, 2); + assert_eq!(actions_count, 2); assert_eq!(add_actions, 1); Ok(()) @@ -844,13 +821,13 @@ mod tests { ]; let input_batches = vec![create_batch(batch1)?, create_batch(batch2)?]; - let (results, total_actions, add_actions) = run_checkpoint_test(input_batches)?; + let (results, actions_count, add_actions) = run_checkpoint_test(input_batches)?; // Verify results assert_eq!(results.len(), 2); assert_eq!(results[0].selection_vector, vec![true, true]); assert_eq!(results[1].selection_vector, vec![false, false, true]); - assert_eq!(total_actions, 3); + assert_eq!(actions_count, 3); assert_eq!(add_actions, 2); Ok(()) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 007cb6854b..70eb592a5e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -14,12 +14,15 @@ //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. -use std::collections::HashSet; - use crate::actions::deletion_vector::DeletionVectorDescriptor; +#[cfg(doc)] +use crate::checkpoint::CheckpointLogReplayProcessor; use crate::engine_data::{GetData, TypedGetData}; use crate::scan::data_skipping::DataSkippingFilter; +#[cfg(doc)] +use crate::scan::{log_replay::ScanLogReplayProcessor, ScanMetadata}; use crate::{DeltaResult, EngineData}; +use std::collections::HashSet; use tracing::debug; @@ -148,17 +151,15 @@ impl<'seen> FileActionDeduplicator<'seen> { /// This method examines the data at the given index using the provided getters /// to identify whether a file action exists and what type it is. /// - /// # Arguments - /// - /// * `i` - Index position in the data structure to examine - /// * `getters` - Collection of data getter implementations used to access the data - /// * `skip_removes` - Whether to skip remove actions when extracting file actions + /// # Parameters + /// - `i`: Index position in the data structure to examine + /// - `getters`: Collection of data getter implementations used to access the data + /// - `skip_removes`: Whether to skip remove actions when extracting file actions /// /// # Returns - /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation - /// * `Ok(None)` - When no file action is found - /// * `Err(...)` - On any error during extraction + /// - `Ok(Some((key, is_add)))`: When a file action is found, returns the key and whether it's an add operation + /// - `Ok(None)`: When no file action is found + /// - `Err(...)`: On any error during extraction pub(crate) fn extract_file_action<'a>( &self, i: usize, @@ -207,38 +208,52 @@ impl<'seen> FileActionDeduplicator<'seen> { /// - **Data skipping** filters are applied to the initial selection vector to reduce the number of rows /// processed by the processor, (if a filter is provided). /// -/// Implementations: -/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated +/// # Implementations +/// +/// - [`ScanLogReplayProcessor`]: Used for table scans, this processor filters and selects deduplicated /// `Add` actions from log batches to reconstruct the view of the table at a specific point in time. /// Note that scans do not expose `Remove` actions. Data skipping may be applied when a predicate is /// provided. /// -/// - `CheckpointLogReplayProcessor` (WIP): Will be responsible for processing log batches to construct -/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, such as -/// `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state. -/// Data skipping is not applied during checkpoint processing. +/// - [`CheckpointLogReplayProcessor`]: Used for writing checkpoints, this processor filters and selects +/// actions from log batches for inclusion in V1 spec checkpoint files. Unlike scans, checkpoint +/// processing includes additional actions, such as `Remove`, `Metadata`, and `Protocol`, required to +/// fully reconstruct table state. Data skipping is not applied during checkpoint processing. /// -/// The `Output` type represents the material result of log replay, and it must implement the -/// `HasSelectionVector` trait to allow filtering of irrelevant rows: +/// # Action Iterator Input /// -/// - For **scans**, the output type is `ScanMetadata`, which contains the file actions (`Add` +/// The [`LogReplayProcessor::process_actions_iter`] method is the entry point for log replay processing. +/// It takes as input an iterator of (actions batch, is_commit_batch flag) tuples and returns an iterator of +/// processor-specific output types with selection vectors. The is_commit_batch bool flag in each tuple +/// indicates whether the batch came from a commit log (`true`) or checkpoint (`false`). Action batches +/// **must** be sorted by the order of the actions in the log from most recent to oldest. +/// +/// Each row that is selected in the returned output **must** be included in the processor's result +/// (e.g., in scan results or checkpoint files), while non-selected rows **must** be ignored. +/// +/// # Output Types +/// +/// The [`LogReplayProcessor::Output`] type represents the material result of log replay, and it must +/// implement the [`HasSelectionVector`] trait to allow filtering of irrelevant rows: +/// +/// - For **scans**, the output type is [`ScanMetadata`], which contains the file actions (`Add` /// actions) that need to be applied to build the table's view, accompanied by a /// **selection vector** that identifies which rows should be included. A transform vector may /// also be included to handle schema changes, such as renaming columns or modifying data types. /// -/// - For **checkpoints**, the output includes the actions necessary to write to the checkpoint file (`Add`, -/// `Remove`, `Metadata`, `Protocol` actions), filtered by the **selection vector** to determine which -/// rows are included in the final checkpoint. +/// - For **checkpoints**, the output type is [`FilteredEngineData`], which includes the actions +/// necessary to write to the checkpoint file (`Add`, `Remove`, `Metadata`, `Protocol` actions), +/// filtered by the **selection vector** to determine which rows are included in the final checkpoint. /// /// TODO: Refactor the Change Data Feed (CDF) processor to use this trait. pub(crate) trait LogReplayProcessor: Sized { /// The type of results produced by this processor must implement the - /// `HasSelectionVector` trait to allow filtering out batches with no selected rows. + /// [`HasSelectionVector`] trait to allow filtering out batches with no selected rows. type Output: HasSelectionVector; /// Processes a batch of actions and returns the filtered results. /// - /// # Arguments + /// # Parameters /// - `actions_batch` - A boxed [`EngineData`] instance representing a batch of actions. /// - `is_log_batch` - `true` if the batch originates from a commit log, `false` if from a checkpoint. /// @@ -253,10 +268,19 @@ pub(crate) trait LogReplayProcessor: Sized { /// Applies the processor to an actions iterator and filters out empty results. /// - /// # Arguments - /// * `action_iter` - Iterator of action batches and their source flags + /// This method: + /// 1. Applies `process_actions_batch` to each action batch + /// 2. Maintains processor state across all batches + /// 3. Automatically filters out batches with no selected rows + /// + /// # Parameters + /// - `action_iter`: Iterator of (batch, is_commit_batch) tuples, where each batch contains actions + /// and the boolean flag indicates whether the batch came from a commit log (`true`) or checkpoint + /// (`false`). Actions must be provided in reverse chronological order. /// - /// Returns an iterator that yields the Output type of the processor. + /// # Returns + /// An iterator that yields the output type of the processor, containing only non-empty results + /// (batches where at least one row was selected). fn process_actions_iter( mut self, action_iter: impl Iterator, bool)>>, @@ -281,8 +305,8 @@ pub(crate) trait LogReplayProcessor: Sized { /// The selection vector is further updated based on the processor's logic in the /// `process_actions_batch` method. /// - /// # Arguments - /// - `batch` - A reference to the batch of actions to be processed. + /// # Parameters + /// - `batch`: A reference to the batch of actions to be processed. /// /// # Returns /// A `DeltaResult>`, where each boolean indicates if the corresponding row should be included. diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 41763706ce..ce83e01e70 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -38,7 +38,7 @@ use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; /// produces a [`ScanMetadata`] result. This result includes the transformed batch, a selection /// vector indicating which rows are valid, and any row-level transformation expressions that need /// to be applied to the selected rows. -struct ScanLogReplayProcessor { +pub(crate) struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, add_transform: Arc, From 4a1a1dd3c043a33a9bbb09374e3796f271b0ca70 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 16:24:53 -0700 Subject: [PATCH 126/176] schema spec --- kernel/src/checkpoint/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 7949254c3e..117bd40f78 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -42,8 +42,8 @@ use std::sync::{Arc, LazyLock}; /// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. This -/// processor is also leveraged when creating a single-file V2 checkpoint as the V2 spec -/// is a superset of the V1 spec, with the addition of a [`CheckpointMetadata`] action. +/// processor is leveraged when creating a single-file V2 checkpoint as the V2 spec schema is +/// a superset of the V1 spec schema, with the addition of a [`CheckpointMetadata`] action. /// /// It processes each action batch via the `process_actions_batch` method, using the /// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions From 4d48a8a01490cfbd181cbcd4b4e19a38c2189fc8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 16:26:12 -0700 Subject: [PATCH 127/176] pub crate --- kernel/src/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 70eb592a5e..aaf26639c8 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -16,7 +16,7 @@ use crate::actions::deletion_vector::DeletionVectorDescriptor; #[cfg(doc)] -use crate::checkpoint::CheckpointLogReplayProcessor; +use crate::checkpoint::log_replay::CheckpointLogReplayProcessor; use crate::engine_data::{GetData, TypedGetData}; use crate::scan::data_skipping::DataSkippingFilter; #[cfg(doc)] From 411b2c4b66c4a2733addb506390d4ab4de7cd895 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 11 Apr 2025 16:28:54 -0700 Subject: [PATCH 128/176] forgot to include this file --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index e18479696d..490e2cf993 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -5,4 +5,4 @@ //! avoiding full log replay. This API supports three checkpoint types: //! //! TODO!(seb): Include docs when implemented -mod log_replay; +pub(crate) mod log_replay; From 48d529d9dbd82537f167d3f2b9b847b68f42672c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 14 Apr 2025 12:16:55 -0700 Subject: [PATCH 129/176] review --- kernel/src/checkpoint/log_replay.rs | 8 +++++--- kernel/src/log_replay.rs | 11 +++++------ kernel/src/utils.rs | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 117bd40f78..e80d8ce56d 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -28,14 +28,15 @@ //! 3. Updates counters and state for cross-batch deduplication //! 4. Produces a [`FilteredEngineData`] result which includes a selection vector indicating which //! actions should be included in the checkpoint file -#[cfg(doc)] -use crate::actions::CheckpointMetadata; +//! +//! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::scan::data_skipping::DataSkippingFilter; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; + use std::collections::HashSet; use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::{Arc, LazyLock}; @@ -502,7 +503,7 @@ mod tests { let mut visitor = CheckpointVisitor::new( &mut seen_file_keys, true, - vec![true; 8], + vec![true; 9], 0, // minimum_file_retention_timestamp (no expired tombstones) false, false, @@ -520,6 +521,7 @@ mod tests { false, // Row 5 is a cdc action (excluded) false, // Row 6 is a sidecar action (excluded) true, // Row 7 is a txn action (included) + false, // Row 8 is a checkpointMetadata action (excluded) ]; assert_eq!(visitor.file_actions_count, 2); diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index aaf26639c8..e6f8fdfb8e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -13,19 +13,18 @@ //! This module provides structures for efficient batch processing, focusing on file action //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. - +//! +//! [`CheckpointLogReplayProcessor`]: crate::checkpoint::log_replay::CheckpointLogReplayProcessor +//! [`ScanLogReplayProcessor`]: crate::scan::log_replay::ScanLogReplayProcessor +//! [`ScanMetadata`]: crate::scan::ScanMetadata use crate::actions::deletion_vector::DeletionVectorDescriptor; -#[cfg(doc)] -use crate::checkpoint::log_replay::CheckpointLogReplayProcessor; use crate::engine_data::{GetData, TypedGetData}; use crate::scan::data_skipping::DataSkippingFilter; -#[cfg(doc)] -use crate::scan::{log_replay::ScanLogReplayProcessor, ScanMetadata}; use crate::{DeltaResult, EngineData}; + use std::collections::HashSet; use tracing::debug; - /// The subset of file action fields that uniquely identifies it in the log, used for deduplication /// of adds and removes during log replay. #[derive(Debug, Hash, Eq, PartialEq)] diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index f8813d82e0..6eea84f8b3 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -126,6 +126,7 @@ pub(crate) mod test_utils { r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, r#"{"txn":{"appId":"myApp","version": 3}}"#, + r#"{"checkpointMetadata":{"version":2, "tags":{"tag_foo":"tag_bar"}}}"#, ] .into(); parse_json_batch(json_strings) From 1bd96580914d9bc898efa1b1f574c3ad26fccfd9 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 14 Apr 2025 14:53:34 -0700 Subject: [PATCH 130/176] vis --- kernel/src/snapshot.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index a30273a49f..cfd5a98aac 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -306,7 +306,7 @@ impl Snapshot { #[serde(rename_all = "camelCase")] #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] -struct LastCheckpointHint { +pub(crate) struct LastCheckpointHint { /// The version of the table when the last checkpoint was made. #[allow(unreachable_pub)] // used by acceptance tests (TODO make an fn accessor?) pub version: Version, From 2439ef3ef93a7902298bc35c7543bffe204dabde Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 10:01:29 -0700 Subject: [PATCH 131/176] compiling doc --- kernel/src/checkpoint/mod.rs | 55 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 2816b96a88..b4107cde96 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -36,29 +36,39 @@ //! //! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) //! -//! TODO(seb): unignore example -//! ```ignore -//! let path = "./tests/data/app-txn-no-checkpoint"; -//! let engine = Arc::new(SyncEngine::new()); -//! let table = Table::try_from_uri(path)?; -//! -//! // Create a checkpoint writer for the table at a specific version -//! let mut writer = table.checkpoint(&engine, Some(2))?; -//! -//! // Retrieve checkpoint data -//! let checkpoint_data = writer.checkpoint_data()?; -//! -//! // Write checkpoint data to storage (implementation-specific) -//! let metadata = your_storage_implementation.write_checkpoint( -//! &checkpoint_data.path, -//! checkpoint_data.data -//! )?; +//! ``` +//! use std::sync::Arc; +//! use object_store::local::LocalFileSystem; +//! use delta_kernel::{ +//! checkpoint::CheckpointData, +//! engine::arrow_data::ArrowEngineData, +//! engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, +//! table::Table, +//! DeltaResult, Error, +//! }; +//! use delta_kernel::arrow::array::{Int64Array, RecordBatch}; +//! use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; //! -//! /* IMPORTANT: All data must be written before finalizing the checkpoint */ +//! fn mock_write_to_object_store(data: CheckpointData) -> DeltaResult { +//! let size: i64 = data.data.map(|r| r.map(|_| 1)).collect::, _>>()?.into_iter().sum(); +//! let batch = RecordBatch::try_new( +//! Arc::new(Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)])), +//! vec![Arc::new(Int64Array::from(vec![size]))], +//! )?; +//! Ok(ArrowEngineData::new(batch)) +//! } //! -//! // Finalize the checkpoint by writing the _last_checkpoint file +//! let engine = DefaultEngine::new( +//! Arc::new(LocalFileSystem::new()), +//! Arc::new(TokioBackgroundExecutor::new()) +//! ); +//! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; +//! let mut writer = table.checkpoint(&engine, Some(1))?; +//! let metadata = mock_write_to_object_store(writer.checkpoint_data(&engine)?)?; //! writer.finalize(&engine, &metadata)?; +//! # Ok::<_, Error>(()) //! ``` +//! //! ## Future extensions //! - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be //! implemented in the future. The current implementation only supports classic-named V2 checkpoints. @@ -259,10 +269,9 @@ impl CheckpointWriter { /// Finalizes the checkpoint writing process by creating the `_last_checkpoint` file /// - /// The [`LastCheckpointHint`] (`_last_checkpoint`) file is a metadata file that contains - /// information about the last checkpoint created for the table. It is used as a hint - /// for the engine to quickly locate the last checkpoint and avoid full log replay when - /// reading the table. + /// The `_last_checkpoint` file is a metadata file that contains information about the + /// last checkpoint created for the table. It is used as a hint for the engine to quickly + /// locate the last checkpoint and avoid full log replay when reading the table. /// /// # Important /// This method must only be called AFTER successfully writing all checkpoint data to storage. From 16953922f3513d9a67d9157865722501ec819208 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 10:03:17 -0700 Subject: [PATCH 132/176] docs --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index b4107cde96..90a45127bb 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -34,7 +34,7 @@ //! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information //! - [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file //! -//! ## Example: Writing a classic-named V1/V2 checkpoint (depending on `v2Checkpoints` feature support) +//! ## Example: Writing a classic-named V1 checkpoint (no `v2Checkpoints` feature on test table) //! //! ``` //! use std::sync::Arc; From cf3faf800532c1c25d6be9bf648b7224ba19cfbe Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 10:18:59 -0700 Subject: [PATCH 133/176] CheckpointWriter error ' --- kernel/src/checkpoint/mod.rs | 19 +++++++++---------- kernel/src/error.rs | 7 +++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 90a45127bb..c29dad4406 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -242,7 +242,7 @@ impl CheckpointWriter { .process_actions_iter(actions); let version = self.snapshot.version().try_into().map_err(|e| { - Error::generic(format!( + Error::checkpoint_writer(format!( "Failed to convert checkpoint version from u64 {} to i64: {}", self.snapshot.version(), e @@ -285,8 +285,8 @@ impl CheckpointWriter { /// # Returns: [`variant@Ok`] if the `_last_checkpoint` file was written successfully pub fn finalize(self, engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()> { let version = self.snapshot.version().try_into().map_err(|e| { - Error::generic(format!( - "Failed to convert version from u64 {} to i64: {}", + Error::checkpoint_writer(format!( + "Failed to convert checkpoint version from u64 {} to i64: {}", self.snapshot.version(), e )) @@ -407,12 +407,11 @@ fn deleted_file_retention_timestamp_with_time( let now_ms: i64 = now_duration .as_millis() .try_into() - .map_err(|_| Error::generic("Current timestamp exceeds i64 millisecond range"))?; + .map_err(|_| Error::checkpoint_writer("Current timestamp exceeds i64 millisecond range"))?; - let retention_ms: i64 = retention_duration - .as_millis() - .try_into() - .map_err(|_| Error::generic("Retention duration exceeds i64 millisecond range"))?; + let retention_ms: i64 = retention_duration.as_millis().try_into().map_err(|_| { + Error::checkpoint_writer("Retention duration exceeds i64 millisecond range") + })?; // Simple subtraction - will produce negative values if retention > now Ok(now_ms - retention_ms) @@ -446,8 +445,8 @@ fn create_last_checkpoint_data( ) -> DeltaResult> { // Validate metadata has exactly one row if metadata.len() != 1 { - return Err(Error::Generic(format!( - "Engine checkpoint metadata should have exactly one row, found {}", + return Err(Error::checkpoint_writer(format!( + "Engine-collected checkpoint metadata should have exactly one row, found {}", metadata.len() ))); } diff --git a/kernel/src/error.rs b/kernel/src/error.rs index 80857b856b..a261e13a40 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -34,6 +34,9 @@ pub enum Error { #[error(transparent)] Arrow(ArrowError), + #[error("Error writing checkpoint: {0}")] + CheckpointWriter(String), + /// User tried to convert engine data to the wrong type #[error("Invalid engine data type. Could not convert to {0}")] EngineDataType(String), @@ -208,6 +211,10 @@ pub enum Error { // Convenience constructors for Error types that take a String argument impl Error { + pub fn checkpoint_writer(msg: impl ToString) -> Self { + Self::CheckpointWriter(msg.to_string()) + } + pub fn generic_err(source: impl Into>) -> Self { Self::GenericError { source: source.into(), From aa01189b50b08ab104389df905ce3ffd858c67fa Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 10:40:19 -0700 Subject: [PATCH 134/176] relaxed ordering --- kernel/src/checkpoint/log_replay.rs | 7 +++++-- kernel/src/checkpoint/mod.rs | 13 +++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 5e27c23bcb..e80d8ce56d 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -100,12 +100,15 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; + // Update the total actions and add actions counters. Relaxed ordering is sufficient + // here as we only care about the total count when writing the _last_checkpoint file. + // (the ordering is not important for correctness) self.actions_count.fetch_add( visitor.file_actions_count + visitor.non_file_actions_count, - Ordering::SeqCst, + Ordering::Relaxed, ); self.add_actions_count - .fetch_add(visitor.add_actions_count, Ordering::SeqCst); + .fetch_add(visitor.add_actions_count, Ordering::Relaxed); // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index c29dad4406..34874a1288 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -292,12 +292,14 @@ impl CheckpointWriter { )) })?; + // Ordering does not matter as there are no other threads modifying this counter + // at this time (since the checkpoint data iterator has been consumed) let checkpoint_metadata = create_last_checkpoint_data( engine, metadata, version, - self.total_actions_counter.load(Ordering::SeqCst), - self.add_actions_counter.load(Ordering::SeqCst), + self.total_actions_counter.load(Ordering::Relaxed), + self.add_actions_counter.load(Ordering::Relaxed), )?; let last_checkpoint_path = self @@ -354,8 +356,7 @@ impl CheckpointWriter { // Ordering does not matter as there are no other threads modifying this counter // at this time (since we have not yet returned the iterator which performs the action counting) - self.total_actions_counter - .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + self.total_actions_counter.fetch_add(1, Ordering::Relaxed); Ok(Some(Ok(result))) } @@ -555,7 +556,7 @@ mod unit_tests { .unwrap(); assert_eq!(*record_batch, expected); - assert_eq!(writer.total_actions_counter.load(Ordering::SeqCst), 1); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 1); Ok(()) } @@ -571,7 +572,7 @@ mod unit_tests { // No checkpoint metadata action should be created for V1 checkpoints assert!(result.is_none()); - assert_eq!(writer.total_actions_counter.load(Ordering::SeqCst), 0); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 0); Ok(()) } From a9f9614096c0941598b6ac54dbc3ad15f9d823c8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 11:06:07 -0700 Subject: [PATCH 135/176] docs & last_checkpoint --- kernel/src/checkpoint/mod.rs | 20 +++++++++++++------- kernel/src/snapshot.rs | 6 +++++- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 34874a1288..90441c34e2 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -50,7 +50,9 @@ //! use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; //! //! fn mock_write_to_object_store(data: CheckpointData) -> DeltaResult { -//! let size: i64 = data.data.map(|r| r.map(|_| 1)).collect::, _>>()?.into_iter().sum(); +//! /* This should be replaced with actual object store write logic */ +//! /* For demonstration, we manually create an EngineData batch with a dummy size */ +//! let size = data.data.try_fold(0i64, |acc, r| r.map(|_| acc + 1))?; //! let batch = RecordBatch::try_new( //! Arc::new(Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)])), //! vec![Arc::new(Int64Array::from(vec![size]))], @@ -63,9 +65,18 @@ //! Arc::new(TokioBackgroundExecutor::new()) //! ); //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; +//! +//! // Create a checkpoint writer for the table at a specific version //! let mut writer = table.checkpoint(&engine, Some(1))?; +//! +//! // Write the checkpoint data to the object store and get the metadata //! let metadata = mock_write_to_object_store(writer.checkpoint_data(&engine)?)?; +//! +//! /* IMPORTANT: All data must be written before finalizing the checkpoint */ +//! +//! // Finalize the checkpoint. This call will write the _last_checkpoint file //! writer.finalize(&engine, &metadata)?; +//! //! # Ok::<_, Error>(()) //! ``` //! @@ -89,7 +100,7 @@ use crate::expressions::{column_expr, Scalar}; use crate::log_replay::LogReplayProcessor; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; -use crate::snapshot::Snapshot; +use crate::snapshot::{Snapshot, LAST_CHECKPOINT_FILE_NAME}; use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, Expression}; use log_replay::CheckpointLogReplayProcessor; use std::sync::atomic::{AtomicI64, Ordering}; @@ -102,11 +113,6 @@ mod log_replay; #[cfg(test)] mod tests; -/// Name of the `_last_checkpoint`` file that provides metadata about the last checkpoint -/// created for the table. This file is used as a hint for the engine to quickly locate -/// the last checkpoint and avoid full log replay when reading the table. -static LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint.json"; - /// Schema of the `_last_checkpoint` file /// We cannot use `LastCheckpointInfo::to_schema()` as it would include the 'checkpoint_schema' /// field, which is only known at runtime. diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index cfd5a98aac..78b419a7f5 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -15,7 +15,11 @@ use crate::table_features::ColumnMappingMode; use crate::table_properties::TableProperties; use crate::{DeltaResult, Engine, Error, StorageHandler, Version}; -const LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint"; +/// Name of the _last_checkpoint file that provides metadata about the last checkpoint +/// created for the table. This file is used as a hint for the engine to quickly locate +/// the last checkpoint and avoid full log replay when reading the table. +pub(crate) const LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint"; + // TODO expose methods for accessing the files of a table (with file pruning). /// In-memory representation of a specific snapshot of a Delta table. While a `DeltaTable` exists /// throughout time, `Snapshot`s represent a view of a table at a specific point in time; they From 14b7db74f921b3ad6405962ec185ea4675d292cf Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 11:15:13 -0700 Subject: [PATCH 136/176] schemas --- kernel/src/checkpoint/mod.rs | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 90441c34e2..4fca0baaa6 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -26,7 +26,6 @@ //! //! - [`CheckpointWriter`] - Core component that manages checkpoint creation workflow //! - [`CheckpointData`] - Contains the data to write and destination path information -//! - [`crate::log_replay`] submodule - Handles action filtering and deduplication //! //! ## [`CheckpointWriter`] //! Handles the actual checkpoint data generation and writing process. It is created via the @@ -155,26 +154,6 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| .into() }); -/// Returns the schema for writing the `_last_checkpoint` file -fn get_last_checkpoint_schema() -> &'static SchemaRef { - &LAST_CHECKPOINT_SCHEMA -} - -/// Returns the schema for reading Delta log actions for checkpoint creation -fn get_checkpoint_actions_schema() -> &'static SchemaRef { - &CHECKPOINT_ACTIONS_SCHEMA -} - -/// Returns the schema of the metadata passed to the [`CheckpointWriter::finalize()`] method by the engine -fn get_engine_checkpoint_metadata_schema() -> &'static SchemaRef { - &ENGINE_CHECKPOINT_METADATA_SCHEMA -} - -/// Returns the schema of the [`CheckpointMetadata`] action that is included in V2 checkpoints -fn get_checkpoint_metadata_action_schema() -> &'static SchemaRef { - &CHECKPOINT_METADATA_ACTION_SCHEMA -} - /// Represents a single-file checkpoint, including the data to write and the target path. pub struct CheckpointData { /// The URL where the checkpoint file should be written. @@ -231,11 +210,10 @@ impl CheckpointWriter { .table_configuration() .is_v2_checkpoint_supported(); - let read_schema = get_checkpoint_actions_schema(); let actions = self.snapshot.log_segment().read_actions( engine, - read_schema.clone(), - read_schema.clone(), + CHECKPOINT_ACTIONS_SCHEMA.clone(), + CHECKPOINT_ACTIONS_SCHEMA.clone(), None, )?; @@ -353,7 +331,7 @@ impl CheckpointWriter { let checkpoint_metadata_batch = engine .evaluation_handler() - .create_one(get_checkpoint_metadata_action_schema().clone(), values)?; + .create_one(CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), values)?; let result = FilteredEngineData { data: checkpoint_metadata_batch, @@ -470,9 +448,9 @@ fn create_last_checkpoint_data( let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); let last_checkpoint_metadata_evaluator = engine.evaluation_handler().new_expression_evaluator( - get_engine_checkpoint_metadata_schema().clone(), + ENGINE_CHECKPOINT_METADATA_SCHEMA.clone(), last_checkpoint_expr, - get_last_checkpoint_schema().clone().into(), + LAST_CHECKPOINT_SCHEMA.clone().into(), ); last_checkpoint_metadata_evaluator.evaluate(metadata) From 8f985cadd5cb751295ca2b300b50e4e061dd0a40 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 11:57:51 -0700 Subject: [PATCH 137/176] test --- ffi/src/error.rs | 2 + kernel/src/checkpoint/tests.rs | 156 ++++++++++++++++----------------- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/ffi/src/error.rs b/ffi/src/error.rs index fd5fb87e2b..c054c6e7a6 100644 --- a/ffi/src/error.rs +++ b/ffi/src/error.rs @@ -53,6 +53,7 @@ pub enum KernelError { ChangeDataFeedIncompatibleSchema, InvalidCheckpoint, LiteralExpressionTransformError, + CheckpointWriterError, } impl From for KernelError { @@ -61,6 +62,7 @@ impl From for KernelError { // NOTE: By definition, no kernel Error maps to FFIError #[cfg(any(feature = "default-engine", feature = "sync-engine"))] Error::Arrow(_) => KernelError::ArrowError, + Error::CheckpointWriter(_) => KernelError::CheckpointWriterError, Error::EngineDataType(_) => KernelError::EngineDataTypeError, Error::Extract(..) => KernelError::ExtractError, Error::Generic(_) => KernelError::GenericError, diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 130be9c4a0..3c0625ac8d 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -60,9 +60,9 @@ fn create_checkpoint_metadata_batch(size_in_bytes: i64) -> DeltaResult) -> DeltaResult { - let path = Path::from("_delta_log/_last_checkpoint.json"); + let path = Path::from("_delta_log/_last_checkpoint"); let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); let byte_data = rt.block_on(async { let data = store.get(&path).await?; @@ -70,6 +70,62 @@ fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { })?; Ok(from_slice(&byte_data)?) } +/// Create a Protocol action without v2Checkpoint feature support +fn create_basic_protocol_action() -> Action { + Action::Protocol( + Protocol::try_new( + 3, + 7, + Vec::::new().into(), + Vec::::new().into(), + ) + .unwrap(), + ) +} + +/// Create a Protocol action with v2Checkpoint feature support +fn create_v2_checkpoint_protocol_action() -> Action { + Action::Protocol( + Protocol::try_new( + 3, + 7, + vec!["v2Checkpoint"].into(), + vec!["v2Checkpoint"].into(), + ) + .unwrap(), + ) +} + +/// Create a Metadata action +fn create_metadata_action() -> Action { + Action::Metadata(Metadata { + id: "test-table".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }) +} + +/// Create an Add action with the specified path +fn create_add_action(path: &str) -> Action { + Action::Add(Add { + path: path.into(), + data_change: true, + ..Default::default() + }) +} + +/// Create a Remove action with the specified path +/// +/// The remove action has deletion_timestamp set to i64::MAX to ensure the +/// remove action is not considered expired during testing. +fn create_remove_action(path: &str) -> Action { + Action::Remove(Remove { + path: path.into(), + data_change: true, + deletion_timestamp: Some(i64::MAX), // Ensure the remove action is not expired + ..Default::default() + }) +} /// Helper to verify the contents of the `_last_checkpoint` file fn assert_last_checkpoint_contents( @@ -100,31 +156,14 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); // 1st commit: adds `fake_path_1` - write_commit_to_store( - &store, - vec![Action::Add(Add { - path: "fake_path_1".into(), - data_change: true, - ..Default::default() - })], - 0, - )?; + write_commit_to_store(&store, vec![create_add_action("fake_path_1")], 0)?; // 2nd commit: adds `fake_path_2` & removes `fake_path_1` write_commit_to_store( &store, vec![ - Action::Add(Add { - path: "fake_path_2".into(), - data_change: true, - ..Default::default() - }), - Action::Remove(Remove { - path: "fake_path_1".into(), - data_change: true, - deletion_timestamp: Some(i64::MAX), // Ensure the remove action is not expired - ..Default::default() - }), + create_add_action("fake_path_2"), + create_remove_action("fake_path_1"), ], 1, )?; @@ -133,14 +172,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { // Protocol action does not include the v2Checkpoint reader/writer feature. write_commit_to_store( &store, - vec![ - Action::Metadata(Metadata { - id: "fake_path_1".into(), - schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), - ..Default::default() - }), - Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), - ], + vec![create_metadata_action(), create_basic_protocol_action()], 2, )?; @@ -194,31 +226,16 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { // Protocol action does not include the v2Checkpoint reader/writer feature. write_commit_to_store( &store, - vec![ - Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), - Action::Metadata(Metadata { - id: "test-table-v0".into(), - schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), - ..Default::default() - }), - ], + vec![create_basic_protocol_action(), create_metadata_action()], 0, )?; - // 2nd commit (version 1) - add and remove actions + // 2nd commit (version 1) - add actions write_commit_to_store( &store, vec![ - Action::Add(Add { - path: "file1.parquet".into(), - data_change: true, - ..Default::default() - }), - Action::Add(Add { - path: "file2.parquet".into(), - data_change: true, - ..Default::default() - }), + create_add_action("file1.parquet"), + create_add_action("file2.parquet"), ], 1, )?; @@ -269,17 +286,8 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { write_commit_to_store( &store, vec![ - Action::Add(Add { - path: "fake_path_2".into(), - data_change: true, - ..Default::default() - }), - Action::Remove(Remove { - path: "fake_path_1".into(), - data_change: true, - deletion_timestamp: Some(i64::MAX), // Ensure the remove action is not expired - ..Default::default() - }), + create_add_action("fake_path_2"), + create_remove_action("fake_path_1"), ], 0, )?; @@ -287,17 +295,14 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { // 2nd commit: metadata & protocol actions // Protocol action includes the v2Checkpoint reader/writer feature. write_commit_to_store( - &store, - vec![ - Action::Metadata(Metadata { - id: "fake_path_1".into(), - schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), - ..Default::default() - }), - Action::Protocol(Protocol::try_new(3, 7, vec!["v2Checkpoint"].into(), vec!["v2Checkpoint"].into())?), - ], - 1, - )?; + &store, + vec![ + create_metadata_action(), + create_v2_checkpoint_protocol_action(), + ], + 1, + )?; + let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); let mut writer = table.checkpoint(&engine, None)?; @@ -350,14 +355,7 @@ fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { // Protocol action does not include the v2Checkpoint reader/writer feature. write_commit_to_store( &store, - vec![ - Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), - Action::Metadata(Metadata { - id: "test-table-v0".into(), - schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), - ..Default::default() - }), - ], + vec![create_basic_protocol_action(), create_metadata_action()], 0, )?; let table_root = Url::parse("memory:///")?; From 9a76b22fd65f0cfd0e2464c13158f4c77ce8e38c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 12:04:13 -0700 Subject: [PATCH 138/176] fix flag --- kernel/src/table_configuration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index 7a13beef1c..ef78afc7f4 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -268,7 +268,7 @@ impl TableConfiguration { /// both the protocol's readerFeatures and writerFeatures. /// /// See: - #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] + #[cfg_attr(feature = "internal-api", visibility::make(pub))] #[allow(unused)] // needed to compile w/o default features pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { let read_supported = self From ffb02db26a3127b80ee81c619e56a88d106f2342 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 13:06:07 -0700 Subject: [PATCH 139/176] write --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 4fca0baaa6..91b2b5a414 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -48,7 +48,7 @@ //! use delta_kernel::arrow::array::{Int64Array, RecordBatch}; //! use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; //! -//! fn mock_write_to_object_store(data: CheckpointData) -> DeltaResult { +//! fn mock_write_to_object_store(mut data: CheckpointData) -> DeltaResult { //! /* This should be replaced with actual object store write logic */ //! /* For demonstration, we manually create an EngineData batch with a dummy size */ //! let size = data.data.try_fold(0i64, |acc, r| r.map(|_| acc + 1))?; From 31da7d4ee448fecf32427002a68b62157ec2f8fc Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 15:11:10 -0700 Subject: [PATCH 140/176] extract .finalize api to separate PR --- kernel/src/checkpoint/mod.rs | 195 ++------------------------------- kernel/src/checkpoint/tests.rs | 154 ++++++++++++++------------ 2 files changed, 93 insertions(+), 256 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 91b2b5a414..3f3daa9d16 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -31,7 +31,7 @@ //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`crate::table::Table::checkpoint`] method and provides the following APIs: //! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information -//! - [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file +//! - TODO(#850): [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file //! //! ## Example: Writing a classic-named V1 checkpoint (no `v2Checkpoints` feature on test table) //! @@ -73,8 +73,9 @@ //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! -//! // Finalize the checkpoint. This call will write the _last_checkpoint file -//! writer.finalize(&engine, &metadata)?; +//! // TODO(#850): Implement the finalize method +//! // Finalize the checkpoint. This call will write the _last_checkpoint file +//! // writer.finalize(&engine, &metadata)?; //! //! # Ok::<_, Error>(()) //! ``` @@ -95,12 +96,12 @@ use crate::actions::{ METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; use crate::engine_data::FilteredEngineData; -use crate::expressions::{column_expr, Scalar}; +use crate::expressions::Scalar; use crate::log_replay::LogReplayProcessor; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; -use crate::snapshot::{Snapshot, LAST_CHECKPOINT_FILE_NAME}; -use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, Expression}; +use crate::snapshot::Snapshot; +use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; use log_replay::CheckpointLogReplayProcessor; use std::sync::atomic::{AtomicI64, Ordering}; use std::{ @@ -112,24 +113,6 @@ mod log_replay; #[cfg(test)] mod tests; -/// Schema of the `_last_checkpoint` file -/// We cannot use `LastCheckpointInfo::to_schema()` as it would include the 'checkpoint_schema' -/// field, which is only known at runtime. -static LAST_CHECKPOINT_SCHEMA: LazyLock = LazyLock::new(|| { - StructType::new([ - StructField::not_null("version", DataType::LONG), - StructField::not_null("size", DataType::LONG), - StructField::nullable("parts", DataType::LONG), - StructField::nullable("sizeInBytes", DataType::LONG), - StructField::nullable("numOfAddFiles", DataType::LONG), - ]) - .into() -}); - -/// Schema of metadata passed to the [`CheckpointWriter::finalize()`] method by the engine -static ENGINE_CHECKPOINT_METADATA_SCHEMA: LazyLock = - LazyLock::new(|| StructType::new([StructField::not_null("version", DataType::LONG)]).into()); - /// Schema for extracting relevant actions from log files for checkpoint creation static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -251,6 +234,8 @@ impl CheckpointWriter { }) } + /// TODO(#850): Implement the finalize method + /// /// Finalizes the checkpoint writing process by creating the `_last_checkpoint` file /// /// The `_last_checkpoint` file is a metadata file that contains information about the @@ -267,38 +252,9 @@ impl CheckpointWriter { /// - `sizeInBytes` (i64): The size of the written checkpoint file /// /// # Returns: [`variant@Ok`] if the `_last_checkpoint` file was written successfully - pub fn finalize(self, engine: &dyn Engine, metadata: &dyn EngineData) -> DeltaResult<()> { - let version = self.snapshot.version().try_into().map_err(|e| { - Error::checkpoint_writer(format!( - "Failed to convert checkpoint version from u64 {} to i64: {}", - self.snapshot.version(), - e - )) - })?; - - // Ordering does not matter as there are no other threads modifying this counter - // at this time (since the checkpoint data iterator has been consumed) - let checkpoint_metadata = create_last_checkpoint_data( - engine, - metadata, - version, - self.total_actions_counter.load(Ordering::Relaxed), - self.add_actions_counter.load(Ordering::Relaxed), - )?; - - let last_checkpoint_path = self - .snapshot - .log_segment() - .log_root - .join(LAST_CHECKPOINT_FILE_NAME)?; - - engine.json_handler().write_json_file( - &last_checkpoint_path, - Box::new(std::iter::once(Ok(checkpoint_metadata))), - true, // overwrite the last checkpoint file - )?; - - Ok(()) + #[allow(unused)] + fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { + todo!("Implement the finalize method which will write the _last_checkpoint file") } /// Creates the checkpoint metadata action for V2 checkpoints. @@ -402,67 +358,11 @@ fn deleted_file_retention_timestamp_with_time( Ok(now_ms - retention_ms) } -/// Creates the data for the `_last_checkpoint` file containing checkpoint metadata -/// -/// # Parameters -/// - `engine`: Engine for data processing -/// - `metadata`: Single-row data containing `sizeInBytes` (i64) -/// - `version`: Table version number -/// - `total_actions_counter`: Total actions count -/// - `total_add_actions_counter`: Add actions count -/// -/// # Returns -/// A new [`EngineData`] batch with the `_last_checkpoint` fields: -/// - `version` (i64, required): Table version number -/// - `size` (i64, required): Total actions count -/// - `parts` (i64, optional): Always 1 for single-file checkpoints -/// - `sizeInBytes` (i64, optional): Size of checkpoint file in bytes -/// - `numOfAddFiles` (i64, optional): Number of Add actions -/// -/// TODO(#838) Add `checksum` field to the `_last_checkpoint` file -/// TODO(#839) Add `checkpoint_schema` field to the `_last_checkpoint` file -fn create_last_checkpoint_data( - engine: &dyn Engine, - metadata: &dyn EngineData, - version: i64, - total_actions_counter: i64, - add_actions_counter: i64, -) -> DeltaResult> { - // Validate metadata has exactly one row - if metadata.len() != 1 { - return Err(Error::checkpoint_writer(format!( - "Engine-collected checkpoint metadata should have exactly one row, found {}", - metadata.len() - ))); - } - - let last_checkpoint_exprs = [ - Expression::literal(version), - Expression::literal(total_actions_counter), - Expression::literal(1i64), // Single-file checkpoint - column_expr!("sizeInBytes"), - Expression::literal(add_actions_counter), - // TODO(#838): Include the checksum here - // TODO(#839): Include the schema here - ]; - let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); - - let last_checkpoint_metadata_evaluator = engine.evaluation_handler().new_expression_evaluator( - ENGINE_CHECKPOINT_METADATA_SCHEMA.clone(), - last_checkpoint_expr, - LAST_CHECKPOINT_SCHEMA.clone().into(), - ); - - last_checkpoint_metadata_evaluator.evaluate(metadata) -} - #[cfg(test)] mod unit_tests { use super::*; - use crate::arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema}; use crate::engine::{arrow_data::ArrowEngineData, sync::SyncEngine}; use crate::Table; - use arrow_53::array::Int64Array; use arrow_53::{array::RecordBatch, datatypes::Field}; use delta_kernel::arrow::array::create_array; use std::path::PathBuf; @@ -560,75 +460,4 @@ mod unit_tests { Ok(()) } - - #[test] - fn test_create_last_checkpoint_metadata() -> DeltaResult<()> { - // Setup test data - let size_in_bytes: i64 = 1024 * 1024; // 1MB - let version = 10; - let total_actions_counter = 100; - let add_actions_counter = 75; - let engine = SyncEngine::new(); - - // Create engine metadata with `size_in_bytes` - let schema = ArrowSchema::new(vec![Field::new("sizeInBytes", ArrowDataType::Int64, false)]); - let size_array = Int64Array::from(vec![size_in_bytes]); - let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; - let metadata = ArrowEngineData::new(record_batch); - - // Create last checkpoint metadata - let last_checkpoint_batch = create_last_checkpoint_data( - &engine, - &metadata, - version, - total_actions_counter, - add_actions_counter, - )?; - - // Verify the underlying EngineData contains the expected LastCheckpointInfo schema and data - let arrow_engine_data = ArrowEngineData::try_from_engine_data(last_checkpoint_batch)?; - let record_batch = arrow_engine_data.record_batch(); - - // Build the expected RecordBatch - let expected_schema = Arc::new(Schema::new(vec![ - Field::new("version", DataType::Int64, false), - Field::new("size", DataType::Int64, false), - Field::new("parts", DataType::Int64, true), - Field::new("sizeInBytes", DataType::Int64, true), - Field::new("numOfAddFiles", DataType::Int64, true), - ])); - let expected = RecordBatch::try_new( - expected_schema, - vec![ - create_array!(Int64, [version]), - create_array!(Int64, [total_actions_counter]), - create_array!(Int64, [1]), - create_array!(Int64, [size_in_bytes]), - create_array!(Int64, [add_actions_counter]), - ], - ) - .unwrap(); - - assert_eq!(*record_batch, expected); - Ok(()) - } - - #[test] - fn test_create_last_checkpoint_metadata_with_invalid_batch() -> DeltaResult<()> { - let engine = SyncEngine::new(); - - // Create engine metadata with the wrong schema - let schema = ArrowSchema::new(vec![Field::new("wrongField", ArrowDataType::Int64, false)]); - let size_array = Int64Array::from(vec![0]); - let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)]) - .expect("Failed to create record batch"); - let metadata = Box::new(ArrowEngineData::new(record_batch)); - - // This should fail because the schema does not match the expected schema - let res = create_last_checkpoint_data(&engine, &*metadata, 0, 0, 0); - - // Verify that an error is returned - assert!(res.is_err()); - Ok(()) - } } diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 3c0625ac8d..1b173d1cec 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,19 +1,11 @@ use crate::{ actions::{Add, Metadata, Protocol, Remove}, - engine::{ - arrow_data::ArrowEngineData, - default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, - }, + engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, utils::test_utils::Action, - DeltaResult, EngineData, Table, -}; -use arrow_53::{ - array::{Int64Array, RecordBatch}, - datatypes::{DataType, Field, Schema}, + DeltaResult, Table, }; use object_store::{memory::InMemory, path::Path, ObjectStore}; -use serde_json::{from_slice, json, Value}; -use std::sync::Arc; +use std::sync::{atomic::Ordering, Arc}; use test_utils::delta_path_for_version; use url::Url; @@ -52,24 +44,27 @@ fn write_commit_to_store( Ok(()) } +// TODO(#850): Uncomment when `finalize` is implemented /// Creates a metadata batch with size information for checkpoint -fn create_checkpoint_metadata_batch(size_in_bytes: i64) -> DeltaResult { - let schema = Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)]); - let size_array = Int64Array::from(vec![size_in_bytes]); - let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; - Ok(ArrowEngineData::new(record_batch)) -} +// fn create_checkpoint_metadata_batch(size_in_bytes: i64) -> DeltaResult { +// let schema = Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)]); +// let size_array = Int64Array::from(vec![size_in_bytes]); +// let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; +// Ok(ArrowEngineData::new(record_batch)) +// } + +// TODO(#850): Uncomment when `finalize` is implemented +// /// Reads the `_last_checkpoint` file from storage +// fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { +// let path = Path::from("_delta_log/_last_checkpoint"); +// let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); +// let byte_data = rt.block_on(async { +// let data = store.get(&path).await?; +// data.bytes().await +// })?; +// Ok(from_slice(&byte_data)?) +// } -/// Reads the `_last_checkpoint` file from storage -fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { - let path = Path::from("_delta_log/_last_checkpoint"); - let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); - let byte_data = rt.block_on(async { - let data = store.get(&path).await?; - data.bytes().await - })?; - Ok(from_slice(&byte_data)?) -} /// Create a Protocol action without v2Checkpoint feature support fn create_basic_protocol_action() -> Action { Action::Protocol( @@ -127,25 +122,26 @@ fn create_remove_action(path: &str) -> Action { }) } -/// Helper to verify the contents of the `_last_checkpoint` file -fn assert_last_checkpoint_contents( - store: &Arc, - expected_version: u64, - expected_size: u64, - expected_num_add_files: u64, - expected_size_in_bytes: i64, -) -> DeltaResult<()> { - let last_checkpoint_data = read_last_checkpoint_file(store)?; - let expected_data = json!({ - "version": expected_version, - "size": expected_size, - "parts": 1, - "sizeInBytes": expected_size_in_bytes, - "numOfAddFiles": expected_num_add_files, - }); - assert_eq!(last_checkpoint_data, expected_data); - Ok(()) -} +// TODO(#850): Uncomment when `finalize` is implemented +// /// Helper to verify the contents of the `_last_checkpoint` file +// fn assert_last_checkpoint_contents( +// store: &Arc, +// expected_version: u64, +// expected_size: u64, +// expected_num_add_files: u64, +// expected_size_in_bytes: i64, +// ) -> DeltaResult<()> { +// let last_checkpoint_data = read_last_checkpoint_file(store)?; +// let expected_data = json!({ +// "version": expected_version, +// "size": expected_size, +// "parts": 1, +// "sizeInBytes": expected_size_in_bytes, +// "numOfAddFiles": expected_num_add_files, +// }); +// assert_eq!(last_checkpoint_data, expected_data); +// Ok(()) +// } /// Tests the `checkpoint()` API with: /// - A table that does not support v2Checkpoint @@ -200,16 +196,20 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { // contain any true values, as the add action is removed in a following commit. assert!(data_iter.next().is_none()); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 4); + assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); + + // TODO(#850): Uncomment when `finalize` is implemented // Finalize and verify checkpoint metadata - let size_in_bytes = 10; - writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - assert_last_checkpoint_contents( - &store, - 2, // version: latest/last version in the log - 4, // size: 1 metadata + 1 protocol + 1 add action + 1 remove action - 1, // numOfAddFiles: from the 2nd commit (fake_path_2) - size_in_bytes, // sizeInBytes: passed to finalize (10) - )?; + // let size_in_bytes = 10; + // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + // assert_last_checkpoint_contents( + // &store, + // 2, // version: latest/last version in the log + // 4, // size: 1 metadata + 1 protocol + 1 add action + 1 remove action + // 1, // numOfAddFiles: from the 2nd commit (fake_path_2) + // size_in_bytes, // sizeInBytes: passed to finalize (10) + // )?; Ok(()) } @@ -260,16 +260,20 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { // No more data should exist because we only requested version 0 assert!(data_iter.next().is_none()); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 2); + assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 0); + + // TODO(#850): Uncomment when `finalize` is implemented // Finalize and verify - let size_in_bytes = 10; - writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - assert_last_checkpoint_contents( - &store, - 0, // version: specified version (0) - 2, // size: 1 protocol + 1 metadata from version 0 - 0, // numOfAddFiles: no add files in version 0 - size_in_bytes, // sizeInBytes: passed to finalize (10) - )?; + // let size_in_bytes = 10; + // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + // assert_last_checkpoint_contents( + // &store, + // 0, // version: specified version (0) + // 2, // size: 1 protocol + 1 metadata from version 0 + // 0, // numOfAddFiles: no add files in version 0 + // size_in_bytes, // sizeInBytes: passed to finalize (10) + // )?; Ok(()) } @@ -330,16 +334,20 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { // No more data should exist assert!(data_iter.next().is_none()); + assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 5); + assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); + + // TODO(#850): Uncomment when `finalize` is implemented // Finalize and verify - let size_in_bytes = 10; - writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - assert_last_checkpoint_contents( - &store, - 1, // version: latest version (1) with v2Checkpoint support - 5, // size: 1 metadata + 1 protocol + 1 add + 1 remove + 1 checkpointMetadata - 1, // numOfAddFiles: 1 add file from version 0 - size_in_bytes, // sizeInBytes: passed to finalize (10) - )?; + // let size_in_bytes = 10; + // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; + // assert_last_checkpoint_contents( + // &store, + // 1, // version: latest version (1) with v2Checkpoint support + // 5, // size: 1 metadata + 1 protocol + 1 add + 1 remove + 1 checkpointMetadata + // 1, // numOfAddFiles: 1 add file from version 0 + // size_in_bytes, // sizeInBytes: passed to finalize (10) + // )?; Ok(()) } From 362112102a0f456ad56f3e8467e9c2267d56faa5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 15:23:27 -0700 Subject: [PATCH 141/176] remove comments --- kernel/src/checkpoint/tests.rs | 81 ++-------------------------------- 1 file changed, 3 insertions(+), 78 deletions(-) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 1b173d1cec..40d7f915fb 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -44,27 +44,6 @@ fn write_commit_to_store( Ok(()) } -// TODO(#850): Uncomment when `finalize` is implemented -/// Creates a metadata batch with size information for checkpoint -// fn create_checkpoint_metadata_batch(size_in_bytes: i64) -> DeltaResult { -// let schema = Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)]); -// let size_array = Int64Array::from(vec![size_in_bytes]); -// let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)])?; -// Ok(ArrowEngineData::new(record_batch)) -// } - -// TODO(#850): Uncomment when `finalize` is implemented -// /// Reads the `_last_checkpoint` file from storage -// fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { -// let path = Path::from("_delta_log/_last_checkpoint"); -// let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); -// let byte_data = rt.block_on(async { -// let data = store.get(&path).await?; -// data.bytes().await -// })?; -// Ok(from_slice(&byte_data)?) -// } - /// Create a Protocol action without v2Checkpoint feature support fn create_basic_protocol_action() -> Action { Action::Protocol( @@ -122,27 +101,6 @@ fn create_remove_action(path: &str) -> Action { }) } -// TODO(#850): Uncomment when `finalize` is implemented -// /// Helper to verify the contents of the `_last_checkpoint` file -// fn assert_last_checkpoint_contents( -// store: &Arc, -// expected_version: u64, -// expected_size: u64, -// expected_num_add_files: u64, -// expected_size_in_bytes: i64, -// ) -> DeltaResult<()> { -// let last_checkpoint_data = read_last_checkpoint_file(store)?; -// let expected_data = json!({ -// "version": expected_version, -// "size": expected_size, -// "parts": 1, -// "sizeInBytes": expected_size_in_bytes, -// "numOfAddFiles": expected_num_add_files, -// }); -// assert_eq!(last_checkpoint_data, expected_data); -// Ok(()) -// } - /// Tests the `checkpoint()` API with: /// - A table that does not support v2Checkpoint /// - No version specified (latest version is used) @@ -199,18 +157,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 4); assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); - // TODO(#850): Uncomment when `finalize` is implemented - // Finalize and verify checkpoint metadata - // let size_in_bytes = 10; - // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - // assert_last_checkpoint_contents( - // &store, - // 2, // version: latest/last version in the log - // 4, // size: 1 metadata + 1 protocol + 1 add action + 1 remove action - // 1, // numOfAddFiles: from the 2nd commit (fake_path_2) - // size_in_bytes, // sizeInBytes: passed to finalize (10) - // )?; - + // TODO(#850): Finalize and verify _last_checkpoint Ok(()) } @@ -263,18 +210,7 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 2); assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 0); - // TODO(#850): Uncomment when `finalize` is implemented - // Finalize and verify - // let size_in_bytes = 10; - // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - // assert_last_checkpoint_contents( - // &store, - // 0, // version: specified version (0) - // 2, // size: 1 protocol + 1 metadata from version 0 - // 0, // numOfAddFiles: no add files in version 0 - // size_in_bytes, // sizeInBytes: passed to finalize (10) - // )?; - + // TODO(#850): Finalize and verify _last_checkpoint Ok(()) } @@ -337,18 +273,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 5); assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); - // TODO(#850): Uncomment when `finalize` is implemented - // Finalize and verify - // let size_in_bytes = 10; - // writer.finalize(&engine, &create_checkpoint_metadata_batch(size_in_bytes)?)?; - // assert_last_checkpoint_contents( - // &store, - // 1, // version: latest version (1) with v2Checkpoint support - // 5, // size: 1 metadata + 1 protocol + 1 add + 1 remove + 1 checkpointMetadata - // 1, // numOfAddFiles: 1 add file from version 0 - // size_in_bytes, // sizeInBytes: passed to finalize (10) - // )?; - + // TODO(#850): Finalize and verify _last_checkpoint Ok(()) } From 600cee6cc491f07bacfea4c006f1c9c1cacffc13 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 15 Apr 2025 15:32:56 -0700 Subject: [PATCH 142/176] doc --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 3f3daa9d16..4eef270e52 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -31,7 +31,7 @@ //! Handles the actual checkpoint data generation and writing process. It is created via the //! [`crate::table::Table::checkpoint`] method and provides the following APIs: //! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information -//! - TODO(#850): [`CheckpointWriter::finalize`] - Writes the `_last_checkpoint` file +//! - TODO(#850): `CheckpointWriter::finalize` - Writes the `_last_checkpoint` file //! //! ## Example: Writing a classic-named V1 checkpoint (no `v2Checkpoints` feature on test table) //! From 2371ed0d892f08c134b702dffede92ec3f2a1646 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 11:19:25 -0700 Subject: [PATCH 143/176] include issue --- kernel/src/checkpoint/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 40d7f915fb..b1549733a2 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -297,7 +297,7 @@ fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { // Should fail with an appropriate error // Returns error: "LogSegment end version 0 not the same as the specified end version 999" - // TODO(seb): Returned error should be tailored to checkpoint creation + // TODO(#854): Returned error should be tailored to checkpoint creation assert!(result.is_err()); Ok(()) From db386534f9e5d8f7c281ad337a537d4c13a93e18 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 11:29:31 -0700 Subject: [PATCH 144/176] merge fix --- kernel/src/table_configuration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index dc101c359b..fcb4393a63 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -269,7 +269,7 @@ impl TableConfiguration { /// both the protocol's readerFeatures and writerFeatures. /// /// See: - #[cfg_attr(feature = "internal-api", visibility::make(pub))] + #[internal_api] #[allow(unused)] // needed to compile w/o default features pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { let read_supported = self From d191912e33443d55b299aee36e5ceafcc4a9162a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 13:51:21 -0700 Subject: [PATCH 145/176] reviews --- ffi/src/error.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ffi/src/error.rs b/ffi/src/error.rs index c054c6e7a6..46e83b67b9 100644 --- a/ffi/src/error.rs +++ b/ffi/src/error.rs @@ -53,7 +53,7 @@ pub enum KernelError { ChangeDataFeedIncompatibleSchema, InvalidCheckpoint, LiteralExpressionTransformError, - CheckpointWriterError, + CheckpointWriteError, } impl From for KernelError { @@ -62,7 +62,7 @@ impl From for KernelError { // NOTE: By definition, no kernel Error maps to FFIError #[cfg(any(feature = "default-engine", feature = "sync-engine"))] Error::Arrow(_) => KernelError::ArrowError, - Error::CheckpointWriter(_) => KernelError::CheckpointWriterError, + Error::CheckpointWrite(_) => KernelError::CheckpointWriteError, Error::EngineDataType(_) => KernelError::EngineDataTypeError, Error::Extract(..) => KernelError::ExtractError, Error::Generic(_) => KernelError::GenericError, From 2ece8a39007af5f0dfe8a86fa52906d324ac41f3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 13:56:25 -0700 Subject: [PATCH 146/176] err handling --- kernel/src/checkpoint/log_replay.rs | 6 +- kernel/src/checkpoint/mod.rs | 90 ++++++++++++++++++++--------- kernel/src/error.rs | 6 +- 3 files changed, 70 insertions(+), 32 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index e80d8ce56d..1a46469ff5 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -100,9 +100,9 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Update the total actions and add actions counters. Relaxed ordering is sufficient - // here as we only care about the total count when writing the _last_checkpoint file. - // (the ordering is not important for correctness) + // Safe to use Relaxed here: + // "Incrementing a counter can be safely done by multiple threads using a relaxed fetch_add + // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks self.actions_count.fetch_add( visitor.file_actions_count + visitor.non_file_actions_count, Ordering::Relaxed, diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 4eef270e52..17787a06f7 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,8 +1,13 @@ //! # Delta Kernel Checkpoint API //! //! This module implements the API for writing checkpoints in delta tables. -//! Checkpoints provide a compact summary of the table state, enabling faster recovery by -//! avoiding full log replay. This API supports two checkpoint types: +//! Checkpoints allow readers to short-cut the cost of reading the entire log history by providing +//! the complete replay of all actions, up to and including the checkpointed version, with invalid +//! actions removed. Invalid actions are those that have been canceled out by subsequent ones (for +//! example removing a file that has been added), using the rules for reconciliation. +//! +//! ## Checkpoint Types +//! This API supports two checkpoint types: //! //! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support the //! `v2Checkpoints` reader/writer feature. These checkpoints follow the V1 specification and do not @@ -24,20 +29,21 @@ //! //! ## Architecture //! -//! - [`CheckpointWriter`] - Core component that manages checkpoint creation workflow +//! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow //! - [`CheckpointData`] - Contains the data to write and destination path information //! -//! ## [`CheckpointWriter`] -//! Handles the actual checkpoint data generation and writing process. It is created via the -//! [`crate::table::Table::checkpoint`] method and provides the following APIs: -//! - [`CheckpointWriter::checkpoint_data`] - Returns the checkpoint data and path information -//! - TODO(#850): `CheckpointWriter::finalize` - Writes the `_last_checkpoint` file +//! ## Usage Workflow +//! +//! 1. Create a [`CheckpointWriter`] using [`crate::table::Table::checkpoint`] +//! 2. Get checkpoint data and path with [`CheckpointWriter::checkpoint_data`] +//! 3. Write all data to the returned location +//! 4. TODO(#850) Finalize the checkpoint with `CheckpointWriter::finalize` + //! //! ## Example: Writing a classic-named V1 checkpoint (no `v2Checkpoints` feature on test table) //! //! ``` //! use std::sync::Arc; -//! use object_store::local::LocalFileSystem; //! use delta_kernel::{ //! checkpoint::CheckpointData, //! engine::arrow_data::ArrowEngineData, @@ -47,6 +53,7 @@ //! }; //! use delta_kernel::arrow::array::{Int64Array, RecordBatch}; //! use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; +//! use object_store::local::LocalFileSystem; //! //! fn mock_write_to_object_store(mut data: CheckpointData) -> DeltaResult { //! /* This should be replaced with actual object store write logic */ @@ -90,6 +97,10 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + use crate::actions::CHECKPOINT_METADATA_NAME; use crate::actions::{ schemas::GetStructField, Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, @@ -103,12 +114,9 @@ use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::snapshot::Snapshot; use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; use log_replay::CheckpointLogReplayProcessor; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::{ - sync::{Arc, LazyLock}, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; + use url::Url; + mod log_replay; #[cfg(test)] mod tests; @@ -137,7 +145,20 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| .into() }); -/// Represents a single-file checkpoint, including the data to write and the target path. +/// Represents the data needed to create a checkpoint file. +/// +/// Obtained from [`CheckpointWriter::checkpoint_data`], this struct provides both the +/// location where the checkpoint file should be written and an iterator over the data +/// that should be included in the checkpoint. +/// +/// # Fields +/// - `path`: The URL where the checkpoint file should be written. +/// - `data`: A boxed iterator that yields checkpoint actions as chunks of [`EngineData`]. +/// +/// # Usagea +/// 1. Write every action yielded by `data` to persistent storage at the URL specified by `path`. +/// 2. Ensure that all data is fully persisted before calling `CheckpointWriter::finalize`. +/// This is crucial to avoid data loss or corruption. pub struct CheckpointData { /// The URL where the checkpoint file should be written. pub path: Url, @@ -146,10 +167,25 @@ pub struct CheckpointData { pub data: Box>>, } -/// Manages the checkpoint writing process for tables +/// Orchestrates the process of creating and finalizing a checkpoint. /// -/// The [`CheckpointWriter`] orchestrates creating checkpoint data, and finalizing the -/// checkpoint by writing the `_last_checkpoint` file. +/// The [`CheckpointWriter`] is the entry point for generating checkpoint data for a Delta table. +/// It automatically selects the appropriate checkpoint format (V1/V2) based on the table's +/// feature support. +/// +/// # Usage Workflow +/// 1. Create a [`CheckpointWriter`] via [`crate::table::Table::checkpoint`]. +/// 2. Call [`CheckpointWriter::checkpoint_data`] to obtain a [`CheckpointData`] instance. +/// 3. Write out all actions from the [`CheckpointData::data`] iterator to the destination +/// specified by [`CheckpointData::path`]. +/// 4. After successfully writing all data, finalize the checkpoint by calling +/// [`CheckpointWriter::finalize`] to write the `_last_checkpoint` file. +/// +/// # Important Notes +/// - The checkpoint data must be fully written to persistent storage before calling `finalize()` +/// in step 3. Failing to do so may result in data loss or corruption. +/// - This API automatically selects the appropriate checkpoint format (V1/V2) based on the table's +/// `v2Checkpoints` feature support. pub struct CheckpointWriter { /// Reference to the snapshot of the table being checkpointed pub(crate) snapshot: Arc, @@ -163,7 +199,7 @@ pub struct CheckpointWriter { } impl CheckpointWriter { - /// Creates a new CheckpointWriter with the provided checkpoint data and counters + /// Creates a new CheckpointWriter from a snapshot pub(crate) fn new(snapshot: Arc) -> Self { Self { snapshot, @@ -209,7 +245,7 @@ impl CheckpointWriter { .process_actions_iter(actions); let version = self.snapshot.version().try_into().map_err(|e| { - Error::checkpoint_writer(format!( + Error::CheckpointWrite(format!( "Failed to convert checkpoint version from u64 {} to i64: {}", self.snapshot.version(), e @@ -294,8 +330,9 @@ impl CheckpointWriter { selection_vector: vec![true], // Include the action in the checkpoint }; - // Ordering does not matter as there are no other threads modifying this counter - // at this time (since we have not yet returned the iterator which performs the action counting) + // Safe to use Relaxed here: + // "Incrementing a counter can be safely done by multiple threads using a relaxed fetch_add + // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks self.total_actions_counter.fetch_add(1, Ordering::Relaxed); Ok(Some(Ok(result))) @@ -348,11 +385,12 @@ fn deleted_file_retention_timestamp_with_time( let now_ms: i64 = now_duration .as_millis() .try_into() - .map_err(|_| Error::checkpoint_writer("Current timestamp exceeds i64 millisecond range"))?; + .map_err(|_| Error::checkpoint_write("Current timestamp exceeds i64 millisecond range"))?; - let retention_ms: i64 = retention_duration.as_millis().try_into().map_err(|_| { - Error::checkpoint_writer("Retention duration exceeds i64 millisecond range") - })?; + let retention_ms: i64 = retention_duration + .as_millis() + .try_into() + .map_err(|_| Error::checkpoint_write("Retention duration exceeds i64 millisecond range"))?; // Simple subtraction - will produce negative values if retention > now Ok(now_ms - retention_ms) diff --git a/kernel/src/error.rs b/kernel/src/error.rs index a261e13a40..14b125ae98 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -35,7 +35,7 @@ pub enum Error { Arrow(ArrowError), #[error("Error writing checkpoint: {0}")] - CheckpointWriter(String), + CheckpointWrite(String), /// User tried to convert engine data to the wrong type #[error("Invalid engine data type. Could not convert to {0}")] @@ -211,8 +211,8 @@ pub enum Error { // Convenience constructors for Error types that take a String argument impl Error { - pub fn checkpoint_writer(msg: impl ToString) -> Self { - Self::CheckpointWriter(msg.to_string()) + pub fn checkpoint_write(msg: impl ToString) -> Self { + Self::CheckpointWrite(msg.to_string()) } pub fn generic_err(source: impl Into>) -> Self { From 3d2532dea959ce1bd0a8ec06ce645dd6ebfbeea4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 13:59:19 -0700 Subject: [PATCH 147/176] const --- kernel/src/checkpoint/mod.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 17787a06f7..f2ab15cc3a 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -121,6 +121,14 @@ mod log_replay; #[cfg(test)] mod tests; +const SECONDS_PER_MINUTE: u64 = 60; +const MINUTES_PER_HOUR: u64 = 60; +const HOURS_PER_DAY: u64 = 24; +const DAYS: u64 = 7; +/// The default retention period for deleted files in seconds. +/// This is set to 7 days, which is the default in delta-spark. +const DEFAULT_RETENTION_SECS: u64 = SECONDS_PER_MINUTE * MINUTES_PER_HOUR * HOURS_PER_DAY * DAYS; + /// Schema for extracting relevant actions from log files for checkpoint creation static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -379,7 +387,7 @@ fn deleted_file_retention_timestamp_with_time( ) -> DeltaResult { // Use provided retention duration or default (7 days) let retention_duration = - retention_duration.unwrap_or_else(|| Duration::from_secs(60 * 60 * 24 * 7)); + retention_duration.unwrap_or_else(|| Duration::from_secs(DEFAULT_RETENTION_SECS)); // Convert to milliseconds for remove action deletion_timestamp comparison let now_ms: i64 = now_duration From 2d5890a9fb5022ed360fef02aced29cefd40d9b7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 14:06:01 -0700 Subject: [PATCH 148/176] docs --- kernel/src/checkpoint/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index f2ab15cc3a..89863257e2 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -187,7 +187,7 @@ pub struct CheckpointData { /// 3. Write out all actions from the [`CheckpointData::data`] iterator to the destination /// specified by [`CheckpointData::path`]. /// 4. After successfully writing all data, finalize the checkpoint by calling -/// [`CheckpointWriter::finalize`] to write the `_last_checkpoint` file. +/// `CheckpointWriter::finalize`] to write the `_last_checkpoint` file. /// /// # Important Notes /// - The checkpoint data must be fully written to persistent storage before calling `finalize()` @@ -227,10 +227,10 @@ impl CheckpointWriter { /// (i.e., if `v2Checkpoints` feature is supported by table) /// 5. Generates the appropriate checkpoint path /// + /// # Returns: [`CheckpointData`] containing the checkpoint path and data to write + /// /// # Important: The returned data should be written to persistent storage by the /// caller before calling `finalize()` otherwise data loss may occur. - /// - /// # Returns: [`CheckpointData`] containing the checkpoint path and data to write pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { let is_v2_checkpoints_supported = self .snapshot From a69f3f07f795f7fb62b08148a3558d7ea2bdaeb5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 16 Apr 2025 14:23:42 -0700 Subject: [PATCH 149/176] issue track --- kernel/src/checkpoint/tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index b1549733a2..9874b91ca8 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -9,7 +9,7 @@ use std::sync::{atomic::Ordering, Arc}; use test_utils::delta_path_for_version; use url::Url; -/// TODO(seb): Merge copies and move to `test_utils` +/// TODO(#855): Merge copies and move to `test_utils` /// Create an in-memory store and return the store and the URL for the store's _delta_log directory. fn new_in_memory_store() -> (Arc, Url) { ( @@ -21,7 +21,7 @@ fn new_in_memory_store() -> (Arc, Url) { ) } -/// TODO(seb): Merge copies and move to `test_utils` +/// TODO(#855): Merge copies and move to `test_utils` /// Writes all actions to a _delta_log json commit file in the store. /// This function formats the provided filename into the _delta_log directory. fn write_commit_to_store( From e84a55b739b18846dba8b9b2be77d016f4e53a27 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 17 Apr 2025 11:19:40 -0700 Subject: [PATCH 150/176] docs --- kernel/src/checkpoint/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 89863257e2..2555c81bca 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -163,7 +163,7 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| /// - `path`: The URL where the checkpoint file should be written. /// - `data`: A boxed iterator that yields checkpoint actions as chunks of [`EngineData`]. /// -/// # Usagea +/// # Usage /// 1. Write every action yielded by `data` to persistent storage at the URL specified by `path`. /// 2. Ensure that all data is fully persisted before calling `CheckpointWriter::finalize`. /// This is crucial to avoid data loss or corruption. @@ -178,8 +178,8 @@ pub struct CheckpointData { /// Orchestrates the process of creating and finalizing a checkpoint. /// /// The [`CheckpointWriter`] is the entry point for generating checkpoint data for a Delta table. -/// It automatically selects the appropriate checkpoint format (V1/V2) based on the table's -/// feature support. +/// It automatically selects the appropriate checkpoint format (V1/V2) based on whether the table +/// supports the `v2Checkpoints` reader/writer feature. /// /// # Usage Workflow /// 1. Create a [`CheckpointWriter`] via [`crate::table::Table::checkpoint`]. @@ -280,14 +280,14 @@ impl CheckpointWriter { /// TODO(#850): Implement the finalize method /// - /// Finalizes the checkpoint writing process by creating the `_last_checkpoint` file + /// Finalizes the checkpoint writing. This function writes the `_last_checkpoint` file /// /// The `_last_checkpoint` file is a metadata file that contains information about the /// last checkpoint created for the table. It is used as a hint for the engine to quickly /// locate the last checkpoint and avoid full log replay when reading the table. /// /// # Important - /// This method must only be called AFTER successfully writing all checkpoint data to storage. + /// This method must only be called **after** successfully writing all checkpoint data to storage. /// Failure to do so may result in data loss. /// /// # Parameters From 457a95c740a0d98f571d8f0c105a065549067f6b Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 17 Apr 2025 16:36:15 -0700 Subject: [PATCH 151/176] review --- kernel/src/checkpoint/mod.rs | 264 +++++++++--------------------- kernel/src/checkpoint/tests.rs | 131 +++++++++++++-- kernel/src/error.rs | 2 +- kernel/src/table.rs | 11 +- kernel/src/table_configuration.rs | 4 +- 5 files changed, 203 insertions(+), 209 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 2555c81bca..af17359a97 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,10 +1,6 @@ -//! # Delta Kernel Checkpoint API +//! This module implements the API for writing single-file checkpoints in delta tables. //! -//! This module implements the API for writing checkpoints in delta tables. -//! Checkpoints allow readers to short-cut the cost of reading the entire log history by providing -//! the complete replay of all actions, up to and including the checkpointed version, with invalid -//! actions removed. Invalid actions are those that have been canceled out by subsequent ones (for -//! example removing a file that has been added), using the rules for reconciliation. +//! The entry-point for this API is [`Table::checkpoint`]. //! //! ## Checkpoint Types //! This API supports two checkpoint types: @@ -32,30 +28,28 @@ //! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow //! - [`CheckpointData`] - Contains the data to write and destination path information //! -//! ## Usage Workflow +//! ## Usage //! -//! 1. Create a [`CheckpointWriter`] using [`crate::table::Table::checkpoint`] +//! The following steps outline the process of creating a checkpoint: +//! +//! 1. Create a [`CheckpointWriter`] using [`Table::checkpoint`] //! 2. Get checkpoint data and path with [`CheckpointWriter::checkpoint_data`] //! 3. Write all data to the returned location -//! 4. TODO(#850) Finalize the checkpoint with `CheckpointWriter::finalize` - -//! -//! ## Example: Writing a classic-named V1 checkpoint (no `v2Checkpoints` feature on test table) +//! 4. Finalize the checkpoint with `CheckpointWriter::finalize` //! //! ``` -//! use std::sync::Arc; -//! use delta_kernel::{ -//! checkpoint::CheckpointData, -//! engine::arrow_data::ArrowEngineData, -//! engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, -//! table::Table, -//! DeltaResult, Error, -//! }; -//! use delta_kernel::arrow::array::{Int64Array, RecordBatch}; -//! use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; -//! use object_store::local::LocalFileSystem; -//! -//! fn mock_write_to_object_store(mut data: CheckpointData) -> DeltaResult { +//! # use std::sync::Arc; +//! # use delta_kernel::checkpoint::CheckpointData; +//! # use delta_kernel::engine::arrow_data::ArrowEngineData; +//! # use delta_kernel::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; +//! # use delta_kernel::table::Table; +//! # use delta_kernel::DeltaResult; +//! # use delta_kernel::Error; +//! # use delta_kernel::arrow::array::{Int64Array, RecordBatch}; +//! # use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; +//! # use object_store::local::LocalFileSystem; +//! // Example function which writes checkpoint data to storage +//! fn write_files(mut data: CheckpointData) -> DeltaResult { //! /* This should be replaced with actual object store write logic */ //! /* For demonstration, we manually create an EngineData batch with a dummy size */ //! let size = data.data.try_fold(0i64, |acc, r| r.map(|_| acc + 1))?; @@ -66,37 +60,42 @@ //! Ok(ArrowEngineData::new(batch)) //! } //! +//! // Create an engine instance //! let engine = DefaultEngine::new( //! Arc::new(LocalFileSystem::new()), //! Arc::new(TokioBackgroundExecutor::new()) //! ); +//! +//! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! -//! // Create a checkpoint writer for the table at a specific version +//! // Use table.checkpoint() to create a checkpoint writer +//! // (optionally specify a version to checkpoint) //! let mut writer = table.checkpoint(&engine, Some(1))?; //! //! // Write the checkpoint data to the object store and get the metadata -//! let metadata = mock_write_to_object_store(writer.checkpoint_data(&engine)?)?; +//! let metadata = write_files(writer.checkpoint_data(&engine)?)?; //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! //! // TODO(#850): Implement the finalize method -//! // Finalize the checkpoint. This call will write the _last_checkpoint file +//! // Finalize the checkpoint //! // writer.finalize(&engine, &metadata)?; //! //! # Ok::<_, Error>(()) //! ``` //! -//! ## Future extensions -//! - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be -//! implemented in the future. The current implementation only supports classic-named V2 checkpoints. -//! - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future -//! multi-file support, but the current implementation only supports single-file checkpoints. -//! -//! Note: Multi-file V1 checkpoints are DEPRECATED and UNSAFE. +//! ## Warning +//! Multi-part (V1) checkpoints are DEPRECATED and UNSAFE. //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint +//! [`Table::checkpoint`]: [`crate::table::Table::checkpoint`] +// Future extensions +// - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be +// implemented in the future. The current implementation only supports classic-named V2 checkpoints. +// - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future +// multi-file support, but the current implementation only supports single-file checkpoints. use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::{Arc, LazyLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -153,20 +152,15 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| .into() }); -/// Represents the data needed to create a checkpoint file. +/// Represents the data needed to create a single-file checkpoint. /// /// Obtained from [`CheckpointWriter::checkpoint_data`], this struct provides both the /// location where the checkpoint file should be written and an iterator over the data /// that should be included in the checkpoint. /// -/// # Fields -/// - `path`: The URL where the checkpoint file should be written. -/// - `data`: A boxed iterator that yields checkpoint actions as chunks of [`EngineData`]. -/// -/// # Usage -/// 1. Write every action yielded by `data` to persistent storage at the URL specified by `path`. -/// 2. Ensure that all data is fully persisted before calling `CheckpointWriter::finalize`. -/// This is crucial to avoid data loss or corruption. +/// # Warning +/// All data must be fully written to persistent storage before calling +/// `CheckpointWriter::finalize`. Failing to do so may result in data loss or corruption. pub struct CheckpointData { /// The URL where the checkpoint file should be written. pub path: Url, @@ -175,35 +169,30 @@ pub struct CheckpointData { pub data: Box>>, } -/// Orchestrates the process of creating and finalizing a checkpoint. +/// Orchestrates the process of creating a checkpoint for a table. /// /// The [`CheckpointWriter`] is the entry point for generating checkpoint data for a Delta table. /// It automatically selects the appropriate checkpoint format (V1/V2) based on whether the table /// supports the `v2Checkpoints` reader/writer feature. /// -/// # Usage Workflow -/// 1. Create a [`CheckpointWriter`] via [`crate::table::Table::checkpoint`]. -/// 2. Call [`CheckpointWriter::checkpoint_data`] to obtain a [`CheckpointData`] instance. -/// 3. Write out all actions from the [`CheckpointData::data`] iterator to the destination -/// specified by [`CheckpointData::path`]. -/// 4. After successfully writing all data, finalize the checkpoint by calling -/// `CheckpointWriter::finalize`] to write the `_last_checkpoint` file. +/// # Warning +/// The checkpoint data must be fully written to storage before calling `CheckpointWriter::finalize()`. +/// Failing to do so may result in data loss or corruption. /// -/// # Important Notes -/// - The checkpoint data must be fully written to persistent storage before calling `finalize()` -/// in step 3. Failing to do so may result in data loss or corruption. -/// - This API automatically selects the appropriate checkpoint format (V1/V2) based on the table's -/// `v2Checkpoints` feature support. +/// # See Also +/// See the [module-level documentation](self) for the complete checkpoint workflow +/// +/// [`Table::checkpoint`]: [`crate::table::Table::checkpoint`] pub struct CheckpointWriter { - /// Reference to the snapshot of the table being checkpointed + /// Reference to the snapshot (i.e. version) of the table being checkpointed pub(crate) snapshot: Arc, /// Note: `Arc` provides shared mutability for our counters, allowing the /// returned actions iterator from `.checkpoint_data()` to update the counters, /// and the [`CheckpointWriter`] to read them during `.finalize()` /// Counter for total actions included in the checkpoint - pub(crate) total_actions_counter: Arc, + pub(crate) actions_count: Arc, /// Counter for Add actions included in the checkpoint - pub(crate) add_actions_counter: Arc, + pub(crate) add_actions_count: Arc, } impl CheckpointWriter { @@ -211,31 +200,34 @@ impl CheckpointWriter { pub(crate) fn new(snapshot: Arc) -> Self { Self { snapshot, - total_actions_counter: Arc::new(AtomicI64::new(0)), - add_actions_counter: Arc::new(AtomicI64::new(0)), + actions_count: Arc::new(AtomicI64::new(0)), + add_actions_count: Arc::new(AtomicI64::new(0)), } } - /// Retrieves the checkpoint data and path information + /// Retrieves the checkpoint data and path information. /// - /// This method is the core of the checkpoint generation process. It: - /// 1. Determines whether to write a V1 or V2 checkpoint based on the table's - /// `v2Checkpoints` feature support - /// 2. Reads actions from the log segment using the checkpoint read schema - /// 3. Filters and deduplicates actions for the checkpoint - /// 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint - /// (i.e., if `v2Checkpoints` feature is supported by table) - /// 5. Generates the appropriate checkpoint path + /// This method generates the filtered actions for the checkpoint and determines + /// the appropriate destination path. /// - /// # Returns: [`CheckpointData`] containing the checkpoint path and data to write + /// # Returns + /// [`CheckpointData`] containing the checkpoint path and data to write. /// - /// # Important: The returned data should be written to persistent storage by the - /// caller before calling `finalize()` otherwise data loss may occur. + /// # Warning + /// All data must be written to persistent storage before calling `CheckpointWriter::finalize()`. + // This method is the core of the checkpoint generation process. It: + // 1. Determines whether to write a V1 or V2 checkpoint based on the table's + // `v2Checkpoints` feature support + // 2. Reads actions from the log segment using the checkpoint read schema + // 3. Filters and deduplicates actions for the checkpoint + // 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint + // (i.e., if `v2Checkpoints` feature is supported by table) + // 5. Generates the appropriate checkpoint path pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { let is_v2_checkpoints_supported = self .snapshot .table_configuration() - .is_v2_checkpoint_supported(); + .is_v2_checkpoint_write_supported(); let actions = self.snapshot.log_segment().read_actions( engine, @@ -246,8 +238,8 @@ impl CheckpointWriter { // Create iterator over actions for checkpoint data let checkpoint_data = CheckpointLogReplayProcessor::new( - self.total_actions_counter.clone(), - self.add_actions_counter.clone(), + self.actions_count.clone(), + self.add_actions_count.clone(), self.deleted_file_retention_timestamp()?, ) .process_actions_iter(actions); @@ -280,22 +272,22 @@ impl CheckpointWriter { /// TODO(#850): Implement the finalize method /// - /// Finalizes the checkpoint writing. This function writes the `_last_checkpoint` file + /// Finalize the checkpoint writing process. /// - /// The `_last_checkpoint` file is a metadata file that contains information about the - /// last checkpoint created for the table. It is used as a hint for the engine to quickly - /// locate the last checkpoint and avoid full log replay when reading the table. + /// Internally, this method writes a last checkpoint hint which contains metadata about the + /// written checkpoint. /// /// # Important /// This method must only be called **after** successfully writing all checkpoint data to storage. /// Failure to do so may result in data loss. /// /// # Parameters - /// - `engine`: The engine used for writing the `_last_checkpoint` file + /// - `engine`: Implementation of [`Engine`] apis. /// - `metadata`: A single-row, single-column [`EngineData`] batch containing: /// - `sizeInBytes` (i64): The size of the written checkpoint file /// - /// # Returns: [`variant@Ok`] if the `_last_checkpoint` file was written successfully + /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized + #[allow(unused)] fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { todo!("Implement the finalize method which will write the _last_checkpoint file") @@ -341,7 +333,7 @@ impl CheckpointWriter { // Safe to use Relaxed here: // "Incrementing a counter can be safely done by multiple threads using a relaxed fetch_add // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks - self.total_actions_counter.fetch_add(1, Ordering::Relaxed); + self.actions_count.fetch_add(1, Ordering::Relaxed); Ok(Some(Ok(result))) } @@ -403,107 +395,3 @@ fn deleted_file_retention_timestamp_with_time( // Simple subtraction - will produce negative values if retention > now Ok(now_ms - retention_ms) } - -#[cfg(test)] -mod unit_tests { - use super::*; - use crate::engine::{arrow_data::ArrowEngineData, sync::SyncEngine}; - use crate::Table; - use arrow_53::{array::RecordBatch, datatypes::Field}; - use delta_kernel::arrow::array::create_array; - use std::path::PathBuf; - use std::sync::atomic::Ordering; - use std::time::Duration; - - use crate::arrow::array::{ArrayRef, StructArray}; - use crate::arrow::datatypes::{DataType, Schema}; - - #[test] - fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { - let now = Duration::from_secs(1000).as_millis() as i64; - - // Test cases - let test_cases = [ - // Default case (7 days) - (None, now - (7 * 24 * 60 * 60 * 1000)), - // Zero retention - (Some(Duration::from_secs(0)), now), - // Custom retention (2000 seconds) - // This results in a negative timestamp which is valid - as it just means that - // the retention window extends to before UNIX epoch. - (Some(Duration::from_secs(2000)), now - (2000 * 1000)), - ]; - - for (retention, expected) in test_cases { - let result = - deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; - assert_eq!(result, expected); - } - - Ok(()) - } - - fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let table = Table::new(url); - Ok(Arc::new(table.snapshot(engine, None)?)) - } - - #[test] - fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> DeltaResult<()> { - let engine = SyncEngine::new(); - let version = 10; - let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); - - // Test with is_v2_checkpoint = true - let result = writer.create_checkpoint_metadata_batch(version, &engine, true)?; - assert!(result.is_some()); - let checkpoint_data = result.unwrap()?; - - // Check selection vector has one true value - assert_eq!(checkpoint_data.selection_vector, vec![true]); - - // Verify the underlying EngineData contains the expected CheckpointMetadata action - let arrow_engine_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?; - let record_batch = arrow_engine_data.record_batch(); - - // Build the expected RecordBatch - // Note: The schema is a struct with a single field "checkpointMetadata" of type struct - // containing a single field "version" of type long - let expected_schema = Arc::new(Schema::new(vec![Field::new( - "checkpointMetadata", - DataType::Struct(vec![Field::new("version", DataType::Int64, false)].into()), - false, - )])); - let expected = RecordBatch::try_new( - expected_schema, - vec![Arc::new(StructArray::from(vec![( - Arc::new(Field::new("version", DataType::Int64, false)), - create_array!(Int64, [version]) as ArrayRef, - )]))], - ) - .unwrap(); - - assert_eq!(*record_batch, expected); - assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 1); - - Ok(()) - } - - #[test] - fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_not_supported() -> DeltaResult<()> - { - let engine = SyncEngine::new(); - let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); - - // Test with is_v2_checkpoint = false - let result = writer.create_checkpoint_metadata_batch(10, &engine, false)?; - - // No checkpoint metadata action should be created for V1 checkpoints - assert!(result.is_none()); - assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 0); - - Ok(()) - } -} diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 9874b91ca8..d8413767a2 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,14 +1,121 @@ -use crate::{ - actions::{Add, Metadata, Protocol, Remove}, - engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, - utils::test_utils::Action, - DeltaResult, Table, +use std::{ + path::PathBuf, + sync::{atomic::Ordering, Arc}, + time::Duration, }; + +use crate::actions::{Add, Metadata, Protocol, Remove}; +use crate::arrow::array::{ArrayRef, StructArray}; +use crate::arrow::datatypes::{DataType, Schema}; +use crate::checkpoint::{deleted_file_retention_timestamp_with_time, CheckpointWriter}; +use crate::engine::arrow_data::ArrowEngineData; +use crate::engine::{ + default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + sync::SyncEngine, +}; +use crate::snapshot::Snapshot; +use crate::utils::test_utils::Action; +use crate::DeltaResult; +use crate::Engine; +use crate::Table; + +use arrow_53::{ + array::{create_array, RecordBatch}, + datatypes::Field, +}; + use object_store::{memory::InMemory, path::Path, ObjectStore}; -use std::sync::{atomic::Ordering, Arc}; use test_utils::delta_path_for_version; use url::Url; +#[test] +fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { + let now = Duration::from_secs(1000).as_millis() as i64; + + // Test cases + let test_cases = [ + // Default case (7 days) + (None, now - (7 * 24 * 60 * 60 * 1000)), + // Zero retention + (Some(Duration::from_secs(0)), now), + // Custom retention (2000 seconds) + // This results in a negative timestamp which is valid - as it just means that + // the retention window extends to before UNIX epoch. + (Some(Duration::from_secs(2000)), now - (2000 * 1000)), + ]; + + for (retention, expected) in test_cases { + let result = + deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; + assert_eq!(result, expected); + } + + Ok(()) +} + +fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let table = Table::new(url); + Ok(Arc::new(table.snapshot(engine, None)?)) +} + +#[test] +fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> DeltaResult<()> { + let engine = SyncEngine::new(); + let version = 10; + let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); + + // Test with is_v2_checkpoint = true + let result = writer.create_checkpoint_metadata_batch(version, &engine, true)?; + assert!(result.is_some()); + let checkpoint_data = result.unwrap()?; + + // Check selection vector has one true value + assert_eq!(checkpoint_data.selection_vector, vec![true]); + + // Verify the underlying EngineData contains the expected CheckpointMetadata action + let arrow_engine_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?; + let record_batch = arrow_engine_data.record_batch(); + + // Build the expected RecordBatch + // Note: The schema is a struct with a single field "checkpointMetadata" of type struct + // containing a single field "version" of type long + let expected_schema = Arc::new(Schema::new(vec![Field::new( + "checkpointMetadata", + DataType::Struct(vec![Field::new("version", DataType::Int64, false)].into()), + false, + )])); + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![( + Arc::new(Field::new("version", DataType::Int64, false)), + create_array!(Int64, [version]) as ArrayRef, + )]))], + ) + .unwrap(); + + assert_eq!(*record_batch, expected); + assert_eq!(writer.actions_count.load(Ordering::Relaxed), 1); + + Ok(()) +} + +#[test] +fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_not_supported() -> DeltaResult<()> { + let engine = SyncEngine::new(); + let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); + + // Test with is_v2_checkpoint = false + let result = writer.create_checkpoint_metadata_batch(10, &engine, false)?; + + // No checkpoint metadata action should be created for V1 checkpoints + assert!(result.is_none()); + assert_eq!(writer.actions_count.load(Ordering::Relaxed), 0); + + Ok(()) +} + /// TODO(#855): Merge copies and move to `test_utils` /// Create an in-memory store and return the store and the URL for the store's _delta_log directory. fn new_in_memory_store() -> (Arc, Url) { @@ -154,8 +261,8 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { // contain any true values, as the add action is removed in a following commit. assert!(data_iter.next().is_none()); - assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 4); - assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); + assert_eq!(writer.actions_count.load(Ordering::Relaxed), 4); + assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 1); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) @@ -207,8 +314,8 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { // No more data should exist because we only requested version 0 assert!(data_iter.next().is_none()); - assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 2); - assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 0); + assert_eq!(writer.actions_count.load(Ordering::Relaxed), 2); + assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 0); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) @@ -270,8 +377,8 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { // No more data should exist assert!(data_iter.next().is_none()); - assert_eq!(writer.total_actions_counter.load(Ordering::Relaxed), 5); - assert_eq!(writer.add_actions_counter.load(Ordering::Relaxed), 1); + assert_eq!(writer.actions_count.load(Ordering::Relaxed), 5); + assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 1); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) diff --git a/kernel/src/error.rs b/kernel/src/error.rs index 14b125ae98..876862725f 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -211,7 +211,7 @@ pub enum Error { // Convenience constructors for Error types that take a String argument impl Error { - pub fn checkpoint_write(msg: impl ToString) -> Self { + pub(crate) fn checkpoint_write(msg: impl ToString) -> Self { Self::CheckpointWrite(msg.to_string()) } diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 20a82c193f..33f078bb1b 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -100,14 +100,15 @@ impl Table { ) } - /// Creates a [`CheckpointWriter`] for generating table checkpoints at the specified version. - /// - /// The checkpoint type is automatically determined based on the table's feature support: - /// - Tables supporting `v2Checkpoints` feature -> Creates a Classic-named V2 checkpoint - /// - Tables not supporting `v2Checkpoints` feature -> Creates a Classic-named V1 checkpoint + /// Creates a [`CheckpointWriter`] for generating checkpoints at the specified table version. /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. + /// + /// # Parameters + /// - `engine`: Implementation of [`Engine`] apis. + /// - `version`: The version of the table to checkpoint. If [`None`], the latest version of the + /// table will be checkpointed. pub fn checkpoint( &self, engine: &dyn Engine, diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index fcb4393a63..e88da5eaab 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -269,9 +269,7 @@ impl TableConfiguration { /// both the protocol's readerFeatures and writerFeatures. /// /// See: - #[internal_api] - #[allow(unused)] // needed to compile w/o default features - pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { + pub(crate) fn is_v2_checkpoint_write_supported(&self) -> bool { let read_supported = self .protocol() .has_reader_feature(&ReaderFeature::V2Checkpoint) From 4ccf3801d55dc237bbee3601ed26391d477a1b94 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 17 Apr 2025 16:51:01 -0700 Subject: [PATCH 152/176] docs --- kernel/src/checkpoint/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index af17359a97..cd02c48a69 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,4 +1,4 @@ -//! This module implements the API for writing single-file checkpoints in delta tables. +//! This module implements the API for writing single-file checkpoints. //! //! The entry-point for this API is [`Table::checkpoint`]. //! @@ -90,7 +90,7 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint -//! [`Table::checkpoint`]: [`crate::table::Table::checkpoint`] +//! [`Table::checkpoint`]: crate::table::Table::checkpoint // Future extensions // - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be // implemented in the future. The current implementation only supports classic-named V2 checkpoints. @@ -287,7 +287,6 @@ impl CheckpointWriter { /// - `sizeInBytes` (i64): The size of the written checkpoint file /// /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized - #[allow(unused)] fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { todo!("Implement the finalize method which will write the _last_checkpoint file") From ca6dbe3a6b1d948f7c2b3db276ca3e6e802e8d52 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 18 Apr 2025 10:43:17 -0700 Subject: [PATCH 153/176] checkpont metadata --- kernel/src/checkpoint/mod.rs | 24 +++++++++--------------- kernel/src/checkpoint/tests.rs | 21 ++------------------- 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index cd02c48a69..1c9267eb15 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -253,11 +253,10 @@ impl CheckpointWriter { })?; // Chain the checkpoint metadata action if using V2 checkpoints - let chained = checkpoint_data.chain(self.create_checkpoint_metadata_batch( - version, - engine, - is_v2_checkpoints_supported, - )?); + let chained = checkpoint_data.chain( + is_v2_checkpoints_supported + .then(|| Ok(self.create_checkpoint_metadata_batch(version, engine)?)), + ); let checkpoint_path = ParsedLogPath::new_classic_parquet_checkpoint( self.snapshot.table_root(), @@ -294,10 +293,9 @@ impl CheckpointWriter { /// Creates the checkpoint metadata action for V2 checkpoints. /// - /// For V2 checkpoints, this function generates the [`CheckpointMetadata`] action - /// that must be included in the V2 spec checkpoint file. This action contains metadata - /// about the checkpoint, particularly its version. For V1 checkpoints, this function - /// returns `None`, as the V1 checkpoint schema does not include this action type. + /// This function generates the [`CheckpointMetadata`] action that must be included in the + /// V2 spec checkpoint file. This action contains metadata about the checkpoint, particularly + /// its version. /// /// # Implementation Details /// @@ -313,11 +311,7 @@ impl CheckpointWriter { &self, version: i64, engine: &dyn Engine, - is_v2_checkpoint: bool, - ) -> DeltaResult>> { - if !is_v2_checkpoint { - return Ok(None); - } + ) -> DeltaResult { let values: &[Scalar] = &[version.into()]; let checkpoint_metadata_batch = engine @@ -334,7 +328,7 @@ impl CheckpointWriter { // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks self.actions_count.fetch_add(1, Ordering::Relaxed); - Ok(Some(Ok(result))) + Ok(result) } /// Calculates the cutoff timestamp for deleted file cleanup. diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index d8413767a2..900478e72e 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -61,15 +61,13 @@ fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { } #[test] -fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> DeltaResult<()> { +fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let engine = SyncEngine::new(); let version = 10; let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); // Test with is_v2_checkpoint = true - let result = writer.create_checkpoint_metadata_batch(version, &engine, true)?; - assert!(result.is_some()); - let checkpoint_data = result.unwrap()?; + let checkpoint_data = writer.create_checkpoint_metadata_batch(version, &engine)?; // Check selection vector has one true value assert_eq!(checkpoint_data.selection_vector, vec![true]); @@ -101,21 +99,6 @@ fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_is_supported() -> D Ok(()) } -#[test] -fn test_create_checkpoint_metadata_batch_when_v2_checkpoints_not_supported() -> DeltaResult<()> { - let engine = SyncEngine::new(); - let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); - - // Test with is_v2_checkpoint = false - let result = writer.create_checkpoint_metadata_batch(10, &engine, false)?; - - // No checkpoint metadata action should be created for V1 checkpoints - assert!(result.is_none()); - assert_eq!(writer.actions_count.load(Ordering::Relaxed), 0); - - Ok(()) -} - /// TODO(#855): Merge copies and move to `test_utils` /// Create an in-memory store and return the store and the URL for the store's _delta_log directory. fn new_in_memory_store() -> (Arc, Url) { From ecaa9b07ee7bbdbf09ceb32615a2804aeb065875 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 18 Apr 2025 14:25:23 -0700 Subject: [PATCH 154/176] channel instead of arc for action counts --- kernel/src/checkpoint/log_replay.rs | 103 ++++++++--------- kernel/src/checkpoint/mod.rs | 164 +++++++++++++++++++++------- kernel/src/checkpoint/tests.rs | 53 +++++---- 3 files changed, 198 insertions(+), 122 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 1a46469ff5..0595c4b558 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -25,42 +25,31 @@ //! the overall process. For each batch of log actions, it: //! 1. Creates a visitor with the current deduplication state //! 2. Applies the visitor to filter actions in the batch -//! 3. Updates counters and state for cross-batch deduplication -//! 4. Produces a [`FilteredEngineData`] result which includes a selection vector indicating which -//! actions should be included in the checkpoint file +//! 3. Tracks state for deduplication across batches +//! 4. Produces a [`CheckpointBatch`] result which includes both the filtered data and counts of +//! actions selected for the checkpoint file //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; +use crate::log_replay::{ + FileActionDeduplicator, FileActionKey, HasSelectionVector, LogReplayProcessor, +}; use crate::scan::data_skipping::DataSkippingFilter; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, EngineData, Error}; use std::collections::HashSet; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; /// The [`CheckpointLogReplayProcessor`] is an implementation of the [`LogReplayProcessor`] /// trait that filters log segment actions for inclusion in a V1 spec checkpoint file. This /// processor is leveraged when creating a single-file V2 checkpoint as the V2 spec schema is /// a superset of the V1 spec schema, with the addition of a [`CheckpointMetadata`] action. -/// -/// It processes each action batch via the `process_actions_batch` method, using the -/// [`CheckpointVisitor`] to build an accompanying selection vector indicating which actions -/// should be included in the checkpoint. -#[allow(unused)] // TODO(seb): Remove once checkpoint api is implemented pub(crate) struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. seen_file_keys: HashSet, - // Arc provides shared mutability for our counters, allowing both the - // iterator to update the values during processing and the caller to observe the final - // counts afterward. The counters are i64 to match the `_last_checkpoint` file schema. - // Tracks the total number of actions included in the checkpoint file. - actions_count: Arc, - // Tracks the total number of add actions included in the checkpoint file. - add_actions_count: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, /// Indicates whether a metadata action has been seen in the log. @@ -71,11 +60,31 @@ pub(crate) struct CheckpointLogReplayProcessor { minimum_file_retention_timestamp: i64, } +/// This struct is the output of the [`CheckpointLogReplayProcessor`]. +/// +/// It contains the filtered batch of actions to be included in the checkpoint, +/// along with statistics about the number of actions filtered for inclusion. +pub(crate) struct CheckpointBatch { + /// The filtered batch of actions to be included in the checkpoint. + pub(crate) filtered_data: FilteredEngineData, + /// The number of actions in the batch filtered for inclusion in the checkpoint. + pub(crate) actions_count: i64, + /// The number of add actions in the batch filtered for inclusion in the checkpoint. + pub(crate) add_actions_count: i64, +} + +impl HasSelectionVector for CheckpointBatch { + fn has_selected_rows(&self) -> bool { + self.filtered_data.has_selected_rows() + } +} + impl LogReplayProcessor for CheckpointLogReplayProcessor { - type Output = FilteredEngineData; + type Output = CheckpointBatch; /// Processes a batch of actions read from the log during reverse chronological replay - /// and returns a filtered batch ([`FilteredEngineData`]) to be included in the checkpoint. + /// and returns a [`CheckpointBatch`], which contains the filtered actions to be + /// included in the checkpoint file, along with statistics about the included actions. /// /// This method delegates the filtering logic to the [`CheckpointVisitor`], which implements /// the deduplication rules described in the module documentation. The method tracks @@ -100,23 +109,19 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - // Safe to use Relaxed here: - // "Incrementing a counter can be safely done by multiple threads using a relaxed fetch_add - // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks - self.actions_count.fetch_add( - visitor.file_actions_count + visitor.non_file_actions_count, - Ordering::Relaxed, - ); - self.add_actions_count - .fetch_add(visitor.add_actions_count, Ordering::Relaxed); - // Update protocol and metadata seen flags self.seen_protocol = visitor.seen_protocol; self.seen_metadata = visitor.seen_metadata; - Ok(FilteredEngineData { + let filtered_data = FilteredEngineData { data: batch, selection_vector: visitor.selection_vector, + }; + + Ok(CheckpointBatch { + filtered_data, + actions_count: visitor.non_file_actions_count + visitor.file_actions_count, + add_actions_count: visitor.add_actions_count, }) } @@ -127,16 +132,9 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { } impl CheckpointLogReplayProcessor { - #[allow(unused)] // TODO(seb): Remove once checkpoint api is implemented - pub(crate) fn new( - actions_count: Arc, - add_actions_count: Arc, - minimum_file_retention_timestamp: i64, - ) -> Self { + pub(crate) fn new(minimum_file_retention_timestamp: i64) -> Self { Self { seen_file_keys: Default::default(), - actions_count, - add_actions_count, seen_protocol: false, seen_metadata: false, seen_txns: Default::default(), @@ -478,23 +476,18 @@ mod tests { fn run_checkpoint_test( input_batches: Vec<(Box, bool)>, ) -> DeltaResult<(Vec, i64, i64)> { - let actions_count = Arc::new(AtomicI64::new(0)); - let add_actions_count = Arc::new(AtomicI64::new(0)); - let results: Vec<_> = CheckpointLogReplayProcessor::new( - actions_count.clone(), - add_actions_count.clone(), - 0, // minimum_file_retention_timestamp - ) - .process_actions_iter(input_batches.into_iter().map(Ok)) - .collect::>>()?; - - Ok(( - results, - actions_count.load(Ordering::Relaxed), - add_actions_count.load(Ordering::Relaxed), - )) + let processed_batches = CheckpointLogReplayProcessor::new(0) + .process_actions_iter(input_batches.into_iter().map(Ok)) + .collect::>>()?; + let total_count: i64 = processed_batches.iter().map(|b| b.actions_count).sum(); + let add_count: i64 = processed_batches.iter().map(|b| b.add_actions_count).sum(); + let filtered_data = processed_batches + .into_iter() + .map(|b| b.filtered_data) + .collect(); + + Ok((filtered_data, total_count, add_count)) } - #[test] fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 1c9267eb15..c81b506678 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -26,7 +26,8 @@ //! ## Architecture //! //! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow -//! - [`CheckpointData`] - Contains the data to write and destination path information +//! - [`CheckpointData`] - Wraps the [`CheckpointDataIterator`] and destination path information +//! - [`CheckpointDataIterator`] - Iterator over the checkpoint data to be written //! //! ## Usage //! @@ -96,7 +97,7 @@ // implemented in the future. The current implementation only supports classic-named V2 checkpoints. // - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future // multi-file support, but the current implementation only supports single-file checkpoints. -use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::{Arc, LazyLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -112,7 +113,7 @@ use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::snapshot::Snapshot; use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; -use log_replay::CheckpointLogReplayProcessor; +use log_replay::{CheckpointBatch, CheckpointLogReplayProcessor}; use url::Url; @@ -152,6 +153,80 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| .into() }); +/// This struct is used to send the total action counts from the [`CheckpointDataIterator`] +/// to the [`CheckpointWriter`] when the iterator is dropped over a [`channel`]. The counts are +/// used to populate the `_last_checkpoint` file on [`CheckpointWriter::finalize`]. +#[allow(unused)] // TODO(#850): Read when implementing finalize +struct CheckpointCounts { + /// Total number of actions included in the checkpoint + actions_count: i64, + /// Number of add actions included in the checkpoint + add_actions_count: i64, +} + +/// An iterator over the checkpoint data to be written to the file. +/// +/// This iterator yields filtered checkpoint data batches ([`FilteredEngineData`]) and +/// tracks action statistics required for finalizing the checkpoint. It must be fully consumed +/// before calling [`CheckpointWriter::finalize`], or finalization will fail. Furthermore, +/// the yielded data must be written to the specified path before finalization, or it will +/// may result in data loss and corruption. +/// +/// On drop, sends accumulated action counts to the writer for metadata recording. +pub struct CheckpointDataIterator { + /// The inner iterator that yields the checkpoint data with counts + inner: Box>>, + /// Channel sender for action counts + counts_tx: Sender, + /// Running total of actions included in the checkpoint + actions_count: i64, + /// Running total of add actions included in the checkpoint + add_actions_count: i64, +} + +impl Iterator for CheckpointDataIterator { + type Item = DeltaResult; + + /// Advances the iterator and returns the next value. + /// + /// This implementation transforms the [`CheckpointBatch`] items from the inner iterator into + /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts for + /// each batch. When the iterator is dropped, it sends the accumulated counts to the [`CheckpointWriter`] + /// through a [`channel`] to be later used in [`CheckpointWriter::finalize`]. + fn next(&mut self) -> Option { + // Get the next batch from the inner iterator + self.inner.next().map(|result| { + result.map(|batch| { + // Accumulate counts + self.actions_count += batch.actions_count; + self.add_actions_count += batch.add_actions_count; + + // Return just the filtered data + batch.filtered_data + }) + }) + } +} + +impl Drop for CheckpointDataIterator { + /// Sends accumulated action counts when the iterator is dropped. + /// + /// This method is called automatically when the iterator goes out of scope, + /// which happens either when it is fully consumed or when it is explicitly dropped. + /// The accumulated counts are sent to the [`CheckpointWriter`] through a channel, + /// where they are used during finalization to write the `_last_checkpoint` file. + fn drop(&mut self) { + // Send accumulated counts when the iterator is dropped + let counts = CheckpointCounts { + actions_count: self.actions_count, + add_actions_count: self.add_actions_count, + }; + + // Ignore send errors - they will be handled on the receiving end + let _ = self.counts_tx.send(counts); + } +} + /// Represents the data needed to create a single-file checkpoint. /// /// Obtained from [`CheckpointWriter::checkpoint_data`], this struct provides both the @@ -159,14 +234,14 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| /// that should be included in the checkpoint. /// /// # Warning -/// All data must be fully written to persistent storage before calling +/// The [`CheckpointDataIterator`] must be fully consumed to ensure proper collection of statistics for +/// the checkpoint. Additionally, all yielded data must be written to the specified path before calling /// `CheckpointWriter::finalize`. Failing to do so may result in data loss or corruption. pub struct CheckpointData { /// The URL where the checkpoint file should be written. pub path: Url, - /// An iterator over the checkpoint data to be written to the file. - pub data: Box>>, + pub data: CheckpointDataIterator, } /// Orchestrates the process of creating a checkpoint for a table. @@ -186,22 +261,20 @@ pub struct CheckpointData { pub struct CheckpointWriter { /// Reference to the snapshot (i.e. version) of the table being checkpointed pub(crate) snapshot: Arc, - /// Note: `Arc` provides shared mutability for our counters, allowing the - /// returned actions iterator from `.checkpoint_data()` to update the counters, - /// and the [`CheckpointWriter`] to read them during `.finalize()` - /// Counter for total actions included in the checkpoint - pub(crate) actions_count: Arc, - /// Counter for Add actions included in the checkpoint - pub(crate) add_actions_count: Arc, + /// Channel receiver for action counts from the [`CheckpointDataIterator`] + counts_rx: Receiver, + /// Channel sender for action counts for the [`CheckpointDataIterator`] + counts_tx: Sender, } impl CheckpointWriter { /// Creates a new CheckpointWriter from a snapshot pub(crate) fn new(snapshot: Arc) -> Self { + let (counts_tx, counts_rx) = channel(); Self { snapshot, - actions_count: Arc::new(AtomicI64::new(0)), - add_actions_count: Arc::new(AtomicI64::new(0)), + counts_rx, + counts_tx, } } @@ -237,12 +310,9 @@ impl CheckpointWriter { )?; // Create iterator over actions for checkpoint data - let checkpoint_data = CheckpointLogReplayProcessor::new( - self.actions_count.clone(), - self.add_actions_count.clone(), - self.deleted_file_retention_timestamp()?, - ) - .process_actions_iter(actions); + let checkpoint_data = + CheckpointLogReplayProcessor::new(self.deleted_file_retention_timestamp()?) + .process_actions_iter(actions); let version = self.snapshot.version().try_into().map_err(|e| { Error::CheckpointWrite(format!( @@ -263,22 +333,27 @@ impl CheckpointWriter { self.snapshot.version(), )?; + // Wrap the data iterator to send counts to the CheckpointWriter when dropped + let wrapped_iterator = CheckpointDataIterator { + inner: Box::new(chained), + counts_tx: self.counts_tx.clone(), + actions_count: 0, + add_actions_count: 0, + }; + Ok(CheckpointData { path: checkpoint_path.location, - data: Box::new(chained), + data: wrapped_iterator, }) } /// TODO(#850): Implement the finalize method /// - /// Finalize the checkpoint writing process. - /// - /// Internally, this method writes a last checkpoint hint which contains metadata about the - /// written checkpoint. + /// Finalizes checkpoint creation after verifying all data is persisted. /// - /// # Important - /// This method must only be called **after** successfully writing all checkpoint data to storage. - /// Failure to do so may result in data loss. + /// This method **must** be called only after: + /// 1. The checkpoint data iterator has been fully consumed + /// 2. All data has been successfully written to object storage /// /// # Parameters /// - `engine`: Implementation of [`Engine`] apis. @@ -288,7 +363,19 @@ impl CheckpointWriter { /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { - todo!("Implement the finalize method which will write the _last_checkpoint file") + // The method validates iterator consumption, but can not garuntee data persistence. + match self.counts_rx.try_recv() { + Ok(counts) => { + // Write the _last_checkpoint file with the action counts + todo!("Implement the finalize method which will write the _last_checkpoint file") + } + Err(_) => { + // The iterator wasn't fully consumed, which means not all data was written + return Err(Error::checkpoint_write( + "Checkpoint data iterator was not fully consumed before finalization", + )); + } + } } /// Creates the checkpoint metadata action for V2 checkpoints. @@ -304,31 +391,30 @@ impl CheckpointWriter { /// include the additional metadata field `tags` when map support is added. /// /// # Returns: - /// A [`FilteredEngineData`] batch including the single-row [`EngineData`] batch along with + /// A [`CheckpointBatch`] batch including the single-row [`EngineData`] batch along with /// an accompanying selection vector with a single `true` value, indicating the action in /// batch should be included in the checkpoint. fn create_checkpoint_metadata_batch( &self, version: i64, engine: &dyn Engine, - ) -> DeltaResult { + ) -> DeltaResult { let values: &[Scalar] = &[version.into()]; let checkpoint_metadata_batch = engine .evaluation_handler() .create_one(CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), values)?; - let result = FilteredEngineData { + let filtered_data = FilteredEngineData { data: checkpoint_metadata_batch, selection_vector: vec![true], // Include the action in the checkpoint }; - // Safe to use Relaxed here: - // "Incrementing a counter can be safely done by multiple threads using a relaxed fetch_add - // if you're not using the counter to synchronize any other accesses." – Rust Atomics and Locks - self.actions_count.fetch_add(1, Ordering::Relaxed); - - Ok(result) + Ok(CheckpointBatch { + filtered_data, + actions_count: 1, + add_actions_count: 0, + }) } /// Calculates the cutoff timestamp for deleted file cleanup. diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 900478e72e..fa04c470e4 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,8 +1,4 @@ -use std::{ - path::PathBuf, - sync::{atomic::Ordering, Arc}, - time::Duration, -}; +use std::{path::PathBuf, sync::Arc, time::Duration}; use crate::actions::{Add, Metadata, Protocol, Remove}; use crate::arrow::array::{ArrayRef, StructArray}; @@ -66,14 +62,14 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let version = 10; let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); - // Test with is_v2_checkpoint = true - let checkpoint_data = writer.create_checkpoint_metadata_batch(version, &engine)?; + let checkpoint_batch = writer.create_checkpoint_metadata_batch(version, &engine)?; // Check selection vector has one true value - assert_eq!(checkpoint_data.selection_vector, vec![true]); + assert_eq!(checkpoint_batch.filtered_data.selection_vector, vec![true]); // Verify the underlying EngineData contains the expected CheckpointMetadata action - let arrow_engine_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?; + let arrow_engine_data = + ArrowEngineData::try_from_engine_data(checkpoint_batch.filtered_data.data)?; let record_batch = arrow_engine_data.record_batch(); // Build the expected RecordBatch @@ -94,7 +90,8 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { .unwrap(); assert_eq!(*record_batch, expected); - assert_eq!(writer.actions_count.load(Ordering::Relaxed), 1); + assert_eq!(checkpoint_batch.actions_count, 1); + assert_eq!(checkpoint_batch.add_actions_count, 0); Ok(()) } @@ -233,19 +230,19 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { ); // The first batch should be the metadata and protocol actions. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true, true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true, true]); // The second batch should only include the add action as the remove action is expired. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true, true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true, true]); // The third batch should not be included as the selection vector does not // contain any true values, as the add action is removed in a following commit. assert!(data_iter.next().is_none()); - assert_eq!(writer.actions_count.load(Ordering::Relaxed), 4); - assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 1); + assert_eq!(data_iter.actions_count, 4); + assert_eq!(data_iter.add_actions_count, 1); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) @@ -291,14 +288,14 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { ); // The first batch should be the metadata and protocol actions. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true, true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true, true]); // No more data should exist because we only requested version 0 assert!(data_iter.next().is_none()); - assert_eq!(writer.actions_count.load(Ordering::Relaxed), 2); - assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 0); + assert_eq!(data_iter.actions_count, 2); + assert_eq!(data_iter.add_actions_count, 0); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) @@ -346,22 +343,22 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { ); // The first batch should be the metadata and protocol actions. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true, true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true, true]); // The second batch should be the add action as the remove action is expired. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true, true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true, true]); // The third batch should be the CheckpointMetaData action. - let checkpoint_data = data_iter.next().unwrap()?; - assert_eq!(checkpoint_data.selection_vector, [true]); + let batch = data_iter.next().unwrap()?; + assert_eq!(batch.selection_vector, [true]); // No more data should exist assert!(data_iter.next().is_none()); - assert_eq!(writer.actions_count.load(Ordering::Relaxed), 5); - assert_eq!(writer.add_actions_count.load(Ordering::Relaxed), 1); + assert_eq!(data_iter.actions_count, 5); + assert_eq!(data_iter.add_actions_count, 1); // TODO(#850): Finalize and verify _last_checkpoint Ok(()) From f3679e0df146a18882c23a0353207e70378b3e8c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 18 Apr 2025 14:39:20 -0700 Subject: [PATCH 155/176] docs & fix --- kernel/src/checkpoint/mod.rs | 10 +-- kernel/tests/write_checkpoints.rs | 106 ++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 kernel/tests/write_checkpoints.rs diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index c81b506678..38e44d673b 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -168,7 +168,7 @@ struct CheckpointCounts { /// /// This iterator yields filtered checkpoint data batches ([`FilteredEngineData`]) and /// tracks action statistics required for finalizing the checkpoint. It must be fully consumed -/// before calling [`CheckpointWriter::finalize`], or finalization will fail. Furthermore, +/// before calling `CheckpointWriter::finalize`, or finalization will fail. Furthermore, /// the yielded data must be written to the specified path before finalization, or it will /// may result in data loss and corruption. /// @@ -189,7 +189,7 @@ impl Iterator for CheckpointDataIterator { /// Advances the iterator and returns the next value. /// - /// This implementation transforms the [`CheckpointBatch`] items from the inner iterator into + /// This implementation transforms the `CheckpointBatch` items from the inner iterator into /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts for /// each batch. When the iterator is dropped, it sends the accumulated counts to the [`CheckpointWriter`] /// through a [`channel`] to be later used in [`CheckpointWriter::finalize`]. @@ -325,7 +325,7 @@ impl CheckpointWriter { // Chain the checkpoint metadata action if using V2 checkpoints let chained = checkpoint_data.chain( is_v2_checkpoints_supported - .then(|| Ok(self.create_checkpoint_metadata_batch(version, engine)?)), + .then(|| self.create_checkpoint_metadata_batch(version, engine)), ); let checkpoint_path = ParsedLogPath::new_classic_parquet_checkpoint( @@ -371,9 +371,9 @@ impl CheckpointWriter { } Err(_) => { // The iterator wasn't fully consumed, which means not all data was written - return Err(Error::checkpoint_write( + Err(Error::checkpoint_write( "Checkpoint data iterator was not fully consumed before finalization", - )); + )) } } } diff --git a/kernel/tests/write_checkpoints.rs b/kernel/tests/write_checkpoints.rs new file mode 100644 index 0000000000..8b85bfeea2 --- /dev/null +++ b/kernel/tests/write_checkpoints.rs @@ -0,0 +1,106 @@ +// use std::path::PathBuf; +// use std::sync::Arc; + +// use delta_kernel::arrow::array::RecordBatch; +// use delta_kernel::arrow::datatypes::Schema; +// use delta_kernel::engine::arrow_data::ArrowEngineData; +// use delta_kernel::engine::sync::SyncEngine; +// use delta_kernel::{arrow, DeltaResult, Engine, Table}; + +// mod common; +// use common::{load_test_data, read_scan}; +// use itertools::Itertools; +// use parquet_53::arrow::ArrowWriter; + +// fn read_table(test_name: &str) -> DeltaResult> { +// let path = std::fs::canonicalize(PathBuf::from(format!("./tests/data/{}/", test_name)))?; +// let table = Table::try_from_uri(path.to_string_lossy().as_ref())?; +// let engine = Arc::new(SyncEngine::new()); +// let snapshot = table.snapshot(engine.as_ref(), None)?; +// let scan = snapshot.into_scan_builder().build()?; +// let batches = read_scan(&scan, engine)?; + +// Ok(batches) +// } + +// fn checkpoint_table(table_name: &str, checkpoint_version: u64) -> DeltaResult<()> { +// let path = std::fs::canonicalize(PathBuf::from(format!("./tests/data/{}/", table_name)))?; +// let table = Table::try_from_uri(path.to_string_lossy().as_ref())?; +// let engine = SyncEngine::new(); + +// let mut writer = table.checkpoint(&engine, Some(checkpoint_version))?; + +// // Get the checkpoint data +// let checkpoint_data = writer.checkpoint_data(&engine)?; + +// // Convert URL to filesystem path for local implementation +// let fs_path = std::path::Path::new(checkpoint_data.path.as_str()) +// .strip_prefix("file:/") +// .unwrap(); + +// // Create parent directories +// if let Some(parent) = fs_path.parent() { +// std::fs::create_dir_all(parent)?; +// } + +// use std::fs::File; + +// // Create a Parquet writer with appropriate schema +// // Note: In a real implementation, you would use the schema from the data +// let schema = Schema::new(vec![]); +// let file = File::create(fs_path)?; +// let mut parquet_writer = ArrowWriter::try_new(file, schema.into(), None)?; + +// // Write the data to the Parquet file +// let mut size_in_bytes: i64 = 0; +// for data_result in checkpoint_data.data { +// let filtered_data = data_result?; + +// // Only write data that matches the selection vector (true values) +// if filtered_data +// .selection_vector +// .iter() +// .any(|&selected| selected) +// { +// // Get the actual engine data +// let data = filtered_data.data; + +// // Convert the engine data to a record batch and write it +// if let Some(arrow_batch) = data +// .into_any() +// .downcast::() +// .unwrap() +// .into() +// { +// // Write the batch to the Parquet file +// parquet_writer.write(arrow_batch.record_batch())?; +// } +// } +// } + +// // Close the writer and flush to disk +// parquet_writer.close()?; + +// Ok(()) +// } + +// fn test_table(table_name: &str, latest_version: u64) -> DeltaResult<()> { +// let mut expected = read_table(table_name)?; +// sort_lines!(expected); + +// for version in 0..latest_version { +// // Checkpoint at each version +// checkpoint_table(table_name, version)?; + +// let result = read_table(table_name)?; +// assert_batches_sorted_eq!(expected, &result); +// } + +// Ok(()) +// } + +// #[test] +// fn v2_checkpoints_json_with_sidecars() -> DeltaResult<()> { +// test_table("app-txn-no-checkpoint", 1); +// Ok(()) +// } From a6c3bf41d7da0e7ea427df4a129fba2fd944fb24 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 18 Apr 2025 17:57:16 -0700 Subject: [PATCH 156/176] review --- kernel/src/checkpoint/log_replay.rs | 9 +++-- kernel/src/checkpoint/mod.rs | 63 ++++++++++++++++------------- kernel/src/checkpoint/tests.rs | 20 +++++++++ kernel/src/snapshot.rs | 2 +- kernel/src/table_configuration.rs | 6 +-- 5 files changed, 63 insertions(+), 37 deletions(-) diff --git a/kernel/src/checkpoint/log_replay.rs b/kernel/src/checkpoint/log_replay.rs index 0595c4b558..19d20c82fc 100644 --- a/kernel/src/checkpoint/log_replay.rs +++ b/kernel/src/checkpoint/log_replay.rs @@ -461,10 +461,13 @@ impl RowVisitor for CheckpointVisitor<'_> { #[cfg(test)] mod tests { + use std::collections::HashSet; + use super::*; use crate::arrow::array::StringArray; use crate::utils::test_utils::{action_batch, parse_json_batch}; - use std::collections::HashSet; + + use itertools::Itertools; /// Helper function to create test batches from JSON strings fn create_batch(json_strings: Vec<&str>) -> DeltaResult<(Box, bool)> { @@ -476,9 +479,9 @@ mod tests { fn run_checkpoint_test( input_batches: Vec<(Box, bool)>, ) -> DeltaResult<(Vec, i64, i64)> { - let processed_batches = CheckpointLogReplayProcessor::new(0) + let processed_batches: Vec<_> = CheckpointLogReplayProcessor::new(0) .process_actions_iter(input_batches.into_iter().map(Ok)) - .collect::>>()?; + .try_collect()?; let total_count: i64 = processed_batches.iter().map(|b| b.actions_count).sum(); let add_count: i64 = processed_batches.iter().map(|b| b.add_actions_count).sum(); let filtered_data = processed_batches diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 38e44d673b..6d60d40d16 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -2,27 +2,17 @@ //! //! The entry-point for this API is [`Table::checkpoint`]. //! -//! ## Checkpoint Types -//! This API supports two checkpoint types: +//! ## Checkpoint Types and Selection Logic +//! This API supports two checkpoint types, selected based on table features: //! -//! 1. **Single-file Classic-named V1 Checkpoint** – for legacy tables that do not support the -//! `v2Checkpoints` reader/writer feature. These checkpoints follow the V1 specification and do not -//! include a [`CheckpointMetadata`] action. -//! 2. **Single-file Classic-named V2 Checkpoint** – for tables supporting the `v2Checkpoints` feature. -//! These checkpoints follow the V2 specification and include a [`CheckpointMetadata`] action, while -//! maintaining backwards compatibility by using classic-naming that legacy readers can recognize. +//! | Table Feature | Resulting Checkpoint Type | Description | +//! |------------------|-------------------------------|-----------------------------------------------------------------------------| +//! | No v2Checkpoints | Single-file Classic-named V1 | Follows V1 specification without [`CheckpointMetadata`] action | +//! | v2Checkpoints | Single-file Classic-named V2 | Follows V2 specification with [`CheckpointMetadata`] action while maintaining backward compatibility via classic naming | //! //! For more information on the V1/V2 specifications, see the following protocol section: //! //! -//! ## Checkpoint Selection Logic -//! The checkpoint type is determined by whether the table supports the `v2Checkpoints` reader/writer feature: -//! -//! | Table Feature | Resulting Checkpoint Type | -//! |------------------|-------------------------------| -//! | No v2Checkpoints | Single-file Classic-named V1 | -//! | v2Checkpoints | Single-file Classic-named V2 | -//! //! ## Architecture //! //! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow @@ -50,7 +40,7 @@ //! # use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; //! # use object_store::local::LocalFileSystem; //! // Example function which writes checkpoint data to storage -//! fn write_files(mut data: CheckpointData) -> DeltaResult { +//! fn write_checkpoint_file(mut data: CheckpointData) -> DeltaResult { //! /* This should be replaced with actual object store write logic */ //! /* For demonstration, we manually create an EngineData batch with a dummy size */ //! let size = data.data.try_fold(0i64, |acc, r| r.map(|_| acc + 1))?; @@ -75,7 +65,7 @@ //! let mut writer = table.checkpoint(&engine, Some(1))?; //! //! // Write the checkpoint data to the object store and get the metadata -//! let metadata = write_files(writer.checkpoint_data(&engine)?)?; +//! let metadata = write_checkpoint_file(writer.checkpoint_data(&engine)?)?; //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! @@ -167,10 +157,15 @@ struct CheckpointCounts { /// An iterator over the checkpoint data to be written to the file. /// /// This iterator yields filtered checkpoint data batches ([`FilteredEngineData`]) and -/// tracks action statistics required for finalizing the checkpoint. It must be fully consumed -/// before calling `CheckpointWriter::finalize`, or finalization will fail. Furthermore, -/// the yielded data must be written to the specified path before finalization, or it will -/// may result in data loss and corruption. +/// tracks action statistics required for finalizing the checkpoint. +/// +/// # Warning +/// This iterator MUST be fully consumed before it is dropped. If the iterator is dropped +/// before being fully consumed, it will panic with the message: +/// "CheckpointDataIterator was dropped before being fully consumed". +/// +/// Additionally, all yielded data must be written to the specified path before calling +/// `CheckpointWriter::finalize`, or it may result in data loss and corruption. /// /// On drop, sends accumulated action counts to the writer for metadata recording. pub struct CheckpointDataIterator { @@ -182,6 +177,8 @@ pub struct CheckpointDataIterator { actions_count: i64, /// Running total of add actions included in the checkpoint add_actions_count: i64, + /// Flag indicating whether the iterator has been fully consumed + fully_consumed: bool, } impl Iterator for CheckpointDataIterator { @@ -194,14 +191,17 @@ impl Iterator for CheckpointDataIterator { /// each batch. When the iterator is dropped, it sends the accumulated counts to the [`CheckpointWriter`] /// through a [`channel`] to be later used in [`CheckpointWriter::finalize`]. fn next(&mut self) -> Option { - // Get the next batch from the inner iterator - self.inner.next().map(|result| { + let next_item = self.inner.next(); + + // Check if the iterator is fully consumed + if next_item.is_none() { + self.fully_consumed = true; + } + + next_item.map(|result| { result.map(|batch| { - // Accumulate counts self.actions_count += batch.actions_count; self.add_actions_count += batch.add_actions_count; - - // Return just the filtered data batch.filtered_data }) }) @@ -216,7 +216,11 @@ impl Drop for CheckpointDataIterator { /// The accumulated counts are sent to the [`CheckpointWriter`] through a channel, /// where they are used during finalization to write the `_last_checkpoint` file. fn drop(&mut self) { - // Send accumulated counts when the iterator is dropped + assert!( + self.fully_consumed, + "CheckpointDataIterator was dropped before being fully consumed" + ); + let counts = CheckpointCounts { actions_count: self.actions_count, add_actions_count: self.add_actions_count, @@ -339,6 +343,7 @@ impl CheckpointWriter { counts_tx: self.counts_tx.clone(), actions_count: 0, add_actions_count: 0, + fully_consumed: false, }; Ok(CheckpointData { @@ -363,7 +368,7 @@ impl CheckpointWriter { /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { - // The method validates iterator consumption, but can not garuntee data persistence. + // The method validates iterator consumption, but can not guaruntee data persistence. match self.counts_rx.try_recv() { Ok(counts) => { // Write the _last_checkpoint file with the action counts diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index fa04c470e4..aa9637e517 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -389,3 +389,23 @@ fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { Ok(()) } + +#[test] +#[should_panic(expected = "CheckpointDataIterator was dropped before being fully consumed")] +fn test_checkpoint_data_iterator_panics_when_dropped_and_not_consumed() { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + write_commit_to_store( + &store, + vec![create_basic_protocol_action(), create_metadata_action()], + 0, + ) + .unwrap(); + let table_root = Url::parse("memory:///").unwrap(); + let table = Table::new(table_root); + let mut writer = table.checkpoint(&engine, None).unwrap(); + let _checkpoint_data_iterator = writer.checkpoint_data(&engine).unwrap().data; + + // We do not call `next()` at all. The iterator will panic when it goes out of scope + // at the end of this test as it is not fully consumed. +} diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index c95bbd6592..d872be16a7 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -19,7 +19,7 @@ use url::Url; /// Name of the _last_checkpoint file that provides metadata about the last checkpoint /// created for the table. This file is used as a hint for the engine to quickly locate -/// the last checkpoint and avoid full log replay when reading the table. +/// the latest checkpoint without a full directory listing. pub(crate) const LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint"; // TODO expose methods for accessing the files of a table (with file pruning). diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index e88da5eaab..524439af66 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -272,12 +272,10 @@ impl TableConfiguration { pub(crate) fn is_v2_checkpoint_write_supported(&self) -> bool { let read_supported = self .protocol() - .has_reader_feature(&ReaderFeature::V2Checkpoint) - && self.protocol.min_reader_version() == 3; + .has_reader_feature(&ReaderFeature::V2Checkpoint); let write_supported = self .protocol() - .has_writer_feature(&WriterFeature::V2Checkpoint) - && self.protocol.min_writer_version() == 7; + .has_writer_feature(&WriterFeature::V2Checkpoint); read_supported && write_supported } } From 6d9c5cb0cdfaae55a670221f54ab3dc1de37c0f5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 10:10:39 -0700 Subject: [PATCH 157/176] remove test file & spelling --- kernel/src/checkpoint/mod.rs | 2 +- kernel/tests/write_checkpoints.rs | 106 ------------------------------ 2 files changed, 1 insertion(+), 107 deletions(-) delete mode 100644 kernel/tests/write_checkpoints.rs diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 6d60d40d16..3db9ce3e32 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -368,7 +368,7 @@ impl CheckpointWriter { /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { - // The method validates iterator consumption, but can not guaruntee data persistence. + // The method validates iterator consumption, but can not guarantee data persistence. match self.counts_rx.try_recv() { Ok(counts) => { // Write the _last_checkpoint file with the action counts diff --git a/kernel/tests/write_checkpoints.rs b/kernel/tests/write_checkpoints.rs deleted file mode 100644 index 8b85bfeea2..0000000000 --- a/kernel/tests/write_checkpoints.rs +++ /dev/null @@ -1,106 +0,0 @@ -// use std::path::PathBuf; -// use std::sync::Arc; - -// use delta_kernel::arrow::array::RecordBatch; -// use delta_kernel::arrow::datatypes::Schema; -// use delta_kernel::engine::arrow_data::ArrowEngineData; -// use delta_kernel::engine::sync::SyncEngine; -// use delta_kernel::{arrow, DeltaResult, Engine, Table}; - -// mod common; -// use common::{load_test_data, read_scan}; -// use itertools::Itertools; -// use parquet_53::arrow::ArrowWriter; - -// fn read_table(test_name: &str) -> DeltaResult> { -// let path = std::fs::canonicalize(PathBuf::from(format!("./tests/data/{}/", test_name)))?; -// let table = Table::try_from_uri(path.to_string_lossy().as_ref())?; -// let engine = Arc::new(SyncEngine::new()); -// let snapshot = table.snapshot(engine.as_ref(), None)?; -// let scan = snapshot.into_scan_builder().build()?; -// let batches = read_scan(&scan, engine)?; - -// Ok(batches) -// } - -// fn checkpoint_table(table_name: &str, checkpoint_version: u64) -> DeltaResult<()> { -// let path = std::fs::canonicalize(PathBuf::from(format!("./tests/data/{}/", table_name)))?; -// let table = Table::try_from_uri(path.to_string_lossy().as_ref())?; -// let engine = SyncEngine::new(); - -// let mut writer = table.checkpoint(&engine, Some(checkpoint_version))?; - -// // Get the checkpoint data -// let checkpoint_data = writer.checkpoint_data(&engine)?; - -// // Convert URL to filesystem path for local implementation -// let fs_path = std::path::Path::new(checkpoint_data.path.as_str()) -// .strip_prefix("file:/") -// .unwrap(); - -// // Create parent directories -// if let Some(parent) = fs_path.parent() { -// std::fs::create_dir_all(parent)?; -// } - -// use std::fs::File; - -// // Create a Parquet writer with appropriate schema -// // Note: In a real implementation, you would use the schema from the data -// let schema = Schema::new(vec![]); -// let file = File::create(fs_path)?; -// let mut parquet_writer = ArrowWriter::try_new(file, schema.into(), None)?; - -// // Write the data to the Parquet file -// let mut size_in_bytes: i64 = 0; -// for data_result in checkpoint_data.data { -// let filtered_data = data_result?; - -// // Only write data that matches the selection vector (true values) -// if filtered_data -// .selection_vector -// .iter() -// .any(|&selected| selected) -// { -// // Get the actual engine data -// let data = filtered_data.data; - -// // Convert the engine data to a record batch and write it -// if let Some(arrow_batch) = data -// .into_any() -// .downcast::() -// .unwrap() -// .into() -// { -// // Write the batch to the Parquet file -// parquet_writer.write(arrow_batch.record_batch())?; -// } -// } -// } - -// // Close the writer and flush to disk -// parquet_writer.close()?; - -// Ok(()) -// } - -// fn test_table(table_name: &str, latest_version: u64) -> DeltaResult<()> { -// let mut expected = read_table(table_name)?; -// sort_lines!(expected); - -// for version in 0..latest_version { -// // Checkpoint at each version -// checkpoint_table(table_name, version)?; - -// let result = read_table(table_name)?; -// assert_batches_sorted_eq!(expected, &result); -// } - -// Ok(()) -// } - -// #[test] -// fn v2_checkpoints_json_with_sidecars() -> DeltaResult<()> { -// test_table("app-txn-no-checkpoint", 1); -// Ok(()) -// } From affe0b5b22a0481e05c7deb0e5c32a9709d9cdc1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 11:30:42 -0700 Subject: [PATCH 158/176] reviews --- kernel/src/checkpoint/mod.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 3db9ce3e32..2be0471d2b 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -114,33 +114,30 @@ mod tests; const SECONDS_PER_MINUTE: u64 = 60; const MINUTES_PER_HOUR: u64 = 60; const HOURS_PER_DAY: u64 = 24; -const DAYS: u64 = 7; /// The default retention period for deleted files in seconds. /// This is set to 7 days, which is the default in delta-spark. -const DEFAULT_RETENTION_SECS: u64 = SECONDS_PER_MINUTE * MINUTES_PER_HOUR * HOURS_PER_DAY * DAYS; +const DEFAULT_RETENTION_SECS: u64 = 7 * HOURS_PER_DAY * MINUTES_PER_HOUR * SECONDS_PER_MINUTE; /// Schema for extracting relevant actions from log files for checkpoint creation static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { - StructType::new([ + Arc::new(StructType::new([ Option::::get_struct_field(ADD_NAME), Option::::get_struct_field(REMOVE_NAME), Option::::get_struct_field(METADATA_NAME), Option::::get_struct_field(PROTOCOL_NAME), Option::::get_struct_field(SET_TRANSACTION_NAME), Option::::get_struct_field(SIDECAR_NAME), - ]) - .into() + ])) }); // Schema of the [`CheckpointMetadata`] action that is included in V2 checkpoints // We cannot use `CheckpointMetadata::to_schema()` as it would include the 'tags' field which // we're not supporting yet due to the lack of map support. static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| { - StructType::new([StructField::not_null( + Arc::new(StructType::new([StructField::not_null( CHECKPOINT_METADATA_NAME, DataType::struct_type([StructField::not_null("version", DataType::LONG)]), - )]) - .into() + )])) }); /// This struct is used to send the total action counts from the [`CheckpointDataIterator`] @@ -404,11 +401,10 @@ impl CheckpointWriter { version: i64, engine: &dyn Engine, ) -> DeltaResult { - let values: &[Scalar] = &[version.into()]; - - let checkpoint_metadata_batch = engine - .evaluation_handler() - .create_one(CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), values)?; + let checkpoint_metadata_batch = engine.evaluation_handler().create_one( + CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), + &[Scalar::from(version)], + )?; let filtered_data = FilteredEngineData { data: checkpoint_metadata_batch, From 37e7beaffb89462ef2575a899504ff56218cf23a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 14:02:52 -0700 Subject: [PATCH 159/176] review - remove channels, iterator as param, FileMeta instead of EnginData --- kernel/src/checkpoint/mod.rs | 138 ++++++++------------------------- kernel/src/checkpoint/tests.rs | 20 ----- 2 files changed, 32 insertions(+), 126 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 2be0471d2b..7c344eb62b 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -28,50 +28,39 @@ //! 3. Write all data to the returned location //! 4. Finalize the checkpoint with `CheckpointWriter::finalize` //! -//! ``` +//! ```no_run //! # use std::sync::Arc; //! # use delta_kernel::checkpoint::CheckpointData; -//! # use delta_kernel::engine::arrow_data::ArrowEngineData; -//! # use delta_kernel::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; +//! # use delta_kernel::checkpoint::CheckpointWriter; //! # use delta_kernel::table::Table; //! # use delta_kernel::DeltaResult; //! # use delta_kernel::Error; -//! # use delta_kernel::arrow::array::{Int64Array, RecordBatch}; -//! # use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; -//! # use object_store::local::LocalFileSystem; -//! // Example function which writes checkpoint data to storage -//! fn write_checkpoint_file(mut data: CheckpointData) -> DeltaResult { -//! /* This should be replaced with actual object store write logic */ -//! /* For demonstration, we manually create an EngineData batch with a dummy size */ -//! let size = data.data.try_fold(0i64, |acc, r| r.map(|_| acc + 1))?; -//! let batch = RecordBatch::try_new( -//! Arc::new(Schema::new(vec![Field::new("sizeInBytes", DataType::Int64, false)])), -//! vec![Arc::new(Int64Array::from(vec![size]))], -//! )?; -//! Ok(ArrowEngineData::new(batch)) +//! # use delta_kernel::FileMeta; +//! # use url::Url; +//! fn write_checkpoint_file(checkpoint_data: &CheckpointData) -> DeltaResult { +//! todo!() /* engine-specific logic to write checkpoint_data.data to checkpoint_data.path */ //! } //! //! // Create an engine instance -//! let engine = DefaultEngine::new( -//! Arc::new(LocalFileSystem::new()), -//! Arc::new(TokioBackgroundExecutor::new()) -//! ); +//! let engine = todo!(); //! //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! //! // Use table.checkpoint() to create a checkpoint writer //! // (optionally specify a version to checkpoint) -//! let mut writer = table.checkpoint(&engine, Some(1))?; +//! let mut writer: CheckpointWriter = table.checkpoint(&engine, Some(1))?; +//! +//! // Get the checkpoint data and path +//! let checkpoint_data = writer.checkpoint_data(&engine)? //! -//! // Write the checkpoint data to the object store and get the metadata -//! let metadata = write_checkpoint_file(writer.checkpoint_data(&engine)?)?; +//! // Write the checkpoint data to the object store and collect metadata +//! let metadata: FileMeta = write_checkpoint_file(&checkpoint_data)?; //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! //! // TODO(#850): Implement the finalize method -//! // Finalize the checkpoint -//! // writer.finalize(&engine, &metadata)?; +//! // writer.finalize(&engine, &metadata, checkpoint_data.data)?; //! //! # Ok::<_, Error>(()) //! ``` @@ -87,7 +76,6 @@ // implemented in the future. The current implementation only supports classic-named V2 checkpoints. // - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future // multi-file support, but the current implementation only supports single-file checkpoints. -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::{Arc, LazyLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -102,7 +90,7 @@ use crate::log_replay::LogReplayProcessor; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::snapshot::Snapshot; -use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension}; +use crate::{DeltaResult, Engine, Error, EvaluationHandlerExtension, FileMeta}; use log_replay::{CheckpointBatch, CheckpointLogReplayProcessor}; use url::Url; @@ -140,42 +128,21 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| )])) }); -/// This struct is used to send the total action counts from the [`CheckpointDataIterator`] -/// to the [`CheckpointWriter`] when the iterator is dropped over a [`channel`]. The counts are -/// used to populate the `_last_checkpoint` file on [`CheckpointWriter::finalize`]. -#[allow(unused)] // TODO(#850): Read when implementing finalize -struct CheckpointCounts { - /// Total number of actions included in the checkpoint - actions_count: i64, - /// Number of add actions included in the checkpoint - add_actions_count: i64, -} - /// An iterator over the checkpoint data to be written to the file. /// /// This iterator yields filtered checkpoint data batches ([`FilteredEngineData`]) and /// tracks action statistics required for finalizing the checkpoint. /// /// # Warning -/// This iterator MUST be fully consumed before it is dropped. If the iterator is dropped -/// before being fully consumed, it will panic with the message: -/// "CheckpointDataIterator was dropped before being fully consumed". -/// /// Additionally, all yielded data must be written to the specified path before calling /// `CheckpointWriter::finalize`, or it may result in data loss and corruption. -/// -/// On drop, sends accumulated action counts to the writer for metadata recording. pub struct CheckpointDataIterator { /// The inner iterator that yields the checkpoint data with counts inner: Box>>, - /// Channel sender for action counts - counts_tx: Sender, /// Running total of actions included in the checkpoint actions_count: i64, /// Running total of add actions included in the checkpoint add_actions_count: i64, - /// Flag indicating whether the iterator has been fully consumed - fully_consumed: bool, } impl Iterator for CheckpointDataIterator { @@ -185,16 +152,11 @@ impl Iterator for CheckpointDataIterator { /// /// This implementation transforms the `CheckpointBatch` items from the inner iterator into /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts for - /// each batch. When the iterator is dropped, it sends the accumulated counts to the [`CheckpointWriter`] - /// through a [`channel`] to be later used in [`CheckpointWriter::finalize`]. + /// each batch. The [`CheckpointDataIterator`] is passed back to the kernel on call to + /// `CheckpointWriter::finalize` for counts to be read and written to the `_last_checkpoint` file fn next(&mut self) -> Option { let next_item = self.inner.next(); - // Check if the iterator is fully consumed - if next_item.is_none() { - self.fully_consumed = true; - } - next_item.map(|result| { result.map(|batch| { self.actions_count += batch.actions_count; @@ -205,29 +167,6 @@ impl Iterator for CheckpointDataIterator { } } -impl Drop for CheckpointDataIterator { - /// Sends accumulated action counts when the iterator is dropped. - /// - /// This method is called automatically when the iterator goes out of scope, - /// which happens either when it is fully consumed or when it is explicitly dropped. - /// The accumulated counts are sent to the [`CheckpointWriter`] through a channel, - /// where they are used during finalization to write the `_last_checkpoint` file. - fn drop(&mut self) { - assert!( - self.fully_consumed, - "CheckpointDataIterator was dropped before being fully consumed" - ); - - let counts = CheckpointCounts { - actions_count: self.actions_count, - add_actions_count: self.add_actions_count, - }; - - // Ignore send errors - they will be handled on the receiving end - let _ = self.counts_tx.send(counts); - } -} - /// Represents the data needed to create a single-file checkpoint. /// /// Obtained from [`CheckpointWriter::checkpoint_data`], this struct provides both the @@ -262,21 +201,12 @@ pub struct CheckpointData { pub struct CheckpointWriter { /// Reference to the snapshot (i.e. version) of the table being checkpointed pub(crate) snapshot: Arc, - /// Channel receiver for action counts from the [`CheckpointDataIterator`] - counts_rx: Receiver, - /// Channel sender for action counts for the [`CheckpointDataIterator`] - counts_tx: Sender, } impl CheckpointWriter { /// Creates a new CheckpointWriter from a snapshot pub(crate) fn new(snapshot: Arc) -> Self { - let (counts_tx, counts_rx) = channel(); - Self { - snapshot, - counts_rx, - counts_tx, - } + Self { snapshot } } /// Retrieves the checkpoint data and path information. @@ -337,10 +267,8 @@ impl CheckpointWriter { // Wrap the data iterator to send counts to the CheckpointWriter when dropped let wrapped_iterator = CheckpointDataIterator { inner: Box::new(chained), - counts_tx: self.counts_tx.clone(), actions_count: 0, add_actions_count: 0, - fully_consumed: false, }; Ok(CheckpointData { @@ -359,25 +287,23 @@ impl CheckpointWriter { /// /// # Parameters /// - `engine`: Implementation of [`Engine`] apis. - /// - `metadata`: A single-row, single-column [`EngineData`] batch containing: - /// - `sizeInBytes` (i64): The size of the written checkpoint file + /// - `metadata`: The metadata of the written checkpoint file + /// - `checkpoint_data_iter`: The exhausted checkpoint data iterator (must be fully consumed) + /// /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] - fn finalize(self, _engine: &dyn Engine, _metadata: &dyn EngineData) -> DeltaResult<()> { - // The method validates iterator consumption, but can not guarantee data persistence. - match self.counts_rx.try_recv() { - Ok(counts) => { - // Write the _last_checkpoint file with the action counts - todo!("Implement the finalize method which will write the _last_checkpoint file") - } - Err(_) => { - // The iterator wasn't fully consumed, which means not all data was written - Err(Error::checkpoint_write( - "Checkpoint data iterator was not fully consumed before finalization", - )) - } - } + fn finalize( + self, + _engine: &dyn Engine, + _metadata: &FileMeta, + _checkpoint_data_iter: CheckpointDataIterator, + ) -> DeltaResult<()> { + // Verify the iterator is exhausted (optional) + // Implementation will use checkpoint_data.actions_count and checkpoint_data.add_actions_count + + // TODO(#850): Implement the actual finalization logic + todo!("Implement finalize method for checkpoint writer") } /// Creates the checkpoint metadata action for V2 checkpoints. diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index aa9637e517..fa04c470e4 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -389,23 +389,3 @@ fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { Ok(()) } - -#[test] -#[should_panic(expected = "CheckpointDataIterator was dropped before being fully consumed")] -fn test_checkpoint_data_iterator_panics_when_dropped_and_not_consumed() { - let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - write_commit_to_store( - &store, - vec![create_basic_protocol_action(), create_metadata_action()], - 0, - ) - .unwrap(); - let table_root = Url::parse("memory:///").unwrap(); - let table = Table::new(table_root); - let mut writer = table.checkpoint(&engine, None).unwrap(); - let _checkpoint_data_iterator = writer.checkpoint_data(&engine).unwrap().data; - - // We do not call `next()` at all. The iterator will panic when it goes out of scope - // at the end of this test as it is not fully consumed. -} From ff92454ac88c39b61bf0ffe2a5ea98256c00717f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 14:45:58 -0700 Subject: [PATCH 160/176] docs --- kernel/src/checkpoint/mod.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 7c344eb62b..1f6a88faa9 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -25,7 +25,7 @@ //! //! 1. Create a [`CheckpointWriter`] using [`Table::checkpoint`] //! 2. Get checkpoint data and path with [`CheckpointWriter::checkpoint_data`] -//! 3. Write all data to the returned location +//! 3. Write the [`CheckpointDataIterator`] returned in [`CheckpointData`] to [`CheckpointData::path`] //! 4. Finalize the checkpoint with `CheckpointWriter::finalize` //! //! ```no_run @@ -42,17 +42,16 @@ //! } //! //! // Create an engine instance -//! let engine = todo!(); +//! let engine: Arc = Arc::new(todo!("create your engine here")); //! //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! -//! // Use table.checkpoint() to create a checkpoint writer -//! // (optionally specify a version to checkpoint) +//! // Create a checkpoint writer for a version of the table (`None` for latest) //! let mut writer: CheckpointWriter = table.checkpoint(&engine, Some(1))?; //! //! // Get the checkpoint data and path -//! let checkpoint_data = writer.checkpoint_data(&engine)? +//! let checkpoint_data = writer.checkpoint_data(&engine)?; //! //! // Write the checkpoint data to the object store and collect metadata //! let metadata: FileMeta = write_checkpoint_file(&checkpoint_data)?; From 83827e6a458aa0d1baa544b170250dbc5a859ea0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 14:52:14 -0700 Subject: [PATCH 161/176] docs --- kernel/src/checkpoint/mod.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 1f6a88faa9..cd6cca12d8 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -24,9 +24,10 @@ //! The following steps outline the process of creating a checkpoint: //! //! 1. Create a [`CheckpointWriter`] using [`Table::checkpoint`] -//! 2. Get checkpoint data and path with [`CheckpointWriter::checkpoint_data`] -//! 3. Write the [`CheckpointDataIterator`] returned in [`CheckpointData`] to [`CheckpointData::path`] -//! 4. Finalize the checkpoint with `CheckpointWriter::finalize` +//! 2. Get [`CheckpointData`] from [`CheckpointWriter::checkpoint_data`] +//! 3. Write the [`CheckpointData::data`] to [`CheckpointData::path`] +//! 4. Collect metadata ([`FileMeta`]) from the write operation +//! 5. Pass the metadata and [`CheckpointDataIterator`] to [`CheckpointWriter::finalize`] to finalize the checkpoint //! //! ```no_run //! # use std::sync::Arc; From 87b5f1effeaf519c07326ae8917a3dcdeef16b62 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 14:54:42 -0700 Subject: [PATCH 162/176] fix docs --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index cd6cca12d8..90271a6118 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -27,7 +27,7 @@ //! 2. Get [`CheckpointData`] from [`CheckpointWriter::checkpoint_data`] //! 3. Write the [`CheckpointData::data`] to [`CheckpointData::path`] //! 4. Collect metadata ([`FileMeta`]) from the write operation -//! 5. Pass the metadata and [`CheckpointDataIterator`] to [`CheckpointWriter::finalize`] to finalize the checkpoint +//! 5. Pass the metadata and [`CheckpointDataIterator`] to `CheckpointWriter::finalize` to finalize the checkpoint //! //! ```no_run //! # use std::sync::Arc; From b2b07ac5766b00e120c0732668075120b5018a5d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 23:46:50 -0700 Subject: [PATCH 163/176] snapshot::checkpoint --- kernel/src/checkpoint/mod.rs | 14 +++++++++----- kernel/src/checkpoint/tests.rs | 35 ++++++---------------------------- kernel/src/snapshot.rs | 9 +++++++++ kernel/src/table.rs | 21 -------------------- 4 files changed, 24 insertions(+), 55 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 90271a6118..3a64b05004 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,6 +1,6 @@ //! This module implements the API for writing single-file checkpoints. //! -//! The entry-point for this API is [`Table::checkpoint`]. +//! The entry-point for this API is [`Snapshot::checkpoint`]. //! //! ## Checkpoint Types and Selection Logic //! This API supports two checkpoint types, selected based on table features: @@ -23,7 +23,7 @@ //! //! The following steps outline the process of creating a checkpoint: //! -//! 1. Create a [`CheckpointWriter`] using [`Table::checkpoint`] +//! 1. Create a [`CheckpointWriter`] using [`Snapshot::checkpoint`] //! 2. Get [`CheckpointData`] from [`CheckpointWriter::checkpoint_data`] //! 3. Write the [`CheckpointData::data`] to [`CheckpointData::path`] //! 4. Collect metadata ([`FileMeta`]) from the write operation @@ -33,10 +33,12 @@ //! # use std::sync::Arc; //! # use delta_kernel::checkpoint::CheckpointData; //! # use delta_kernel::checkpoint::CheckpointWriter; +//! # use delta_kernel::Engine; //! # use delta_kernel::table::Table; //! # use delta_kernel::DeltaResult; //! # use delta_kernel::Error; //! # use delta_kernel::FileMeta; +//! # use delta_kernel::snapshot::Snapshot; //! # use url::Url; //! fn write_checkpoint_file(checkpoint_data: &CheckpointData) -> DeltaResult { //! todo!() /* engine-specific logic to write checkpoint_data.data to checkpoint_data.path */ @@ -48,8 +50,11 @@ //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! -//! // Create a checkpoint writer for a version of the table (`None` for latest) -//! let mut writer: CheckpointWriter = table.checkpoint(&engine, Some(1))?; +//! // Create a snapshot of a specific version of the table (e.g., version 1) +//! let snapshot: Arc = table.snapshot(&engine, Some(1))?; +//! +//! // Create a checkpoint writer from the snapshot +//! let mut writer: CheckpointWriter = snapshot.checkpoint()?; //! //! // Get the checkpoint data and path //! let checkpoint_data = writer.checkpoint_data(&engine)?; @@ -70,7 +75,6 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint -//! [`Table::checkpoint`]: crate::table::Table::checkpoint // Future extensions // - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be // implemented in the future. The current implementation only supports classic-named V2 checkpoints. diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index fa04c470e4..636dffd70d 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -219,7 +219,8 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let mut writer = table.checkpoint(&engine, None)?; + let snapshot = table.snapshot(&engine, None)?; + let mut writer = snapshot.checkpoint()?; let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; @@ -277,7 +278,8 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); // Specify version 0 for checkpoint - let mut writer = table.checkpoint(&engine, Some(0))?; + let snapshot = table.snapshot(&engine, Some(0))?; + let mut writer = snapshot.checkpoint()?; let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; @@ -332,7 +334,8 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let mut writer = table.checkpoint(&engine, None)?; + let snapshot = table.snapshot(&engine, None)?; + let mut writer = snapshot.checkpoint()?; let checkpoint_data = writer.checkpoint_data(&engine)?; let mut data_iter = checkpoint_data.data; @@ -363,29 +366,3 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { // TODO(#850): Finalize and verify _last_checkpoint Ok(()) } - -/// Tests the `checkpoint()` API with: -/// - a version that does not exist in the log -#[test] -fn test_checkpoint_error_handling_invalid_version() -> DeltaResult<()> { - let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - - // 1st commit (version 0) - metadata and protocol actions - // Protocol action does not include the v2Checkpoint reader/writer feature. - write_commit_to_store( - &store, - vec![create_basic_protocol_action(), create_metadata_action()], - 0, - )?; - let table_root = Url::parse("memory:///")?; - let table = Table::new(table_root); - let result = table.checkpoint(&engine, Some(999)); - - // Should fail with an appropriate error - // Returns error: "LogSegment end version 0 not the same as the specified end version 999" - // TODO(#854): Returned error should be tailored to checkpoint creation - assert!(result.is_err()); - - Ok(()) -} diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index d872be16a7..f52f0ac131 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use crate::actions::{Metadata, Protocol}; +use crate::checkpoint::CheckpointWriter; use crate::log_segment::{self, LogSegment}; use crate::scan::ScanBuilder; use crate::schema::{Schema, SchemaRef}; @@ -247,6 +248,14 @@ impl Snapshot { }) } + /// Creates a [`CheckpointWriter`] for generating a checkpoint for this snapshot. + /// + /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types + /// and the overall checkpoint process. + pub fn checkpoint(self) -> DeltaResult { + Ok(CheckpointWriter::new(Arc::new(self))) + } + /// Log segment this snapshot uses #[internal_api] pub(crate) fn log_segment(&self) -> &LogSegment { diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 33f078bb1b..97e1596d77 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -4,11 +4,9 @@ use std::borrow::Cow; use std::ops::Deref; use std::path::PathBuf; -use std::sync::Arc; use url::Url; -use crate::checkpoint::CheckpointWriter; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -100,25 +98,6 @@ impl Table { ) } - /// Creates a [`CheckpointWriter`] for generating checkpoints at the specified table version. - /// - /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types - /// and the overall checkpoint process. - /// - /// # Parameters - /// - `engine`: Implementation of [`Engine`] apis. - /// - `version`: The version of the table to checkpoint. If [`None`], the latest version of the - /// table will be checkpointed. - pub fn checkpoint( - &self, - engine: &dyn Engine, - version: Option, - ) -> DeltaResult { - Ok(CheckpointWriter::new(Arc::new( - self.snapshot(engine, version)?, - ))) - } - /// Create a new write transaction for this table. pub fn new_transaction(&self, engine: &dyn Engine) -> DeltaResult { Transaction::try_new(self.snapshot(engine, None)?) From ad8380e8cb435e3db6674fc37cbaddeb81c77007 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 21 Apr 2025 23:50:13 -0700 Subject: [PATCH 164/176] docs --- kernel/src/checkpoint/mod.rs | 1 - kernel/src/snapshot.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 3a64b05004..5fc494e762 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -293,7 +293,6 @@ impl CheckpointWriter { /// - `engine`: Implementation of [`Engine`] apis. /// - `metadata`: The metadata of the written checkpoint file /// - `checkpoint_data_iter`: The exhausted checkpoint data iterator (must be fully consumed) - /// /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index f52f0ac131..c50b919a7d 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -248,7 +248,7 @@ impl Snapshot { }) } - /// Creates a [`CheckpointWriter`] for generating a checkpoint for this snapshot. + /// Creates a [`CheckpointWriter`] for generating a checkpoint from this snapshot. /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. From dbff20661b8e822d974c7a6a4a269bcce53eb038 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 09:34:49 -0700 Subject: [PATCH 165/176] doc test --- kernel/src/checkpoint/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 5fc494e762..207318d5c5 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -45,19 +45,19 @@ //! } //! //! // Create an engine instance -//! let engine: Arc = Arc::new(todo!("create your engine here")); +//! let engine: &dyn Engine = todo!("create your engine here"); //! //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! //! // Create a snapshot of a specific version of the table (e.g., version 1) -//! let snapshot: Arc = table.snapshot(&engine, Some(1))?; +//! let snapshot: Snapshot = table.snapshot(engine, Some(1))?; //! //! // Create a checkpoint writer from the snapshot //! let mut writer: CheckpointWriter = snapshot.checkpoint()?; //! //! // Get the checkpoint data and path -//! let checkpoint_data = writer.checkpoint_data(&engine)?; +//! let checkpoint_data = writer.checkpoint_data(engine)?; //! //! // Write the checkpoint data to the object store and collect metadata //! let metadata: FileMeta = write_checkpoint_file(&checkpoint_data)?; From 7e403f153d02489c7f7b7e305734abc5d9fc9cca Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 11:03:17 -0700 Subject: [PATCH 166/176] doc --- kernel/src/checkpoint/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 207318d5c5..5eab34f861 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -45,7 +45,7 @@ //! } //! //! // Create an engine instance -//! let engine: &dyn Engine = todo!("create your engine here"); +//! let engine: &dyn Engine = todo!(); //! //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; From 85cb57c2caa004cac1deb6834ef582ac00087dae Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 13:51:33 -0700 Subject: [PATCH 167/176] split checkpoint path --- kernel/src/checkpoint/mod.rs | 106 ++++++++++++++------------------- kernel/src/checkpoint/tests.rs | 21 +++---- 2 files changed, 54 insertions(+), 73 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 5eab34f861..71f28dc084 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -16,7 +16,6 @@ //! ## Architecture //! //! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow -//! - [`CheckpointData`] - Wraps the [`CheckpointDataIterator`] and destination path information //! - [`CheckpointDataIterator`] - Iterator over the checkpoint data to be written //! //! ## Usage @@ -24,14 +23,15 @@ //! The following steps outline the process of creating a checkpoint: //! //! 1. Create a [`CheckpointWriter`] using [`Snapshot::checkpoint`] -//! 2. Get [`CheckpointData`] from [`CheckpointWriter::checkpoint_data`] -//! 3. Write the [`CheckpointData::data`] to [`CheckpointData::path`] +//! 2. Get the checkpoint path from [`CheckpointWriter::checkpoint_path`] +//! 2. Get the checkpoint data from [`CheckpointWriter::checkpoint_data`] +//! 3. Write the data to the path in object storage (engine-specific) //! 4. Collect metadata ([`FileMeta`]) from the write operation -//! 5. Pass the metadata and [`CheckpointDataIterator`] to `CheckpointWriter::finalize` to finalize the checkpoint +//! 5. Pass the metadata and consumed data iterator to `CheckpointWriter::finalize` //! //! ```no_run //! # use std::sync::Arc; -//! # use delta_kernel::checkpoint::CheckpointData; +//! # use delta_kernel::checkpoint::CheckpointDataIterator; //! # use delta_kernel::checkpoint::CheckpointWriter; //! # use delta_kernel::Engine; //! # use delta_kernel::table::Table; @@ -40,12 +40,11 @@ //! # use delta_kernel::FileMeta; //! # use delta_kernel::snapshot::Snapshot; //! # use url::Url; -//! fn write_checkpoint_file(checkpoint_data: &CheckpointData) -> DeltaResult { -//! todo!() /* engine-specific logic to write checkpoint_data.data to checkpoint_data.path */ +//! fn write_checkpoint_file(path: Url, data: &CheckpointDataIterator) -> DeltaResult { +//! todo!() /* engine-specific logic to write data to object storage*/ //! } //! -//! // Create an engine instance -//! let engine: &dyn Engine = todo!(); +//! let engine: &dyn Engine = todo!(); /* create engine instance */ //! //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; @@ -56,16 +55,17 @@ //! // Create a checkpoint writer from the snapshot //! let mut writer: CheckpointWriter = snapshot.checkpoint()?; //! -//! // Get the checkpoint data and path +//! // Get the checkpoint path and data +//! let checkpoint_path = writer.checkpoint_path()?; //! let checkpoint_data = writer.checkpoint_data(engine)?; //! //! // Write the checkpoint data to the object store and collect metadata -//! let metadata: FileMeta = write_checkpoint_file(&checkpoint_data)?; +//! let metadata: FileMeta = write_checkpoint_file(checkpoint_path, &checkpoint_data)?; //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! //! // TODO(#850): Implement the finalize method -//! // writer.finalize(&engine, &metadata, checkpoint_data.data)?; +//! // writer.finalize(&engine, &metadata, checkpoint_data)?; //! //! # Ok::<_, Error>(()) //! ``` @@ -138,11 +138,12 @@ static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| /// tracks action statistics required for finalizing the checkpoint. /// /// # Warning -/// Additionally, all yielded data must be written to the specified path before calling -/// `CheckpointWriter::finalize`, or it may result in data loss and corruption. +/// The [`CheckpointDataIterator`] must be fully consumed to ensure proper collection of statistics for +/// the checkpoint. Additionally, all yielded data must be written to the specified path before calling +/// `CheckpointWriter::finalize`. Failing to do so may result in data loss or corruption. pub struct CheckpointDataIterator { - /// The inner iterator that yields the checkpoint data with counts - inner: Box>>, + /// The nested iterator that yields checkpoint batches with action counts + checkpoint_batch_iterator: Box>>, /// Running total of actions included in the checkpoint actions_count: i64, /// Running total of add actions included in the checkpoint @@ -154,12 +155,12 @@ impl Iterator for CheckpointDataIterator { /// Advances the iterator and returns the next value. /// - /// This implementation transforms the `CheckpointBatch` items from the inner iterator into - /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts for + /// This implementation transforms the `CheckpointBatch` items from the nested iterator into + /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts from /// each batch. The [`CheckpointDataIterator`] is passed back to the kernel on call to /// `CheckpointWriter::finalize` for counts to be read and written to the `_last_checkpoint` file fn next(&mut self) -> Option { - let next_item = self.inner.next(); + let next_item = self.checkpoint_batch_iterator.next(); next_item.map(|result| { result.map(|batch| { @@ -171,23 +172,6 @@ impl Iterator for CheckpointDataIterator { } } -/// Represents the data needed to create a single-file checkpoint. -/// -/// Obtained from [`CheckpointWriter::checkpoint_data`], this struct provides both the -/// location where the checkpoint file should be written and an iterator over the data -/// that should be included in the checkpoint. -/// -/// # Warning -/// The [`CheckpointDataIterator`] must be fully consumed to ensure proper collection of statistics for -/// the checkpoint. Additionally, all yielded data must be written to the specified path before calling -/// `CheckpointWriter::finalize`. Failing to do so may result in data loss or corruption. -pub struct CheckpointData { - /// The URL where the checkpoint file should be written. - pub path: Url, - /// An iterator over the checkpoint data to be written to the file. - pub data: CheckpointDataIterator, -} - /// Orchestrates the process of creating a checkpoint for a table. /// /// The [`CheckpointWriter`] is the entry point for generating checkpoint data for a Delta table. @@ -200,8 +184,6 @@ pub struct CheckpointData { /// /// # See Also /// See the [module-level documentation](self) for the complete checkpoint workflow -/// -/// [`Table::checkpoint`]: [`crate::table::Table::checkpoint`] pub struct CheckpointWriter { /// Reference to the snapshot (i.e. version) of the table being checkpointed pub(crate) snapshot: Arc, @@ -213,16 +195,28 @@ impl CheckpointWriter { Self { snapshot } } - /// Retrieves the checkpoint data and path information. + /// Returns the URL where the checkpoint file should be written. /// - /// This method generates the filtered actions for the checkpoint and determines - /// the appropriate destination path. + /// This method generates the checkpoint path based on the table's root and the current version. + /// The generated path follows the classic naming convention for checkpoints: + /// - `n.checkpoint.parquet`, where `n` is the current version of the table. + pub fn checkpoint_path(&self) -> DeltaResult { + ParsedLogPath::new_classic_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + ) + .map(|parsed| parsed.location) + } + + /// Returns the checkpoint data to be written to the checkpoint file. + /// + /// This method reads the actions from the log segment and processes them + /// to create the checkpoint data. /// - /// # Returns - /// [`CheckpointData`] containing the checkpoint path and data to write. + /// # Parameters + /// - `engine`: Implementation of [`Engine`] APIs. /// - /// # Warning - /// All data must be written to persistent storage before calling `CheckpointWriter::finalize()`. + /// # Returns: [`CheckpointDataIterator`] containing the checkpoint data // This method is the core of the checkpoint generation process. It: // 1. Determines whether to write a V1 or V2 checkpoint based on the table's // `v2Checkpoints` feature support @@ -231,7 +225,7 @@ impl CheckpointWriter { // 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint // (i.e., if `v2Checkpoints` feature is supported by table) // 5. Generates the appropriate checkpoint path - pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { + pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { let is_v2_checkpoints_supported = self .snapshot .table_configuration() @@ -263,21 +257,11 @@ impl CheckpointWriter { .then(|| self.create_checkpoint_metadata_batch(version, engine)), ); - let checkpoint_path = ParsedLogPath::new_classic_parquet_checkpoint( - self.snapshot.table_root(), - self.snapshot.version(), - )?; - - // Wrap the data iterator to send counts to the CheckpointWriter when dropped - let wrapped_iterator = CheckpointDataIterator { - inner: Box::new(chained), + // Wrap the iterator in a CheckpointDataIterator to track action counts + Ok(CheckpointDataIterator { + checkpoint_batch_iterator: Box::new(chained), actions_count: 0, add_actions_count: 0, - }; - - Ok(CheckpointData { - path: checkpoint_path.location, - data: wrapped_iterator, }) } @@ -292,7 +276,7 @@ impl CheckpointWriter { /// # Parameters /// - `engine`: Implementation of [`Engine`] apis. /// - `metadata`: The metadata of the written checkpoint file - /// - `checkpoint_data_iter`: The exhausted checkpoint data iterator (must be fully consumed) + /// - `checkpoint_data`: The exhausted checkpoint data iterator (must be fully consumed) /// /// # Returns: [`variant@Ok`] if the checkpoint was successfully finalized #[allow(unused)] @@ -300,7 +284,7 @@ impl CheckpointWriter { self, _engine: &dyn Engine, _metadata: &FileMeta, - _checkpoint_data_iter: CheckpointDataIterator, + _checkpoint_data: CheckpointDataIterator, ) -> DeltaResult<()> { // Verify the iterator is exhausted (optional) // Implementation will use checkpoint_data.actions_count and checkpoint_data.add_actions_count diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 636dffd70d..587c68fba2 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -221,25 +221,24 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table = Table::new(table_root); let snapshot = table.snapshot(&engine, None)?; let mut writer = snapshot.checkpoint()?; - let checkpoint_data = writer.checkpoint_data(&engine)?; - let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the latest version by default. assert_eq!( - checkpoint_data.path, + writer.checkpoint_path()?, Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? ); + let mut data_iter = writer.checkpoint_data(&engine)?; // The first batch should be the metadata and protocol actions. let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector, [true, true]); - // The second batch should only include the add action as the remove action is expired. + // The second batch should include both the add action and the remove action let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector, [true, true]); // The third batch should not be included as the selection vector does not - // contain any true values, as the add action is removed in a following commit. + // contain any true values, as the file added is removed in a following commit. assert!(data_iter.next().is_none()); assert_eq!(data_iter.actions_count, 4); @@ -280,15 +279,14 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { // Specify version 0 for checkpoint let snapshot = table.snapshot(&engine, Some(0))?; let mut writer = snapshot.checkpoint()?; - let checkpoint_data = writer.checkpoint_data(&engine)?; - let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the specified version. assert_eq!( - checkpoint_data.path, + writer.checkpoint_path()?, Url::parse("memory:///_delta_log/00000000000000000000.checkpoint.parquet")? ); + let mut data_iter = writer.checkpoint_data(&engine)?; // The first batch should be the metadata and protocol actions. let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector, [true, true]); @@ -336,20 +334,19 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table = Table::new(table_root); let snapshot = table.snapshot(&engine, None)?; let mut writer = snapshot.checkpoint()?; - let checkpoint_data = writer.checkpoint_data(&engine)?; - let mut data_iter = checkpoint_data.data; // Verify the checkpoint file path is the latest version by default. assert_eq!( - checkpoint_data.path, + writer.checkpoint_path()?, Url::parse("memory:///_delta_log/00000000000000000001.checkpoint.parquet")? ); + let mut data_iter = writer.checkpoint_data(&engine)?; // The first batch should be the metadata and protocol actions. let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector, [true, true]); - // The second batch should be the add action as the remove action is expired. + // The second batch should include both the add action and the remove action let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector, [true, true]); From 32e6714e3ccddb3d1f1b5394e2d49c3c4bb01630 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 14:18:22 -0700 Subject: [PATCH 168/176] reviews --- kernel/src/checkpoint/mod.rs | 7 +------ kernel/src/checkpoint/tests.rs | 10 ++++++---- kernel/src/snapshot.rs | 4 ++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 71f28dc084..2e4a33b40a 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -190,16 +190,11 @@ pub struct CheckpointWriter { } impl CheckpointWriter { - /// Creates a new CheckpointWriter from a snapshot - pub(crate) fn new(snapshot: Arc) -> Self { - Self { snapshot } - } - /// Returns the URL where the checkpoint file should be written. /// /// This method generates the checkpoint path based on the table's root and the current version. /// The generated path follows the classic naming convention for checkpoints: - /// - `n.checkpoint.parquet`, where `n` is the current version of the table. + /// - /`n.checkpoint.parquet`, where `n` is the current version of the table. pub fn checkpoint_path(&self) -> DeltaResult { ParsedLogPath::new_classic_parquet_checkpoint( self.snapshot.table_root(), diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 587c68fba2..7a73b11571 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -60,7 +60,9 @@ fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let engine = SyncEngine::new(); let version = 10; - let writer = CheckpointWriter::new(create_test_snapshot(&engine)?); + let writer = CheckpointWriter { + snapshot: create_test_snapshot(&engine)?, + }; let checkpoint_batch = writer.create_checkpoint_metadata_batch(version, &engine)?; @@ -220,7 +222,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); let snapshot = table.snapshot(&engine, None)?; - let mut writer = snapshot.checkpoint()?; + let mut writer = Arc::new(snapshot).checkpoint()?; // Verify the checkpoint file path is the latest version by default. assert_eq!( @@ -278,7 +280,7 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let table = Table::new(table_root); // Specify version 0 for checkpoint let snapshot = table.snapshot(&engine, Some(0))?; - let mut writer = snapshot.checkpoint()?; + let mut writer = Arc::new(snapshot).checkpoint()?; // Verify the checkpoint file path is the specified version. assert_eq!( @@ -333,7 +335,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); let snapshot = table.snapshot(&engine, None)?; - let mut writer = snapshot.checkpoint()?; + let mut writer = Arc::new(snapshot).checkpoint()?; // Verify the checkpoint file path is the latest version by default. assert_eq!( diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index c50b919a7d..38adb1e589 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -252,8 +252,8 @@ impl Snapshot { /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. - pub fn checkpoint(self) -> DeltaResult { - Ok(CheckpointWriter::new(Arc::new(self))) + pub fn checkpoint(self: Arc) -> DeltaResult { + Ok(CheckpointWriter { snapshot: self }) } /// Log segment this snapshot uses From de3252d2a78df341f741083d3db756805bf2f747 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 14:52:56 -0700 Subject: [PATCH 169/176] review --- kernel/src/checkpoint/mod.rs | 36 +++++++++-------------- kernel/src/checkpoint/tests.rs | 52 ++++++++++++++++------------------ 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 2e4a33b40a..ee70528bcd 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -27,7 +27,7 @@ //! 2. Get the checkpoint data from [`CheckpointWriter::checkpoint_data`] //! 3. Write the data to the path in object storage (engine-specific) //! 4. Collect metadata ([`FileMeta`]) from the write operation -//! 5. Pass the metadata and consumed data iterator to `CheckpointWriter::finalize` +//! 5. Pass the metadata and exhausted data iterator to `CheckpointWriter::finalize` //! //! ```no_run //! # use std::sync::Arc; @@ -64,7 +64,6 @@ //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! -//! // TODO(#850): Implement the finalize method //! // writer.finalize(&engine, &metadata, checkpoint_data)?; //! //! # Ok::<_, Error>(()) @@ -160,15 +159,11 @@ impl Iterator for CheckpointDataIterator { /// each batch. The [`CheckpointDataIterator`] is passed back to the kernel on call to /// `CheckpointWriter::finalize` for counts to be read and written to the `_last_checkpoint` file fn next(&mut self) -> Option { - let next_item = self.checkpoint_batch_iterator.next(); - - next_item.map(|result| { - result.map(|batch| { - self.actions_count += batch.actions_count; - self.add_actions_count += batch.add_actions_count; - batch.filtered_data - }) - }) + Some(self.checkpoint_batch_iterator.next()?.map(|batch| { + self.actions_count += batch.actions_count; + self.add_actions_count += batch.add_actions_count; + batch.filtered_data + })) } } @@ -233,11 +228,6 @@ impl CheckpointWriter { None, )?; - // Create iterator over actions for checkpoint data - let checkpoint_data = - CheckpointLogReplayProcessor::new(self.deleted_file_retention_timestamp()?) - .process_actions_iter(actions); - let version = self.snapshot.version().try_into().map_err(|e| { Error::CheckpointWrite(format!( "Failed to convert checkpoint version from u64 {} to i64: {}", @@ -246,15 +236,17 @@ impl CheckpointWriter { )) })?; - // Chain the checkpoint metadata action if using V2 checkpoints - let chained = checkpoint_data.chain( - is_v2_checkpoints_supported - .then(|| self.create_checkpoint_metadata_batch(version, engine)), - ); + // Create iterator over actions for checkpoint data + let checkpoint_data = + CheckpointLogReplayProcessor::new(self.deleted_file_retention_timestamp()?) + .process_actions_iter(actions); + + let checkpoint_metadata = is_v2_checkpoints_supported + .then(|| self.create_checkpoint_metadata_batch(version, engine)); // Wrap the iterator in a CheckpointDataIterator to track action counts Ok(CheckpointDataIterator { - checkpoint_batch_iterator: Box::new(chained), + checkpoint_batch_iterator: Box::new(checkpoint_data.chain(checkpoint_metadata)), actions_count: 0, add_actions_count: 0, }) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 7a73b11571..eff8861bb0 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,5 +1,6 @@ use std::{path::PathBuf, sync::Arc, time::Duration}; +use super::DEFAULT_RETENTION_SECS; use crate::actions::{Add, Metadata, Protocol, Remove}; use crate::arrow::array::{ArrayRef, StructArray}; use crate::arrow::datatypes::{DataType, Schema}; @@ -26,29 +27,36 @@ use url::Url; #[test] fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { - let now = Duration::from_secs(1000).as_millis() as i64; + const MILLIS_PER_SECOND: i64 = 1_000; - // Test cases + let reference_time_secs = 10_000; + let reference_time = Duration::from_secs(reference_time_secs); + let reference_time_millis = reference_time.as_millis() as i64; + + // Retention scenarios: + // ( retention duration , expected_timestamp ) let test_cases = [ - // Default case (7 days) - (None, now - (7 * 24 * 60 * 60 * 1000)), + // None = Default retention (7 days) + ( + None, + reference_time_millis - (DEFAULT_RETENTION_SECS as i64 * MILLIS_PER_SECOND), + ), // Zero retention - (Some(Duration::from_secs(0)), now), - // Custom retention (2000 seconds) - // This results in a negative timestamp which is valid - as it just means that - // the retention window extends to before UNIX epoch. - (Some(Duration::from_secs(2000)), now - (2000 * 1000)), + (Some(Duration::from_secs(0)), reference_time_millis), + // Custom retention (e.g., 2000 seconds) + ( + Some(Duration::from_secs(2_000)), + reference_time_millis - (2_000 * MILLIS_PER_SECOND), + ), ]; - for (retention, expected) in test_cases { - let result = - deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; - assert_eq!(result, expected); + for (retention, expected_timestamp) in test_cases { + let result = deleted_file_retention_timestamp_with_time(retention, reference_time)?; + assert_eq!(result, expected_timestamp); } Ok(()) } - fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); let url = url::Url::from_directory_path(path.unwrap()).unwrap(); @@ -136,26 +144,14 @@ fn write_commit_to_store( /// Create a Protocol action without v2Checkpoint feature support fn create_basic_protocol_action() -> Action { Action::Protocol( - Protocol::try_new( - 3, - 7, - Vec::::new().into(), - Vec::::new().into(), - ) - .unwrap(), + Protocol::try_new(3, 7, Some(Vec::::new()), Some(Vec::::new())).unwrap(), ) } /// Create a Protocol action with v2Checkpoint feature support fn create_v2_checkpoint_protocol_action() -> Action { Action::Protocol( - Protocol::try_new( - 3, - 7, - vec!["v2Checkpoint"].into(), - vec!["v2Checkpoint"].into(), - ) - .unwrap(), + Protocol::try_new(3, 7, Some(vec!["v2Checkpoint"]), Some(vec!["v2Checkpoint"])).unwrap(), ) } From fe110910328417ec5e40f8a0d9ba8e04ca9799b5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 15:11:31 -0700 Subject: [PATCH 170/176] review --- kernel/src/checkpoint/mod.rs | 19 +++++++----- kernel/src/checkpoint/tests.rs | 53 +++++++++++++++++----------------- kernel/src/table.rs | 21 ++++++++++++++ 3 files changed, 58 insertions(+), 35 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index ee70528bcd..da7d3362da 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,6 +1,8 @@ //! This module implements the API for writing single-file checkpoints. //! -//! The entry-point for this API is [`Snapshot::checkpoint`]. +//! The entry-points for this API are: +//! 1. [`Snapshot::checkpoint`] +//! 2. [`Table::checkpoint`] //! //! ## Checkpoint Types and Selection Logic //! This API supports two checkpoint types, selected based on table features: @@ -22,7 +24,7 @@ //! //! The following steps outline the process of creating a checkpoint: //! -//! 1. Create a [`CheckpointWriter`] using [`Snapshot::checkpoint`] +//! 1. Create a [`CheckpointWriter`] using [`Snapshot::checkpoint`] or [`Table::checkpoint`] //! 2. Get the checkpoint path from [`CheckpointWriter::checkpoint_path`] //! 2. Get the checkpoint data from [`CheckpointWriter::checkpoint_data`] //! 3. Write the data to the path in object storage (engine-specific) @@ -49,11 +51,9 @@ //! // Create a table instance for the table you want to checkpoint //! let table = Table::try_from_uri("./tests/data/app-txn-no-checkpoint")?; //! -//! // Create a snapshot of a specific version of the table (e.g., version 1) -//! let snapshot: Snapshot = table.snapshot(engine, Some(1))?; -//! -//! // Create a checkpoint writer from the snapshot -//! let mut writer: CheckpointWriter = snapshot.checkpoint()?; +//! // Create a checkpoint writer from a version of the table (e.g., version 1) +//! // Alternatively, if you have a snapshot, you can use `Snapshot::checkpoint()` +//! let mut writer = table.checkpoint(engine, Some(1))?; //! //! // Get the checkpoint path and data //! let checkpoint_path = writer.checkpoint_path()?; @@ -74,6 +74,7 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint +//! [`Table`]: crate::table::Table // Future extensions // - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be // implemented in the future. The current implementation only supports classic-named V2 checkpoints. @@ -277,7 +278,9 @@ impl CheckpointWriter { // Implementation will use checkpoint_data.actions_count and checkpoint_data.add_actions_count // TODO(#850): Implement the actual finalization logic - todo!("Implement finalize method for checkpoint writer") + return Err(Error::checkpoint_write( + "Checkpoint finalization is not yet implemented", + )); } /// Creates the checkpoint metadata action for V2 checkpoints. diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index eff8861bb0..c0500db758 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -1,19 +1,14 @@ -use std::{path::PathBuf, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use super::DEFAULT_RETENTION_SECS; use crate::actions::{Add, Metadata, Protocol, Remove}; use crate::arrow::array::{ArrayRef, StructArray}; use crate::arrow::datatypes::{DataType, Schema}; -use crate::checkpoint::{deleted_file_retention_timestamp_with_time, CheckpointWriter}; +use crate::checkpoint::deleted_file_retention_timestamp_with_time; use crate::engine::arrow_data::ArrowEngineData; -use crate::engine::{ - default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, - sync::SyncEngine, -}; -use crate::snapshot::Snapshot; +use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; use crate::utils::test_utils::Action; use crate::DeltaResult; -use crate::Engine; use crate::Table; use arrow_53::{ @@ -57,22 +52,29 @@ fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { Ok(()) } -fn create_test_snapshot(engine: &dyn Engine) -> DeltaResult> { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/app-txn-no-checkpoint/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let table = Table::new(url); - Ok(Arc::new(table.snapshot(engine, None)?)) -} #[test] fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { - let engine = SyncEngine::new(); - let version = 10; - let writer = CheckpointWriter { - snapshot: create_test_snapshot(&engine)?, - }; + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - let checkpoint_batch = writer.create_checkpoint_metadata_batch(version, &engine)?; + // 1st commit (version 0) - metadata and protocol actions + // Protocol action does not include the v2Checkpoint reader/writer feature. + write_commit_to_store( + &store, + vec![ + create_v2_checkpoint_protocol_action(), + create_metadata_action(), + ], + 0, + )?; + + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let snapshot = table.snapshot(&engine, None)?; + let writer = Arc::new(snapshot).checkpoint()?; + + let checkpoint_batch = writer.create_checkpoint_metadata_batch(0, &engine)?; // Check selection vector has one true value assert_eq!(checkpoint_batch.filtered_data.selection_vector, vec![true]); @@ -94,7 +96,7 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { expected_schema, vec![Arc::new(StructArray::from(vec![( Arc::new(Field::new("version", DataType::Int64, false)), - create_array!(Int64, [version]) as ArrayRef, + create_array!(Int64, [0]) as ArrayRef, )]))], ) .unwrap(); @@ -217,8 +219,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let snapshot = table.snapshot(&engine, None)?; - let mut writer = Arc::new(snapshot).checkpoint()?; + let mut writer = table.checkpoint(&engine, None)?; // Verify the checkpoint file path is the latest version by default. assert_eq!( @@ -275,8 +276,7 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); // Specify version 0 for checkpoint - let snapshot = table.snapshot(&engine, Some(0))?; - let mut writer = Arc::new(snapshot).checkpoint()?; + let mut writer = table.checkpoint(&engine, Some(0))?; // Verify the checkpoint file path is the specified version. assert_eq!( @@ -330,8 +330,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let snapshot = table.snapshot(&engine, None)?; - let mut writer = Arc::new(snapshot).checkpoint()?; + let mut writer = table.checkpoint(&engine, None)?; // Verify the checkpoint file path is the latest version by default. assert_eq!( diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 97e1596d77..df5b739878 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -4,9 +4,11 @@ use std::borrow::Cow; use std::ops::Deref; use std::path::PathBuf; +use std::sync::Arc; use url::Url; +use crate::checkpoint::CheckpointWriter; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -98,6 +100,25 @@ impl Table { ) } + /// Creates a [`CheckpointWriter`] for generating checkpoints at the specified table version. + /// + /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types + /// and the overall checkpoint process. + /// + /// # Parameters + /// - `engine`: Implementation of [`Engine`] apis. + /// - `version`: The version of the table to checkpoint. If [`None`], the latest version of the + /// table will be checkpointed. + pub fn checkpoint( + &self, + engine: &dyn Engine, + version: Option, + ) -> DeltaResult { + Ok(CheckpointWriter { + snapshot: Arc::new(self.snapshot(engine, version)?), + }) + } + /// Create a new write transaction for this table. pub fn new_transaction(&self, engine: &dyn Engine) -> DeltaResult { Transaction::try_new(self.snapshot(engine, None)?) From ae86b11d4e6ae8e6842e0ec5eefc30831c8f41ac Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 15:27:11 -0700 Subject: [PATCH 171/176] docs --- kernel/src/checkpoint/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index da7d3362da..7df107aa06 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -74,7 +74,7 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint -//! [`Table`]: crate::table::Table +//! [`Table::checkpoint`]: crate::table::Table::checkpoint // Future extensions // - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be // implemented in the future. The current implementation only supports classic-named V2 checkpoints. @@ -278,9 +278,9 @@ impl CheckpointWriter { // Implementation will use checkpoint_data.actions_count and checkpoint_data.add_actions_count // TODO(#850): Implement the actual finalization logic - return Err(Error::checkpoint_write( + Err(Error::checkpoint_write( "Checkpoint finalization is not yet implemented", - )); + )) } /// Creates the checkpoint metadata action for V2 checkpoints. From 094f460fbe0871beea20433139b344ff454aa40d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 15:33:38 -0700 Subject: [PATCH 172/176] docs --- kernel/src/checkpoint/mod.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 7df107aa06..3c8567e2f1 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -76,8 +76,6 @@ //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint //! [`Table::checkpoint`]: crate::table::Table::checkpoint // Future extensions -// - TODO(#836): Single-file UUID-named V2 checkpoints (using `n.checkpoint.u.{json/parquet}` naming) are to be -// implemented in the future. The current implementation only supports classic-named V2 checkpoints. // - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future // multi-file support, but the current implementation only supports single-file checkpoints. use std::sync::{Arc, LazyLock}; @@ -188,9 +186,14 @@ pub struct CheckpointWriter { impl CheckpointWriter { /// Returns the URL where the checkpoint file should be written. /// - /// This method generates the checkpoint path based on the table's root and the current version. - /// The generated path follows the classic naming convention for checkpoints: - /// - /`n.checkpoint.parquet`, where `n` is the current version of the table. + /// This method generates the checkpoint path based on the table's root and the version + /// of the underlying snapshot being checkpointed. The resulting path follows the classic + /// Delta checkpoint naming convention: + /// + /// `/.checkpoint.parquet` + /// + /// For example, if the table root is `s3://bucket/path` and the version is `42`, + /// the checkpoint path will be:`s3://bucket/path/42.checkpoint.parquet` pub fn checkpoint_path(&self) -> DeltaResult { ParsedLogPath::new_classic_parquet_checkpoint( self.snapshot.table_root(), From 893568a484ee10f8f2d39380155bc315acb5ba62 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 16:02:17 -0700 Subject: [PATCH 173/176] note about not supporting uuid named checkpoints --- kernel/src/checkpoint/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 3c8567e2f1..72beef01e3 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -72,10 +72,17 @@ //! ## Warning //! Multi-part (V1) checkpoints are DEPRECATED and UNSAFE. //! +//! ## Note +//! We currently do not plan to support UUID-named V2 checkpoints, since S3's put-if-absent +//! semantics remove the need for UUIDs to ensure uniqueness. Supporting only classic-named +//! checkpoints avoids added complexity, such as coordinating naming decisions between kernel and +//! engine, and handling coexistence with legacy V1 checkpoints. If a compelling use case arises +//! in the future, we can revisit this decision. +//! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::snapshot::LastCheckpointHint //! [`Table::checkpoint`]: crate::table::Table::checkpoint -// Future extensions +// Future extensions: // - TODO(#837): Multi-file V2 checkpoints are not supported yet. The API is designed to be extensible for future // multi-file support, but the current implementation only supports single-file checkpoints. use std::sync::{Arc, LazyLock}; From d0b6e9b980f830c8db3dd79e1a2700762dc7609c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 22 Apr 2025 16:20:27 -0700 Subject: [PATCH 174/176] impl Into --- kernel/src/table.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/src/table.rs b/kernel/src/table.rs index df5b739878..8da7490736 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -112,11 +112,10 @@ impl Table { pub fn checkpoint( &self, engine: &dyn Engine, - version: Option, + version: impl Into>, ) -> DeltaResult { - Ok(CheckpointWriter { - snapshot: Arc::new(self.snapshot(engine, version)?), - }) + let snapshot = Arc::new(self.snapshot(engine, version.into())?); + Ok(CheckpointWriter { snapshot }) } /// Create a new write transaction for this table. From 9e225bd4ee30a733061405f18f5589dda0ae81c9 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 23 Apr 2025 15:22:29 -0700 Subject: [PATCH 175/176] reviews --- kernel/src/checkpoint/mod.rs | 51 ++++++++++++++++++---------------- kernel/src/checkpoint/tests.rs | 10 +++---- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 72beef01e3..e9040550aa 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -129,9 +129,9 @@ static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { // Schema of the [`CheckpointMetadata`] action that is included in V2 checkpoints // We cannot use `CheckpointMetadata::to_schema()` as it would include the 'tags' field which -// we're not supporting yet due to the lack of map support. +// we're not supporting yet due to the lack of map support TODO(#880). static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new(StructType::new([StructField::not_null( + Arc::new(StructType::new([StructField::nullable( CHECKPOINT_METADATA_NAME, DataType::struct_type([StructField::not_null("version", DataType::LONG)]), )])) @@ -195,12 +195,12 @@ impl CheckpointWriter { /// /// This method generates the checkpoint path based on the table's root and the version /// of the underlying snapshot being checkpointed. The resulting path follows the classic - /// Delta checkpoint naming convention: + /// Delta checkpoint naming convention (where the version is zero-padded to 20 digits): /// /// `/.checkpoint.parquet` /// - /// For example, if the table root is `s3://bucket/path` and the version is `42`, - /// the checkpoint path will be:`s3://bucket/path/42.checkpoint.parquet` + /// For example, if the table root is `s3://bucket/path` and the version is `10`, + /// the checkpoint path will be: `s3://bucket/path/00000000000000000010.checkpoint.parquet` pub fn checkpoint_path(&self) -> DeltaResult { ParsedLogPath::new_classic_parquet_checkpoint( self.snapshot.table_root(), @@ -226,7 +226,7 @@ impl CheckpointWriter { // 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint // (i.e., if `v2Checkpoints` feature is supported by table) // 5. Generates the appropriate checkpoint path - pub fn checkpoint_data(&mut self, engine: &dyn Engine) -> DeltaResult { + pub fn checkpoint_data(&self, engine: &dyn Engine) -> DeltaResult { let is_v2_checkpoints_supported = self .snapshot .table_configuration() @@ -239,21 +239,13 @@ impl CheckpointWriter { None, )?; - let version = self.snapshot.version().try_into().map_err(|e| { - Error::CheckpointWrite(format!( - "Failed to convert checkpoint version from u64 {} to i64: {}", - self.snapshot.version(), - e - )) - })?; - // Create iterator over actions for checkpoint data let checkpoint_data = CheckpointLogReplayProcessor::new(self.deleted_file_retention_timestamp()?) .process_actions_iter(actions); - let checkpoint_metadata = is_v2_checkpoints_supported - .then(|| self.create_checkpoint_metadata_batch(version, engine)); + let checkpoint_metadata = + is_v2_checkpoints_supported.then(|| self.create_checkpoint_metadata_batch(engine)); // Wrap the iterator in a CheckpointDataIterator to track action counts Ok(CheckpointDataIterator { @@ -311,9 +303,16 @@ impl CheckpointWriter { /// batch should be included in the checkpoint. fn create_checkpoint_metadata_batch( &self, - version: i64, engine: &dyn Engine, ) -> DeltaResult { + let version: i64 = self.snapshot.version().try_into().map_err(|e| { + Error::CheckpointWrite(format!( + "Failed to convert checkpoint version from u64 {} to i64: {}", + self.snapshot.version(), + e + )) + })?; + let checkpoint_metadata_batch = engine.evaluation_handler().create_one( CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), &[Scalar::from(version)], @@ -331,16 +330,20 @@ impl CheckpointWriter { }) } - /// Calculates the cutoff timestamp for deleted file cleanup. - /// /// This function determines the minimum timestamp before which deleted files - /// will be permanently removed during VACUUM operations, based on the table's - /// `deleted_file_retention_duration` property. + /// are eligible for permanent removal during VACUUM operations. It is used + /// during checkpointing to decide whether to include `remove` actions. /// - /// Returns the cutoff timestamp in milliseconds since epoch, matching - /// the remove action's `deletion_timestamp` field format for comparison. + /// If a deleted file's timestamp is older than this threshold (based on the + /// table's `deleted_file_retention_duration`), the corresponding `remove` action + /// is included in the checkpoint, allowing VACUUM operations to later identify + /// and clean up those files. + /// + /// # Returns: + /// The cutoff timestamp in milliseconds since epoch, matching the remove action's + /// `deletion_timestamp` field format for comparison. /// - /// The default retention period is 7 days, matching delta-spark's behavior. + /// # Note: The default retention period is 7 days, matching delta-spark's behavior. fn deleted_file_retention_timestamp(&self) -> DeltaResult { let retention_duration = self .snapshot diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index c0500db758..ea36af59eb 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -74,7 +74,7 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let snapshot = table.snapshot(&engine, None)?; let writer = Arc::new(snapshot).checkpoint()?; - let checkpoint_batch = writer.create_checkpoint_metadata_batch(0, &engine)?; + let checkpoint_batch = writer.create_checkpoint_metadata_batch(&engine)?; // Check selection vector has one true value assert_eq!(checkpoint_batch.filtered_data.selection_vector, vec![true]); @@ -90,7 +90,7 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let expected_schema = Arc::new(Schema::new(vec![Field::new( "checkpointMetadata", DataType::Struct(vec![Field::new("version", DataType::Int64, false)].into()), - false, + true, )])); let expected = RecordBatch::try_new( expected_schema, @@ -219,7 +219,7 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let mut writer = table.checkpoint(&engine, None)?; + let writer = table.checkpoint(&engine, None)?; // Verify the checkpoint file path is the latest version by default. assert_eq!( @@ -276,7 +276,7 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); // Specify version 0 for checkpoint - let mut writer = table.checkpoint(&engine, Some(0))?; + let writer = table.checkpoint(&engine, Some(0))?; // Verify the checkpoint file path is the specified version. assert_eq!( @@ -330,7 +330,7 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let table_root = Url::parse("memory:///")?; let table = Table::new(table_root); - let mut writer = table.checkpoint(&engine, None)?; + let writer = table.checkpoint(&engine, None)?; // Verify the checkpoint file path is the latest version by default. assert_eq!( From be6ba6815379cc4dd92954230be4bbcf77d1dc6a Mon Sep 17 00:00:00 2001 From: "shjtia@uwaterloo.ca" Date: Mon, 28 Apr 2025 11:00:11 -0700 Subject: [PATCH 176/176] fix merge --- kernel/src/checkpoint/tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index ea36af59eb..e14adca399 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -7,16 +7,16 @@ use crate::arrow::datatypes::{DataType, Schema}; use crate::checkpoint::deleted_file_retention_timestamp_with_time; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; +use crate::object_store::{memory::InMemory, path::Path, ObjectStore}; use crate::utils::test_utils::Action; use crate::DeltaResult; use crate::Table; -use arrow_53::{ +use arrow_55::{ array::{create_array, RecordBatch}, datatypes::Field, }; -use object_store::{memory::InMemory, path::Path, ObjectStore}; use test_utils::delta_path_for_version; use url::Url;