From 435302e42e2f2776435b915698af1e6cd0ca7339 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:03:21 -0700 Subject: [PATCH 01/45] introduce visitors --- kernel/src/actions/visitors.rs | 524 +++++++++++++++++++++++++++++++-- kernel/src/scan/log_replay.rs | 10 +- 2 files changed, 510 insertions(+), 24 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 36a2c7faf..9eef22ed5 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,10 +1,12 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -483,6 +485,270 @@ impl RowVisitor for SidecarVisitor { } } +/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and +/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once +/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) +/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove +/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater +/// than the minimum file retention timestamp). +struct CheckpointFileActionsVisitor<'seen> { + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + total_actions: usize, + total_add_actions: usize, + minimum_file_retention_timestamp: i64, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + /// + /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// A remove action includes a timestamp indicating when the deletion occurred. 
Physical files + /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to + /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until + /// it expires, which happens when the current time exceeds the removal timestamp plus the + /// expiration threshold. + fn is_expired_tombstone<'a>(&self, i: usize, getter: &'a dyn GetData<'a>) -> DeltaResult { + // Ideally this should never be zero, but we are following the same behavior as Delta + // Spark and the Java Kernel. + let mut deletion_timestamp: i64 = 0; + if let Some(ts) = getter.get_opt(i, "remove.deletionTimestamp")? { + deletion_timestamp = ts; + } + + Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) + } + + /// Returns true if the row contains a valid file action to be included in the checkpoint. + fn is_valid_file_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult { + // Add will have a path at index 0 if it is valid; otherwise we may + // have a remove with a path at index 4. In either case, extract the three dv getters at + // indexes that immediately follow a valid path index. + let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { + (path, &getters[1..4], true) + } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { + (path, &getters[6..9], false) + } else { + return Ok(false); + }; + + let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, + dv_getters[2].get_opt(i, "deletionVector.offset")?, + )), + None => None, + }; + + // Check both adds and removes (skipping already-seen) + let file_key = FileActionKey::new(path, dv_unique_id); + if self.check_and_record_seen(file_key) { + return Ok(false); + } + + // Ignore expired tombstones. + if !is_add && self.is_expired_tombstone(i, getters[5])? { + return Ok(false); + } + + if is_add { + self.total_add_actions += 1; + } + + Ok(true) + } +} + +impl RowVisitor for CheckpointFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. ADD + // 2. 
REMOVE + static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 9, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_file_action(i, getters)?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + +/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions +/// in newest-first order, we only keep the first occurrence of: +/// - a protocol action, +/// - a metadata action, +/// - a transaction (txn) action for a given app ID. +/// +/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions +/// have been seen and includes only the first occurrence of each in the selection vector. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { + // Non-file actions state + pub(crate) seen_protocol: bool, + pub(crate) seen_metadata: bool, + pub(crate) seen_txns: &'seen mut HashSet, + pub(crate) selection_vector: Vec, + pub(crate) total_actions: usize, +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointNonFileActionsVisitor<'_> { + /// Returns true if the row contains a protocol action, and we haven’t seen one yet. + fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven’t seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + Ok(self.seen_txns.insert(app_id.to_string())) + } +} + +impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // The data columns visited must be in the following order: + // 1. METADATA + // 2. PROTOCOL + // 3. 
TXN + static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = + LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 3, + Error::InternalError(format!( + "Wrong number of visitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + let should_select = self.is_valid_metadata_action(i, getters[0])? + || self.is_valid_protocol_action(i, getters[1])? + || self.is_valid_txn_action(i, getters[2])?; + + if should_select { + self.selection_vector[i] = true; + self.total_actions += 1; + } + } + Ok(()) + } +} + /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -537,11 +803,13 @@ mod tests { let handler = SyncJsonHandler {}; let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, r#"{"cdc":{"path":"_change_data/age=21/cdc-00000-93f7fceb-281a-446a-b221-07b88132d203.c000.snappy.parquet","partitionValues":{"age":"21"},"size":1033,"dataChange":false}}"#, r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{"tag_foo":"tag_bar"}}}"#, + r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); let output_schema = get_log_schema().clone(); @@ -551,6 +819,18 @@ mod tests { ArrowEngineData::try_from_engine_data(parsed).unwrap() } + fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = 
engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json( + string_array_to_engine_data(json_strings.into()), + output_schema, + ) + .unwrap() + } + #[test] fn test_parse_protocol() -> DeltaResult<()> { let data = action_batch(); @@ -639,8 +919,6 @@ mod tests { #[test] fn test_parse_add_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -650,10 +928,7 @@ mod tests { r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut add_visitor = AddVisitor::default(); add_visitor.visit_rows_of(batch.as_ref()).unwrap(); let add1 = Add { @@ -697,18 +972,13 @@ mod tests { #[test] fn test_parse_remove_partitioned() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut remove_visitor = RemoveVisitor::default(); remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); let expected_remove = Remove { @@ -736,8 +1006,6 @@ mod tests { #[test] fn test_parse_txn() { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); let json_strings: StringArray = vec![ r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, @@ -747,10 +1015,7 @@ mod tests { r#"{"txn":{"appId":"myApp2","version": 4, 
"lastUpdated": 1670892998177}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); + let batch = parse_json_batch(json_strings); let mut txn_visitor = SetTransactionVisitor::default(); txn_visitor.visit_rows_of(batch.as_ref()).unwrap(); let mut actual = txn_visitor.set_transactions; @@ -771,4 +1036,225 @@ mod tests { }) ); } + + #[test] + fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 8], // 8 rows in the action batch + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, // No tombstones are expired + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![true, true, false, false, false, false, false, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 4], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 100, // Threshold set to 100 + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> + { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: true, // Log batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // First one should be included, second one skipped as a duplicate + let expected = vec![true, false]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.total_add_actions, 1); + Ok(()) + 
} + + #[test] + fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 2], + is_log_batch: false, // Checkpoint batch + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + // Both should be included since we don't track duplicates in checkpoint batches + let expected = vec![true, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + let mut visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut HashSet::new(), + selection_vector: vec![false; 3], + is_log_batch: true, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, false]; // Third one is a duplicate + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.total_add_actions, 2); + Ok(()) + } + + #[test] + fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { + let data = action_batch(); + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 8], + total_actions: 0, + }; + + visitor.visit_rows_of(data.as_ref())?; + + let expected = vec![false, false, false, true, true, false, false, true]; + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_metadata, true); + assert_eq!(visitor.seen_protocol, true); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { + let json_strings: StringArray = + vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + let batch = 
parse_json_batch(json_strings); + + // Pre-populate with app1 + let mut seen_txns = HashSet::new(); + seen_txns.insert("app1".to_string()); + + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut seen_txns, + selection_vector: vec![false; 1], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false]; // Transaction should be skipped as it's already seen + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction + assert_eq!(visitor.total_actions, 0); + Ok(()) + } + + #[test] + fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( + ) -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + + // Set protocol and metadata as already seen + let mut visitor = CheckpointNonFileActionsVisitor { + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen + seen_txns: &mut HashSet::new(), + selection_vector: vec![false; 2], + total_actions: 0, + }; + + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![false, false]; // Both should be skipped + assert_eq!(visitor.selection_vector, expected); + assert_eq!(visitor.total_actions, 0); + Ok(()) + } } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f..b0d3ea8f0 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -19,12 +19,12 @@ use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication /// of adds and removes during log replay. #[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, } impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); Self { path, dv_unique_id } } @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. 
For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From e500a107abe4c818bd1c451a70bf965124857f05 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:04:35 -0700 Subject: [PATCH 02/45] remove pub --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b0d3ea8f0..dbcd056df 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -59,7 +59,7 @@ impl AddRemoveDedupVisitor<'_> { /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - pub fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json From 19733cd003eb7a72d962f9cf1d1556e26d2f7f77 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Mar 2025 22:28:59 -0700 Subject: [PATCH 03/45] assert! instead of assert_eq with bool --- kernel/src/actions/visitors.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9eef22ed5..3ade3d914 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -824,10 +824,7 @@ mod tests { let json_handler = engine.get_json_handler(); let output_schema = get_log_schema().clone(); json_handler - .parse_json( - string_array_to_engine_data(json_strings.into()), - output_schema, - ) + .parse_json(string_array_to_engine_data(json_strings), output_schema) .unwrap() } @@ -1197,8 +1194,8 @@ mod tests { let expected = vec![false, false, false, true, true, false, false, true]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_metadata, true); - assert_eq!(visitor.seen_protocol, true); + assert!(visitor.seen_metadata); + assert!(visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) From 87c9f31f97a0d7a22e07c337b6c92ee9945c19df Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 13:22:09 -0700 Subject: [PATCH 04/45] log replay for checkpoints --- kernel/src/actions/visitors.rs | 116 +++++--------- kernel/src/checkpoints/log_replay.rs | 229 +++++++++++++++++++++++++++ kernel/src/checkpoints/mod.rs | 1 + kernel/src/lib.rs | 1 + kernel/src/path.rs | 17 ++ kernel/src/utils.rs | 25 ++- 6 files changed, 315 insertions(+), 74 deletions(-) create mode 100644 kernel/src/checkpoints/log_replay.rs create mode 100644 kernel/src/checkpoints/mod.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d914..e0e622b05 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -491,13 +491,13 @@ impl RowVisitor for SidecarVisitor { /// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). 
-struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - total_actions: usize, - total_add_actions: usize, - minimum_file_retention_timestamp: i64, +pub(crate) struct CheckpointFileActionsVisitor<'seen> { + pub(crate) seen_file_keys: &'seen mut HashSet, + pub(crate) selection_vector: &'seen mut Vec, + pub(crate) is_log_batch: bool, + pub(crate) total_actions: usize, + pub(crate) total_add_actions: usize, + pub(crate) minimum_file_retention_timestamp: i64, } #[allow(unused)] // TODO: Remove flag once used for checkpoint writing @@ -653,10 +653,10 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, + pub(crate) seen_protocol: &'seen mut bool, + pub(crate) seen_metadata: &'seen mut bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, + pub(crate) selection_vector: &'seen mut Vec, pub(crate) total_actions: usize, } @@ -668,8 +668,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !*self.seen_protocol { + *self.seen_protocol = true; Ok(true) } else { Ok(false) @@ -682,8 +682,8 @@ impl CheckpointNonFileActionsVisitor<'_> { i: usize, getter: &'a dyn GetData<'a>, ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; + if getter.get_str(i, "metaData.id")?.is_some() && !*self.seen_metadata { + *self.seen_metadata = true; Ok(true) } else { Ok(false) @@ -777,30 +777,13 @@ pub(crate) fn visit_deletion_vector_at<'a>( #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::parse_json_batch; + use crate::EngineData; use super::*; - use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, - }; - - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, 
r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,20 +795,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - fn parse_json_batch(json_strings: StringArray) -> Box { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); - let output_schema = get_log_schema().clone(); - json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap() + parse_json_batch(json_strings) } #[test] @@ -1039,7 +1009,7 @@ mod tests { let data = action_batch(); let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: &mut vec![false; 8], // 8 rows in the action batch is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1049,7 +1019,7 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); @@ -1070,7 +1040,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], + selection_vector: &mut vec![false; 4], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1080,7 +1050,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); @@ -1099,7 +1069,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: true, // Log batch total_actions: 0, total_add_actions: 0, @@ -1110,7 +1080,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 1); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); @@ -1130,7 +1100,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], is_log_batch: false, // Checkpoint batch total_actions: 0, total_add_actions: 0, @@ -1141,7 +1111,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches assert_eq!(visitor.total_actions, 2); 
assert_eq!(visitor.total_add_actions, 2); @@ -1162,7 +1132,7 @@ mod tests { let mut visitor = CheckpointFileActionsVisitor { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], + selection_vector: &mut vec![false; 3], is_log_batch: true, total_actions: 0, total_add_actions: 0, @@ -1172,7 +1142,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_file_keys.len(), 2); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); @@ -1183,19 +1153,19 @@ mod tests { fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { let data = action_batch(); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], + selection_vector: &mut vec![false; 8], total_actions: 0, }; visitor.visit_rows_of(data.as_ref())?; let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); - assert!(visitor.seen_protocol); + assert_eq!(*visitor.selection_vector, expected); + assert!(*visitor.seen_metadata); + assert!(*visitor.seen_protocol); assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_actions, 3); Ok(()) @@ -1212,17 +1182,17 @@ mod tests { seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: &mut false, + seen_metadata: &mut false, seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: &mut vec![false; 1], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false]; // Transaction should be skipped as it's already seen - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); Ok(()) @@ -1240,17 +1210,17 @@ mod tests { // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen + seen_protocol: &mut true, // Already seen + seen_metadata: &mut true, // Already seen seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + selection_vector: &mut vec![false; 2], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false]; // Both should be skipped - assert_eq!(visitor.selection_vector, expected); + assert_eq!(*visitor.selection_vector, expected); assert_eq!(visitor.total_actions, 0); Ok(()) } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs new file mode 100644 index 000000000..a632fd336 --- /dev/null +++ b/kernel/src/checkpoints/log_replay.rs @@ -0,0 +1,229 @@ +use std::collections::HashSet; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; +use crate::engine_data::RowVisitor; +use crate::scan::log_replay::FileActionKey; +use crate::{DeltaResult, EngineData}; + +/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// replay to include only those that 
should be included in a V1 checkpoint. +struct LogReplayForCheckpoints { + /// Tracks file actions that have been seen during log replay to avoid duplicates. + /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. + seen_file_keys: HashSet, + + /// Counter for the total number of actions processed during log replay. + total_actions: Arc, + + /// Counter for the total number of add actions processed during log replay. + total_add_actions: Arc, + + /// Indicates whether a protocol action has been seen in the log. + seen_protocol: bool, + + /// Indicates whether a metadata action has been seen in the log. + seen_metadata: bool, + + /// Set of transaction app IDs that have been processed to avoid duplicates. + seen_txns: HashSet, + + /// Minimum timestamp for file retention, used for filtering expired tombstones. + minimum_file_retention_timestamp: i64, +} + +impl LogReplayForCheckpoints { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } + } + + /// Iterates over actions and filters them for inclusion in a V1 checkpoint. + /// + /// This function processes batches of actions in reverse chronological order + /// (from most recent to least recent) and performs the necessary filtering + /// to ensure the checkpoint contains only the actions needed to reconstruct + /// the complete state of the table. + /// + /// # Filtering Rules + /// + /// The following rules apply when filtering actions: + /// + /// 1. Only the most recent protocol and metadata actions are included + /// 2. For each app ID, only the most recent transaction action is included + /// 3. File actions are deduplicated based on path and unique ID + /// 4. 
Tombstones older than `minimum_file_retention_timestamp` are excluded + pub(super) fn process_v1_checkpoint_batch( + &mut self, + actions: Box, + is_log_batch: bool, + ) -> DeltaResult<(Box, Vec)> { + // Initialize selection vector with all rows un-selected + let mut selection_vector = vec![false; actions.len()]; + assert_eq!( + selection_vector.len(), + actions.len(), + "Initial selection vector length does not match actions length" + ); + + // Create the non file actions visitor to process non file actions and update selection vector + let mut non_file_actions_visitor = CheckpointNonFileActionsVisitor { + seen_protocol: &mut self.seen_protocol, + seen_metadata: &mut self.seen_metadata, + seen_txns: &mut self.seen_txns, + selection_vector: &mut selection_vector, + total_actions: 0, + }; + + // Process actions and let visitor update selection vector + non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with non-file action counts from this batch + self.total_actions + .fetch_add(non_file_actions_visitor.total_actions, Ordering::Relaxed); + + // Create the file actions visitor to process file actions and update selection vector + let mut file_actions_visitor = CheckpointFileActionsVisitor { + seen_file_keys: &mut self.seen_file_keys, + is_log_batch, + selection_vector: &mut selection_vector, + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp: self.minimum_file_retention_timestamp, + }; + + // Process actions and let visitor update selection vector + file_actions_visitor.visit_rows_of(actions.as_ref())?; + + // Update shared counters with file action counts from this batch + self.total_actions + .fetch_add(file_actions_visitor.total_actions, Ordering::Relaxed); + self.total_add_actions + .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); + + Ok((actions, selection_vector)) + } +} + +/// Given an iterator of (engine_data, bool) tuples, returns an iterator of +/// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ +/// be written to the V1 checkpoint file in order to capture the table version's complete state. +/// Non-selected rows _must_ be ignored. The boolean flag indicates whether the record batch +/// is a log or checkpoint batch. +/// +/// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in +/// the log from most recent to least recent. 
+pub(crate) fn v1_checkpoint_actions_iter( + action_iter: impl Iterator, bool)>> + Send + 'static, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, +) -> impl Iterator, Vec)>> + Send + 'static { + let mut log_scanner = LogReplayForCheckpoints::new( + total_actions_counter, + total_add_actions_counter, + minimum_file_retention_timestamp, + ); + + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + }) + // Only yield batches that have at least one selected row + .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use crate::arrow::array::StringArray; + use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::utils::test_utils::parse_json_batch; + use crate::DeltaResult; + + /// Tests the end-to-end processing of multiple batches with various action types + /// This tests the integration of the visitors with the main iterator function. + /// More granular testing is performed in the individual visitor tests. + #[test] + fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { + // Setup counters + let total_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + + // Create first batch with protocol, metadata, and some files + let json_strings1: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test2","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + // Create second batch with some duplicates and new files + let json_strings2: StringArray = vec![ + // Protocol and metadata should be skipped as duplicates + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"test1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + // New files + r#"{"add":{"path":"file3","partitionValues":{},"size":800,"modificationTime":102,"dataChange":true}}"#, + // Duplicate file should be skipped + 
r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, // Transaction + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"# + ].into(); + + // Create third batch with all duplicate actions (should be filtered out completely) + let json_strings3: StringArray = vec![ + r#"{"add":{"path":"file1","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"file2","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + ].into(); + + let input_batches = vec![ + Ok((parse_json_batch(json_strings1), true)), + Ok((parse_json_batch(json_strings2), true)), + Ok((parse_json_batch(json_strings3), true)), + ]; + + // Run the iterator + let results: Vec<_> = v1_checkpoint_actions_iter( + input_batches.into_iter(), + total_actions_counter.clone(), + total_add_actions_counter.clone(), + 0, + ) + .collect::, _>>()?; + + // Expect two batches in results (third batch should be filtered)" + assert_eq!(results.len(), 2); + + // First batch should have all rows selected + let (_, selection_vector1) = &results[0]; + assert_eq!(selection_vector1, &vec![true, true, true, true]); + + // Second batch should have only new file and transaction selected + let (_, selection_vector2) = &results[1]; + assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + + // Verify counters + // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) + assert_eq!(total_actions_counter.load(Ordering::Relaxed), 6); + + // 3 add actions (2 from batch1 + 1 from batch2) + assert_eq!(total_add_actions_counter.load(Ordering::Relaxed), 3); + + Ok(()) + } +} diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs new file mode 100644 index 000000000..826ff771f --- /dev/null +++ b/kernel/src/checkpoints/mod.rs @@ -0,0 +1 @@ +pub mod log_replay; diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 65a0a6ab5..bf2476921 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -74,6 +74,7 @@ use url::Url; use self::schema::{DataType, SchemaRef}; pub mod actions; +pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; diff --git a/kernel/src/path.rs b/kernel/src/path.rs index df372f08e..f9988cc8a 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,6 +196,23 @@ impl ParsedLogPath { } Ok(path) } + + /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version + pub(crate) fn new_v1_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + let filename = format!("{:020}.checkpoint.parquet", version); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? 
+ .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_commit created a non-checkpoint path", + )); + } + Ok(path) + } } #[cfg(test)] diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index fd2db2501..7713e042a 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -22,11 +22,15 @@ pub(crate) mod test_utils { use tempfile::TempDir; use test_utils::delta_path_for_version; + use crate::actions::get_log_schema; + use crate::arrow::array::StringArray; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::engine::sync::SyncEngine; use crate::{ actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, engine::arrow_data::ArrowEngineData, - EngineData, }; + use crate::{Engine, EngineData}; #[derive(Serialize)] pub(crate) enum Action { @@ -97,4 +101,23 @@ pub(crate) mod test_utils { pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { assert_eq!(into_record_batch(actual), into_record_batch(expected)); } + + /// Converts a `StringArray` to an `EngineData` object + pub(crate) fn string_array_to_engine_data(string_array: StringArray) -> Box { + let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); + let schema = Arc::new(ArrowSchema::new(vec![string_field])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) + .expect("Can't convert to record batch"); + Box::new(ArrowEngineData::new(batch)) + } + + /// Parses a batch of JSON strings into an `EngineData` object + pub(crate) fn parse_json_batch(json_strings: StringArray) -> Box { + let engine = SyncEngine::new(); + let json_handler = engine.get_json_handler(); + let output_schema = get_log_schema().clone(); + json_handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + } } From db5ccd05ba8be030fd8941d4b0025fcbe1372d49 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:29:15 -0700 Subject: [PATCH 05/45] rename & some clean up --- kernel/src/checkpoints/log_replay.rs | 17 ++++++++++------- kernel/src/engine/arrow_data.rs | 19 +++---------------- kernel/src/engine/default/json.rs | 9 +-------- kernel/src/scan/mod.rs | 15 +++------------ 4 files changed, 17 insertions(+), 43 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index a632fd336..4bd6c3448 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -7,9 +7,10 @@ use crate::engine_data::RowVisitor; use crate::scan::log_replay::FileActionKey; use crate::{DeltaResult, EngineData}; -/// `LogReplayForCheckpoints` is responsible for filtering actions during log +/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. -struct LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +struct V1CheckpointLogReplayScanner { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. 
seen_file_keys: HashSet, @@ -33,7 +34,8 @@ struct LogReplayForCheckpoints { minimum_file_retention_timestamp: i64, } -impl LogReplayForCheckpoints { +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl V1CheckpointLogReplayScanner { pub(super) fn new( total_actions_counter: Arc, total_add_actions_counter: Arc, @@ -65,7 +67,7 @@ impl LogReplayForCheckpoints { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn process_v1_checkpoint_batch( + pub(super) fn filter_v1_checkpoint_actions( &mut self, actions: Box, is_log_batch: bool, @@ -125,13 +127,14 @@ impl LogReplayForCheckpoints { /// /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// the log from most recent to least recent. +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn v1_checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = LogReplayForCheckpoints::new( + let mut log_scanner = V1CheckpointLogReplayScanner::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -140,7 +143,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_v1_checkpoint_batch(batch, is_log_batch) + log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -156,7 +159,7 @@ mod tests { use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; - /// Tests the end-to-end processing of multiple batches with various action types + /// Tests the end-to-end processing of multiple batches with various action types. /// This tests the integration of the visitors with the main iterator function. /// More granular testing is performed in the individual visitor tests. 
#[test] diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 988380901..b09b27ff9 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -294,27 +294,14 @@ impl ArrowEngineData { #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use crate::{ actions::{get_log_schema, Metadata, Protocol}, engine::sync::SyncEngine, - DeltaResult, Engine, EngineData, + DeltaResult, Engine, }; - use super::ArrowEngineData; - - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_md_extract() -> DeltaResult<()> { let engine = SyncEngine::new(); diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 98a9b0dc7..8b401a3d4 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -257,6 +257,7 @@ mod tests { use crate::engine::default::executor::tokio::{ TokioBackgroundExecutor, TokioMultiThreadExecutor, }; + use crate::utils::test_utils::string_array_to_engine_data; use futures::future; use itertools::Itertools; use object_store::local::LocalFileSystem; @@ -471,14 +472,6 @@ mod tests { } } - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - #[test] fn test_parse_json() { let store = Arc::new(LocalFileSystem::new()); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index ccdff3d66..689a6eab3 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -663,8 +663,8 @@ pub fn selection_vector( // some utils that are used in file_stream.rs and state.rs tests #[cfg(test)] pub(crate) mod test_utils { - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::StringArray; + use crate::utils::test_utils::string_array_to_engine_data; use itertools::Itertools; use std::sync::Arc; @@ -676,20 +676,11 @@ pub(crate) mod test_utils { }, scan::log_replay::scan_action_iter, schema::SchemaRef, - EngineData, JsonHandler, + JsonHandler, }; use super::{state::ScanCallback, Transform}; - // TODO(nick): Merge all copies of this into one "test utils" thing - fn string_array_to_engine_data(string_array: StringArray) -> Box { - let string_field = Arc::new(Field::new("a", DataType::Utf8, true)); - let schema = Arc::new(ArrowSchema::new(vec![string_field])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(string_array)]) - .expect("Can't convert to record batch"); - Box::new(ArrowEngineData::new(batch)) - } - // Generates a batch of sidecar actions with the given paths. // The schema is provided as null columns affect equality checks. 
pub(crate) fn sidecar_batch_with_given_paths( From 42c08c1f439a5d20adcba1f56df74b3e65b469ec Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Mar 2025 15:43:25 -0700 Subject: [PATCH 06/45] remove new path for now --- kernel/src/path.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/src/path.rs b/kernel/src/path.rs index f9988cc8a..df372f08e 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -196,23 +196,6 @@ impl ParsedLogPath { } Ok(path) } - - /// Create a new ParsedCommitPath for a new parquet v1 checkpoint file at the specified version - pub(crate) fn new_v1_checkpoint( - table_root: &Url, - version: Version, - ) -> DeltaResult> { - let filename = format!("{:020}.checkpoint.parquet", version); - let location = table_root.join("_delta_log/")?.join(&filename)?; - let path = Self::try_from(location)? - .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; - if !path.is_checkpoint() { - return Err(Error::internal_error( - "ParsedLogPath::new_commit created a non-checkpoint path", - )); - } - Ok(path) - } } #[cfg(test)] From f91baebe5af22c4c01a7529fdf9967ffa04c510f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 21 Mar 2025 19:33:35 -0700 Subject: [PATCH 07/45] merge non file action visitor tests --- kernel/src/actions/visitors.rs | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 3ade3d914..150beffe6 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -784,10 +784,8 @@ mod tests { use super::*; use crate::{ - actions::get_log_schema, - engine::arrow_data::ArrowEngineData, - engine::sync::{json::SyncJsonHandler, SyncEngine}, - Engine, EngineData, JsonHandler, + actions::get_log_schema, engine::arrow_data::ArrowEngineData, engine::sync::SyncEngine, + Engine, EngineData, }; // TODO(nick): Merge all copies of this into one "test utils" thing @@ -799,8 +797,7 @@ mod tests { Box::new(ArrowEngineData::new(batch)) } - fn action_batch() -> Box { - let handler = SyncJsonHandler {}; + fn action_batch() -> Box { let json_strings: StringArray = vec![ r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, @@ -812,11 +809,7 @@ mod tests { r#"{"txn":{"appId":"myApp","version": 3}}"#, ] .into(); - let output_schema = get_log_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() + parse_json_batch(json_strings) } fn parse_json_batch(json_strings: StringArray) -> Box { @@ -1202,26 +1195,30 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_txn_already_seen() -> DeltaResult<()> { - let json_strings: StringArray = - vec![r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#].into(); + fn 
test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); - // Pre-populate with app1 + // Pre-populate with txn app1 let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, + seen_protocol: true, // Already seen + seen_metadata: true, // Already seen seen_txns: &mut seen_txns, - selection_vector: vec![false; 1], + selection_vector: vec![false; 3], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false]; // Transaction should be skipped as it's already seen + // All actions should be skipped as they have already been seen + let expected = vec![false; 3]; assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction assert_eq!(visitor.total_actions, 0); @@ -1229,29 +1226,32 @@ mod tests { } #[test] - fn test_checkpoint_non_file_actions_visitor_protocol_and_metadata_already_seen( - ) -> DeltaResult<()> { + fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableChangeDataFeed":"true"},"createdTime":1677811175819}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - // Set protocol and metadata as already seen let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 2], + seen_protocol: false, + seen_metadata: false, + seen_txns: &mut HashSet::new(), // Empty set + selection_vector: vec![false; 
6], total_actions: 0, }; visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false]; // Both should be skipped + let expected = vec![true, false, true, false, true, false]; assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.total_actions, 0); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_actions, 3); Ok(()) } } From 9fdfba70f63371a72bc624f7228e6f92f7760ab6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:22:56 -0700 Subject: [PATCH 08/45] mvp for refactor --- kernel/src/actions/visitors.rs | 80 ++++-------- kernel/src/checkpoints/log_replay.rs | 78 ++++++----- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 154 ++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 188 ++++++++++++--------------- 5 files changed, 311 insertions(+), 190 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index e0e622b05..c348c92e2 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -3,10 +3,9 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, FileActionVisitor}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -500,40 +499,30 @@ pub(crate) struct CheckpointFileActionsVisitor<'seen> { pub(crate) minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen_file_keys.insert(key); - } - false - } +impl FileActionVisitor for CheckpointFileActionsVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys } + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + Some(4) + } + + fn add_dv_start_index(&self) -> usize { + 1 + } + + fn remove_dv_start_index(&self) -> Option { + Some(6) + } +} + +#[allow(unused)] // TODO: Remove flag once used for checkpoint writing +impl CheckpointFileActionsVisitor<'_> { /// A remove action includes a timestamp indicating when the deletion occurred. Physical files /// are deleted lazily after a user-defined expiration time, allowing concurrent readers to /// access stale snapshots. A remove action remains as a tombstone in a checkpoint file until @@ -556,29 +545,14 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.check_and_record_seen(file_key, self.is_log_batch) { return Ok(false); } diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 4bd6c3448..98600a821 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,13 +4,13 @@ use std::sync::Arc; use crate::actions::visitors::{CheckpointFileActionsVisitor, CheckpointNonFileActionsVisitor}; use crate::engine_data::RowVisitor; -use crate::scan::log_replay::FileActionKey; +use crate::log_replay::{FileActionKey, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; -/// `V1CheckpointLogReplayScanner` is responsible for filtering actions during log +/// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -struct V1CheckpointLogReplayScanner { +struct CheckpointLogReplayProcessor { /// Tracks file actions that have been seen during log replay to avoid duplicates. /// Contains (data file path, dv_unique_id) pairs as `FileActionKey` instances. 
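As an aside, the tombstone-expiry rule enforced by is_expired_tombstone above reduces to a single comparison against a retention cutoff. A minimal standalone sketch of that rule (the timestamps, the 7-day figure, and the function names here are illustrative assumptions, not values taken from this patch):

    fn minimum_file_retention_timestamp(now_ms: i64, retention_ms: i64) -> i64 {
        // The cutoff is "now minus the retention period"; anything deleted at or
        // before this instant no longer needs a tombstone in the checkpoint.
        now_ms - retention_ms
    }

    fn is_expired(deletion_timestamp_ms: i64, cutoff_ms: i64) -> bool {
        deletion_timestamp_ms <= cutoff_ms
    }

    fn main() {
        let now_ms: i64 = 1_700_000_000_000;              // hypothetical "current time" in ms
        let retention_ms: i64 = 7 * 24 * 60 * 60 * 1000;  // hypothetical 7-day retention window
        let cutoff = minimum_file_retention_timestamp(now_ms, retention_ms);
        assert!(is_expired(cutoff - 1, cutoff));   // deleted before the cutoff: drop the tombstone
        assert!(!is_expired(cutoff + 1, cutoff));  // deleted after the cutoff: keep the tombstone
    }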
seen_file_keys: HashSet, @@ -34,26 +34,10 @@ struct V1CheckpointLogReplayScanner { minimum_file_retention_timestamp: i64, } -#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -impl V1CheckpointLogReplayScanner { - pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, - minimum_file_retention_timestamp: i64, - ) -> Self { - Self { - seen_file_keys: Default::default(), - total_actions: total_actions_counter, - total_add_actions: total_add_actions_counter, - seen_protocol: false, - seen_metadata: false, - seen_txns: Default::default(), - minimum_file_retention_timestamp, - } - } +impl LogReplayProcessor for CheckpointLogReplayProcessor { + // Define the processing result type as a tuple of the data and selection vector + type ProcessingResult = (Box, Vec); - /// Iterates over actions and filters them for inclusion in a V1 checkpoint. - /// /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering /// to ensure the checkpoint contains only the actions needed to reconstruct @@ -67,16 +51,16 @@ impl V1CheckpointLogReplayScanner { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. Tombstones older than `minimum_file_retention_timestamp` are excluded - pub(super) fn filter_v1_checkpoint_actions( + fn process_batch( &mut self, - actions: Box, + batch: Box, is_log_batch: bool, - ) -> DeltaResult<(Box, Vec)> { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected - let mut selection_vector = vec![false; actions.len()]; + let mut selection_vector = vec![false; batch.len()]; assert_eq!( selection_vector.len(), - actions.len(), + batch.len(), "Initial selection vector length does not match actions length" ); @@ -90,7 +74,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - non_file_actions_visitor.visit_rows_of(actions.as_ref())?; + non_file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with non-file action counts from this batch self.total_actions @@ -107,7 +91,7 @@ impl V1CheckpointLogReplayScanner { }; // Process actions and let visitor update selection vector - file_actions_visitor.visit_rows_of(actions.as_ref())?; + file_actions_visitor.visit_rows_of(batch.as_ref())?; // Update shared counters with file action counts from this batch self.total_actions @@ -115,7 +99,31 @@ impl V1CheckpointLogReplayScanner { self.total_add_actions .fetch_add(file_actions_visitor.total_add_actions, Ordering::Relaxed); - Ok((actions, selection_vector)) + Ok((batch, selection_vector)) + } + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +#[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented +impl CheckpointLogReplayProcessor { + pub(super) fn new( + total_actions_counter: Arc, + total_add_actions_counter: Arc, + minimum_file_retention_timestamp: i64, + ) -> Self { + Self { + seen_file_keys: Default::default(), + total_actions: total_actions_counter, + total_add_actions: total_add_actions_counter, + seen_protocol: false, + seen_metadata: false, + seen_txns: Default::default(), + minimum_file_retention_timestamp, + } } } @@ -128,13 +136,13 @@ impl V1CheckpointLogReplayScanner { /// Note: The iterator of (engine_data, bool) tuples must be sorted by the order of the actions in /// 
the log from most recent to least recent. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented -pub(crate) fn v1_checkpoint_actions_iter( +pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator, Vec)>> + Send + 'static { - let mut log_scanner = V1CheckpointLogReplayScanner::new( + let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, @@ -143,7 +151,7 @@ pub(crate) fn v1_checkpoint_actions_iter( action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.filter_v1_checkpoint_actions(batch, is_log_batch) + log_scanner.process_batch(batch, is_log_batch) }) // Only yield batches that have at least one selected row .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) @@ -155,7 +163,7 @@ mod tests { use std::sync::Arc; use crate::arrow::array::StringArray; - use crate::checkpoints::log_replay::v1_checkpoint_actions_iter; + use crate::checkpoints::log_replay::checkpoint_actions_iter; use crate::utils::test_utils::parse_json_batch; use crate::DeltaResult; @@ -201,7 +209,7 @@ mod tests { ]; // Run the iterator - let results: Vec<_> = v1_checkpoint_actions_iter( + let results: Vec<_> = checkpoint_actions_iter( input_batches.into_iter(), total_actions_counter.clone(), total_add_actions_counter.clone(), diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index bf2476921..787d2a482 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -78,6 +78,7 @@ pub mod checkpoints; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 000000000..e545f2408 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,154 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, LazyLock, +}; + +use itertools::Itertools; +use tracing::debug; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::actions::get_log_add_schema; +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; + +#[derive(Debug, Hash, Eq, PartialEq)] +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +pub struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} + +impl FileActionKey { + pub fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Trait defining the interface for log replay processors that process and filter +/// Delta Lake log actions based on different strategies. 
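Before the trait definition below, a standalone sketch of the shape this abstraction takes: a processor turns each batch into a (batch, selection vector) pair, and the driving iterator keeps only batches that select at least one row. Types are deliberately simplified (plain strings instead of EngineData, no error handling), so this illustrates the pattern rather than the kernel's actual API:

    trait Processor {
        // Produce the batch together with a selection vector marking the rows to keep.
        fn process_batch(&mut self, batch: Vec<String>, is_log_batch: bool) -> (Vec<String>, Vec<bool>);
    }

    struct KeepNonEmpty;

    impl Processor for KeepNonEmpty {
        fn process_batch(&mut self, batch: Vec<String>, _is_log_batch: bool) -> (Vec<String>, Vec<bool>) {
            let selection = batch.iter().map(|row| !row.is_empty()).collect();
            (batch, selection)
        }
    }

    fn main() {
        let batches = vec![
            (vec!["add".to_string(), String::new()], true),
            (vec![String::new()], false),
        ];
        let mut processor = KeepNonEmpty;
        let kept: Vec<_> = batches
            .into_iter()
            .map(|(batch, is_log_batch)| processor.process_batch(batch, is_log_batch))
            // Mirror of the wiring above: only yield batches with at least one selected row.
            .filter(|(_, selection)| selection.contains(&true))
            .collect();
        assert_eq!(kept.len(), 1);
    }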
+pub trait LogReplayProcessor { + /// The type of results produced by this processor + type ProcessingResult; + + /// Process a batch of actions and return the filtered result + fn process_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; +} + +/// Base trait for visitors that process file actions during log replay +pub trait FileActionVisitor { + /// Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys().contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, is_log_batch + ); + if is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys().insert(key); + } + false + } + } + + /// Index in getters array for add.path + fn add_path_index(&self) -> usize; + + /// Index in getters array for remove.path + fn remove_path_index(&self) -> Option; + + /// Starting index for add action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn add_dv_start_index(&self) -> usize; + + /// Starting index for remove action's deletion vector getters + /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) + fn remove_dv_start_index(&self) -> Option; + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + is_add: bool, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = if is_add { + self.add_dv_start_index() + } else if let Some(idx) = self.remove_dv_start_index() { + idx + } else { + return Err(Error::GenericError { + source: "DV getters should exist".into(), + }); + }; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? 
{ + let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + let Some(remove_idx) = self.remove_path_index() else { + return Ok(None); + }; + + // Try to extract a remove action path + if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; + let file_key = FileActionKey::new(path, dv_unique_id); + return Ok(Some((file_key, false))); + } + + // No path found, not a file action + Ok(None) + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056df..b2c56c026 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,41 +3,29 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -45,7 +33,7 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, + seen_file_keys: &'seen mut HashSet, selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, @@ -54,37 +42,37 @@ struct AddRemoveDedupVisitor<'seen> { is_log_batch: bool, } -impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. 
- /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true +impl FileActionVisitor for AddRemoveDedupVisitor<'_> { + fn seen_file_keys(&mut self) -> &mut HashSet { + self.seen_file_keys + } + + fn add_path_index(&self) -> usize { + 0 + } + + fn remove_path_index(&self) -> Option { + if self.is_log_batch { + Some(5) } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + None // No remove action getters when not a log batch } } + fn add_dv_start_index(&self) -> usize { + 2 + } + + fn remove_dv_start_index(&self) -> Option { + if self.is_log_batch { + Some(6) + } else { + None // No remove action getters when not a log batch + } + } +} + +impl AddRemoveDedupVisitor<'_> { fn parse_partition_value( &self, field_idx: usize, @@ -162,28 +150,12 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + // Retrieve the file action key and whether it is an add action + let Some((file_key, is_add)) = self.extract_file_action(i, getters)? else { + // Not a file action return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
// @@ -203,8 +175,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.check_and_record_seen(file_key, self.is_log_batch) || !is_add { return Ok(false); } let transform = self @@ -310,48 +281,70 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type ProcessingResult = ScanData; - fn process_scan_batch( + fn process_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + seen_file_keys: &mut self.seen_file_keys(), selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), + partition_filter, row_transform_exprs: Vec::new(), is_log_batch, }; - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
let selection_vector = visitor.selection_vector; - let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -365,22 +358,13 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); + let mut log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + action_iter .map(move |action_res| { let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) + log_scanner.process_batch(batch, is_log_batch) }) .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) } From d420fd1fd2ad5e3d172052b99698b4929178d1e8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 24 Mar 2025 15:31:31 -0700 Subject: [PATCH 09/45] these github action checks clog my screen --- kernel/src/log_replay.rs | 13 +++---------- kernel/src/scan/log_replay.rs | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e545f2408..cfd4a10c0 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,16 +1,9 @@ -use std::collections::{HashMap, HashSet}; -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, LazyLock, -}; - -use itertools::Itertools; +use std::collections::HashSet; use tracing::debug; use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::actions::get_log_add_schema; -use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; +use crate::engine_data::{GetData, TypedGetData as _}; +use crate::{DeltaResult, EngineData, Error}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b2c56c026..8dce0ed6f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -11,7 +11,7 @@ use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionKey, FileActionVisitor, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use 
crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; From 9e0e0483a88f995fd55ac6755caf4bf473325a82 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 14:34:44 -0700 Subject: [PATCH 10/45] base file actions struct --- kernel/src/actions/visitors.rs | 272 ++++++++++++++++++++++++--------- kernel/src/scan/log_replay.rs | 104 +++++-------- 2 files changed, 240 insertions(+), 136 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 150beffe6..9a04411e1 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -492,9 +492,7 @@ impl RowVisitor for SidecarVisitor { /// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater /// than the minimum file retention timestamp). struct CheckpointFileActionsVisitor<'seen> { - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, + deduplicator: FileActionDeduplicator<'seen>, total_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -502,35 +500,22 @@ struct CheckpointFileActionsVisitor<'seen> { #[allow(unused)] // TODO: Remove flag once used for checkpoint writing impl CheckpointFileActionsVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - /// - /// TODO: This method is a duplicate of AddRemoveDedupVisior's method! - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false + /// Create a new CheckpointFileActionsVisitor + fn new( + seen_file_keys: &mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + minimum_file_retention_timestamp: i64, + ) -> CheckpointFileActionsVisitor<'_> { + CheckpointFileActionsVisitor { + deduplicator: FileActionDeduplicator::new( + seen_file_keys, + selection_vector, + is_log_batch, + ), + total_actions: 0, + total_add_actions: 0, + minimum_file_retention_timestamp, } } @@ -556,29 +541,17 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise we may - // have a remove with a path at index 4. 
In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, getters, 0, // add_path_index + 4, // remove_path_index + 1, // add_dv_start_index + 6, // remove_dv_start_index + false, // Never skip remove actions (even if we're processing a log batch) + )? + else { return Ok(false); }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - - // Check both adds and removes (skipping already-seen) - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -634,7 +607,7 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { let should_select = self.is_valid_file_action(i, getters)?; if should_select { - self.selection_vector[i] = true; + self.deduplicator.selection_vector[i] = true; self.total_actions += 1; } } @@ -642,6 +615,145 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { } } +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the CheckpointVisitor +/// and the ScanDataVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. 
We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract deletion vector unique ID + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_dv_start_index: Option, + remove_dv_start_index: Option, + ) -> DeltaResult> { + // Get the starting index based on action type + let start_idx = add_dv_start_index + .or(remove_dv_start_index) + .ok_or_else(|| Error::GenericError { + source: "starting indices for add/remove DVs should have been passed".into(), + })?; + + // Extract the DV unique ID + match getters[start_idx].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, + getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, + ))), + None => Ok(None), + } + } + + /// Extract file action key and determine if it's an add operation + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action path + if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor does not include remove action getters when + // dealing with non-log batches (since they are not needed for deduplication). + // In this case, we should skip remove actions. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action path + if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = + self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // If we didn't find an add or remove action, return None + return Ok(None); + } + + /// Get the selection vector + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + /// Get reference to the selection vector + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + /// Get mutable reference to the selection vector + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Get whether we are processing a log batch + pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} + /// A visitor that selects non-file actions for a checkpoint file. 
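To make the deduplication rule above concrete, a minimal standalone sketch of check_and_record_seen: the first sighting of a (path, dv id) key wins, and keys are remembered only for commit (log) batches because checkpoint batches are already the oldest actions. The tuple key here is a simplification, not the kernel's FileActionKey:

    use std::collections::HashSet;

    fn check_and_record_seen(
        seen: &mut HashSet<(String, Option<String>)>,
        key: (String, Option<String>),
        is_log_batch: bool,
    ) -> bool {
        if seen.contains(&key) {
            return true; // a newer action for this file was already processed, so ignore this one
        }
        if is_log_batch {
            // Only commit batches are recorded; checkpoint batches never replace anything.
            seen.insert(key);
        }
        false
    }

    fn main() {
        let mut seen = HashSet::new();
        let key = ("part-0.parquet".to_string(), None);
        assert!(!check_and_record_seen(&mut seen, key.clone(), true)); // first sighting: keep
        assert!(check_and_record_seen(&mut seen, key, true));          // older duplicate: skip
    }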
Since log replay visits actions /// in newest-first order, we only keep the first occurrence of: /// - a protocol action, @@ -1030,10 +1142,13 @@ mod tests { #[test] fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { let data = action_batch(); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], // 8 rows in the action batch + selection_vector: vec![false; 8], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, // No tombstones are expired @@ -1042,8 +1157,8 @@ mod tests { visitor.visit_rows_of(data.as_ref())?; let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1061,10 +1176,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 4], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 100, // Threshold set to 100 @@ -1073,8 +1191,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 0); Ok(()) @@ -1090,10 +1208,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: true, // Log batch + is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1103,8 +1224,8 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 1); assert_eq!(visitor.total_add_actions, 1); Ok(()) @@ -1121,10 +1242,13 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 2], - is_log_batch: false, // Checkpoint batch + is_log_batch: false, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ 
-1134,8 +1258,8 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) @@ -1152,11 +1276,13 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - - let mut visitor = CheckpointFileActionsVisitor { + let deduplicator = FileActionDeduplicator { seen_file_keys: &mut HashSet::new(), selection_vector: vec![false; 3], is_log_batch: true, + }; + let mut visitor = CheckpointFileActionsVisitor { + deduplicator, total_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp: 0, @@ -1165,8 +1291,8 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); + assert_eq!(visitor.deduplicator.selection_vector(), expected); assert_eq!(visitor.total_actions, 2); assert_eq!(visitor.total_add_actions, 2); Ok(()) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index dbcd056df..59e3e52c1 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,15 +3,15 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; +use crate::actions::visitors::FileActionDeduplicator; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; @@ -45,43 +45,28 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. 
For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new(seen, selection_vector, is_log_batch), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,28 +147,19 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + 0, // add_path_index + 5, // remove_path_index + 2, // add_dv_start_index + 6, // remove_dv_start_index + !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + )? + else { return Ok(false); }; - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
// @@ -203,8 +179,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +218,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +228,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +242,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +315,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 303444b5df466f697722bc85c4f23dd340d6faff Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 15:34:58 -0700 Subject: [PATCH 11/45] combine visitors --- kernel/src/actions/visitors.rs | 457 ++++++++++++++++++++------------- 1 file changed, 281 insertions(+), 176 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 9a04411e1..73eb25d93 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -484,38 +484,61 @@ impl RowVisitor for SidecarVisitor { Ok(()) } } - -/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds and -/// removes to be included in a checkpoint file. Log replay visits actions newest-first, so once -/// we've seen a file action for a given (path, dvId) pair, we should ignore all subsequent (older) -/// actions for that same (path, dvId) pair. If the first action for a given (path, dvId) is a remove -/// action, we should only include it if it is not expired (i.e., its deletion timestamp is greater -/// than the minimum file retention timestamp). -struct CheckpointFileActionsVisitor<'seen> { - deduplicator: FileActionDeduplicator<'seen>, - total_actions: usize, +/// A visitor that filters actions for inclusion in a checkpoint file. 
+/// +/// This visitor processes actions in newest-to-oldest order (as they appear in log +/// replay) and applies deduplication logic for both file and non-file actions. +/// +/// # File Action Filtering +/// - Keeps only the first occurrence of each unique (path, dvId) pair +/// - Excludes expired tombstone remove actions (where deletionTimestamp ≤ minimumFileRetentionTimestamp) +/// +/// # Non-File Action Filtering +/// - Keeps only the first protocol action +/// - Keeps only the first metadata action +/// - Keeps only the first transaction action for each unique app ID +/// +/// This filtered set of actions represents the minimal set needed to reconstruct +/// the latest valid state of the table. +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +pub(crate) struct CheckpointVisitor<'seen> { + // File actions deduplication state + file_deduplicator: FileActionDeduplicator<'seen>, + total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, + + // Non-file actions deduplication state + seen_protocol: bool, + seen_metadata: bool, + seen_txns: &'seen mut HashSet, + total_non_file_actions: usize, } -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointFileActionsVisitor<'_> { - /// Create a new CheckpointFileActionsVisitor - fn new( - seen_file_keys: &mut HashSet, +#[allow(unused)] +impl CheckpointVisitor<'_> { + /// Create a new CheckpointVisitor + fn new<'seen>( + seen_file_keys: &'seen mut HashSet, + seen_txns: &'seen mut HashSet, selection_vector: Vec, is_log_batch: bool, minimum_file_retention_timestamp: i64, - ) -> CheckpointFileActionsVisitor<'_> { - CheckpointFileActionsVisitor { - deduplicator: FileActionDeduplicator::new( + ) -> CheckpointVisitor<'seen> { + CheckpointVisitor { + file_deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, ), - total_actions: 0, + total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, + + seen_protocol: false, + seen_metadata: false, + seen_txns, + total_non_file_actions: 0, } } @@ -541,8 +564,8 @@ impl CheckpointFileActionsVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, getters, 0, // add_path_index + let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( + i, &getters, 0, // add_path_index 4, // remove_path_index 1, // add_dv_start_index 6, // remove_dv_start_index @@ -551,11 +574,12 @@ impl CheckpointFileActionsVisitor<'_> { else { return Ok(false); }; - if self.deduplicator.check_and_record_seen(file_key) { + + if self.file_deduplicator.check_and_record_seen(file_key) { return Ok(false); } - // Ignore expired tombstones. + // Ignore expired tombstones. The getter at the fifth index is the remove action's deletionTimestamp. if !is_add && self.is_expired_tombstone(i, getters[5])? { return Ok(false); } @@ -564,39 +588,98 @@ impl CheckpointFileActionsVisitor<'_> { self.total_add_actions += 1; } + self.total_file_actions += 1; Ok(true) } + + /// Returns true if the row contains a protocol action, and we haven't seen one yet. 
+ fn is_valid_protocol_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { + self.seen_protocol = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a metadata action, and we haven't seen one yet. + fn is_valid_metadata_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { + self.seen_metadata = true; + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } + + /// Returns true if the row contains a txn action with an appId that we haven't seen yet. + fn is_valid_txn_action<'a>( + &mut self, + i: usize, + getter: &'a dyn GetData<'a>, + ) -> DeltaResult { + let app_id = match getter.get_str(i, "txn.appId")? { + Some(id) => id, + None => return Ok(false), + }; + + // Attempting to insert the app_id into the set. If it's already present, the insert will + // return false, indicating that we've already seen this app_id. + if self.seen_txns.insert(app_id.to_string()) { + self.total_non_file_actions += 1; + Ok(true) + } else { + Ok(false) + } + } } -impl RowVisitor for CheckpointFileActionsVisitor<'_> { +impl RowVisitor for CheckpointVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // The data columns visited must be in the following order: // 1. ADD // 2. REMOVE - static CHECKPOINT_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (DataType::LONG, column_name!("remove.deletionTimestamp")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_FILE_ACTION_COLUMNS.as_ref() + // 3. METADATA + // 4. PROTOCOL + // 5. 
TXN + static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + // File action columns + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (DataType::LONG, column_name!("remove.deletionTimestamp")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + // Non-file action columns + (STRING, column_name!("metaData.id")), + (INTEGER, column_name!("protocol.minReaderVersion")), + (STRING, column_name!("txn.appId")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + NAMES_AND_TYPES.as_ref() } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 9, + getters.len() == 12, Error::InternalError(format!( "Wrong number of visitor getters: {}", getters.len() @@ -604,11 +687,17 @@ impl RowVisitor for CheckpointFileActionsVisitor<'_> { ); for i in 0..row_count { - let should_select = self.is_valid_file_action(i, getters)?; + // Check for non-file actions (metadata, protocol, txn) + let is_non_file_action = self.is_valid_metadata_action(i, getters[9])? + || self.is_valid_protocol_action(i, getters[10])? + || self.is_valid_txn_action(i, getters[11])?; - if should_select { - self.deduplicator.selection_vector[i] = true; - self.total_actions += 1; + // Check for file actions (add, remove) + let is_file_action = self.is_valid_file_action(i, getters)?; + + // Mark the row for selection if it's either a valid non-file or file action + if is_non_file_action || is_file_action { + self.file_deduplicator.selection_vector_mut()[i] = true; } } Ok(()) @@ -1140,100 +1229,105 @@ mod tests { } #[test] - fn test_parse_checkpoint_file_action_visitor() -> DeltaResult<()> { + fn test_checkpoint_visitor() -> DeltaResult<()> { let data = action_batch(); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 8], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, // No tombstones are expired - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 8], + true, + 0, // minimum_file_retention_timestamp (no expired tombstones) + ); visitor.visit_rows_of(data.as_ref())?; - let expected = vec![true, true, false, false, false, false, false, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + // Combined results from both file and non-file actions + // Row 0 is an add action + // Row 1 is a remove action + // Row 3 is a protocol action + // Row 4 is a metadata action + // Row 7 is a txn action + let expected = vec![true, true, false, true, true, false, false, true]; + + // Verify file action results + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 1); + + // Verify non-file action results + 
assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); + assert_eq!(visitor.seen_txns.len(), 1); + assert_eq!(visitor.total_non_file_actions, 3); + + assert_eq!(visitor.file_deduplicator.selection_vector, expected); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_boundary_cases_for_tombstone_expiration( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, // Missing timestamp defaults to 0 + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 4], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 100, // Threshold set to 100 - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 4], + true, + 100, // minimum_file_retention_timestamp (threshold set to 100) + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![false, false, true, false]; // Only "one_above_threshold" should be kept - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 4); // All are recorded as seen even if expired - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + // Only "one_above_threshold" should be kept + let expected = vec![false, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_log_batch() -> DeltaResult<()> - { + fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, // Duplicate path - ] + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: true, - }; - let mut visitor = 
CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 2], true, 0); visitor.visit_rows_of(batch.as_ref())?; // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 1); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 1); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_duplicate_file_actions_in_checkpoint_batch( - ) -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_file_actions_in_checkpoint_batch() -> DeltaResult<()> { + // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! + // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, // Duplicate path @@ -1242,31 +1336,29 @@ mod tests { .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 2], - is_log_batch: false, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 2], + false, // is_log_batch = false (checkpoint batch) + 0, + ); visitor.visit_rows_of(batch.as_ref())?; // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 0); // No tracking for checkpoint batches - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); Ok(()) } #[test] - fn test_checkpoint_file_action_visitor_with_deletion_vectors() -> DeltaResult<()> { + fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, // Same path but different DV @@ -1276,52 +1368,52 @@ mod tests { ] .into(); let batch = parse_json_batch(json_strings); - let deduplicator = FileActionDeduplicator { - seen_file_keys: &mut HashSet::new(), - selection_vector: vec![false; 3], - is_log_batch: true, - }; - let mut visitor = CheckpointFileActionsVisitor { - deduplicator, - total_actions: 0, - total_add_actions: 0, - minimum_file_retention_timestamp: 0, - }; + + let mut seen_file_keys = HashSet::new(); + let 
mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.seen_file_keys.len(), 2); - assert_eq!(visitor.deduplicator.selection_vector(), expected); - assert_eq!(visitor.total_actions, 2); + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); + assert_eq!(visitor.total_non_file_actions, 0); + Ok(()) } #[test] - fn test_parse_checkpoint_non_file_actions_visitor() -> DeltaResult<()> { - let data = action_batch(); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), - selection_vector: vec![false; 8], - total_actions: 0, - }; + fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { + let json_strings: StringArray = vec![ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); + let batch = parse_json_batch(json_strings); - visitor.visit_rows_of(data.as_ref())?; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = + CheckpointVisitor::new(&mut seen_file_keys, &mut seen_txns, vec![false; 3], true, 0); - let expected = vec![false, false, false, true, true, false, false, true]; - assert_eq!(visitor.selection_vector, expected); - assert!(visitor.seen_metadata); + visitor.visit_rows_of(batch.as_ref())?; + + let expected = vec![true, true, true]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); + assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + assert_eq!(visitor.total_non_file_actions, 3); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_already_seen_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, @@ -1330,54 +1422,67 @@ mod tests { let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 + let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); seen_txns.insert("app1".to_string()); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: true, // Already seen - seen_metadata: true, // Already seen - seen_txns: &mut seen_txns, - selection_vector: vec![false; 3], - total_actions: 0, - }; + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, // Pre-populated transaction + vec![false; 3], + true, + 0, + ); + + // Mark these as already seen + visitor.seen_protocol = true; + visitor.seen_metadata = true; visitor.visit_rows_of(batch.as_ref())?; // All 
actions should be skipped as they have already been seen - let expected = vec![false; 3]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); // Still only one transaction - assert_eq!(visitor.total_actions, 0); + let expected = vec![false, false, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.total_non_file_actions, 0); + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } #[test] - fn test_checkpoint_non_file_actions_visitor_duplicate_non_file_actions() -> DeltaResult<()> { + fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, ] .into(); let batch = parse_json_batch(json_strings); - let mut visitor = CheckpointNonFileActionsVisitor { - seen_protocol: false, - seen_metadata: false, - seen_txns: &mut HashSet::new(), // Empty set - selection_vector: vec![false; 6], - total_actions: 0, - }; + let mut seen_file_keys = HashSet::new(); + let mut seen_txns = HashSet::new(); + let mut visitor = CheckpointVisitor::new( + &mut seen_file_keys, + &mut seen_txns, + vec![false; 7], + true, // is_log_batch + 0, // minimum_file_retention_timestamp + ); visitor.visit_rows_of(batch.as_ref())?; - let expected = vec![true, false, true, false, true, false]; - assert_eq!(visitor.selection_vector, expected); - assert_eq!(visitor.seen_txns.len(), 1); - assert_eq!(visitor.total_actions, 3); + // First occurrence of each type should be included + let expected = vec![true, false, true, true, false, true, false]; + assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs + assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata + assert_eq!(visitor.total_file_actions, 0); + Ok(()) } } From 5dbc924b65eeb2d3a5b34f03059d1d03a9b80f6d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 
2025 17:11:15 -0700 Subject: [PATCH 12/45] fmt --- kernel/src/actions/visitors.rs | 115 ++++++++++++++++++++++----------- kernel/src/scan/log_replay.rs | 8 +-- 2 files changed, 79 insertions(+), 44 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 73eb25d93..a93aa71ec 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -503,7 +503,7 @@ impl RowVisitor for SidecarVisitor { #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state - file_deduplicator: FileActionDeduplicator<'seen>, + deduplicator: FileActionDeduplicator<'seen>, total_file_actions: usize, total_add_actions: usize, minimum_file_retention_timestamp: i64, @@ -526,7 +526,7 @@ impl CheckpointVisitor<'_> { minimum_file_retention_timestamp: i64, ) -> CheckpointVisitor<'seen> { CheckpointVisitor { - file_deduplicator: FileActionDeduplicator::new( + deduplicator: FileActionDeduplicator::new( seen_file_keys, selection_vector, is_log_batch, @@ -564,18 +564,18 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - let Some((file_key, is_add)) = self.file_deduplicator.extract_file_action( - i, &getters, 0, // add_path_index - 4, // remove_path_index - 1, // add_dv_start_index - 6, // remove_dv_start_index - false, // Never skip remove actions (even if we're processing a log batch) + // Extract file action key and determine if it's an add operation + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + // Do not skip remove actions (even if we're processing a log batch) + FileActionExtractConfig::new(0, 4, 1, 6, false), )? else { return Ok(false); }; - if self.file_deduplicator.check_and_record_seen(file_key) { + if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -697,13 +697,46 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.file_deduplicator.selection_vector_mut()[i] = true; + self.deduplicator.selection_vector_mut()[i] = true; } } Ok(()) } } +/// This struct contains indices and configuration options needed to +/// extract file actions from action batches in the Delta log. +pub(crate) struct FileActionExtractConfig { + /// Index of the getter containing the add.path column + pub add_path_index: usize, + /// Index of the getter containing the remove.path column + pub remove_path_index: usize, + /// Starting index for add action deletion vector columns + pub add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + pub remove_dv_start_index: usize, + /// Whether to skip remove actions when extracting file actions + pub skip_removes: bool, +} + +impl FileActionExtractConfig { + pub(crate) fn new( + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + skip_removes: bool, + ) -> Self { + Self { + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + skip_removes, + } + } +} + /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor /// and the ScanDataVisitor. 
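/// Both visitors in this series construct and drive it the same way; as a minimal sketch
/// (the variable names below are placeholders for fields each visitor already owns):
///
/// ```ignore
/// let mut dedup = FileActionDeduplicator::new(seen_file_keys, selection_vector, is_log_batch);
/// if !dedup.check_and_record_seen(file_key) {
///     // First time this (path, dvId) pair is seen: keep the action.
/// }
/// ```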
@@ -786,58 +819,64 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extract file action key and determine if it's an add operation + /// Extracts a file action key and determines if it's an add operation. + /// + /// This method examines the data at the given index using the provided getters and config + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `config` - Configuration specifying where to find add/remove operations + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( &self, i: usize, getters: &[&'a dyn GetData<'a>], - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, + config: FileActionExtractConfig, ) -> DeltaResult> { // Try to extract an add action path - if let Some(path) = getters[add_path_index].get_str(i, "add.path")? { + if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(add_dv_start_index), None)?; + self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - // In this case, we should skip remove actions. - if skip_removes { + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. + if config.skip_removes { return Ok(None); } // Try to extract a remove action path - if let Some(path) = getters[remove_path_index].get_str(i, "remove.path")? { + if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? { let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(remove_dv_start_index))?; + self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } - // If we didn't find an add or remove action, return None - return Ok(None); + // No file action found + Ok(None) } - /// Get the selection vector pub(crate) fn selection_vector(self) -> Vec { self.selection_vector } - /// Get reference to the selection vector pub(crate) fn selection_vector_ref(&self) -> &Vec { &self.selection_vector } - /// Get mutable reference to the selection vector pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { &mut self.selection_vector } - /// Get whether we are processing a log batch + /// Returns whether we are currently processing a log batch. 
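/// As an illustration of the pattern used by the scan visitor earlier in this series
/// (`dedup` is a placeholder for the visitor's deduplicator field):
///
/// ```ignore
/// let expected_getters = if dedup.is_log_batch() { 9 } else { 5 };
/// ```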
pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch } @@ -1261,7 +1300,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); Ok(()) } @@ -1291,7 +1330,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1317,7 +1356,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1350,7 +1389,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1377,7 +1416,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1402,7 +1441,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1442,7 +1481,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1478,7 +1517,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.file_deduplicator.selection_vector, expected); + assert_eq!(visitor.deduplicator.selection_vector, expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); // 2 txns + 1 protocol + 1 metadata assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 59e3e52c1..392b1511c 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::FileActionDeduplicator; +use 
crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -150,11 +150,7 @@ impl AddRemoveDedupVisitor<'_> { let Some((file_key, is_add)) = self.deduplicator.extract_file_action( i, getters, - 0, // add_path_index - 5, // remove_path_index - 2, // add_dv_start_index - 6, // remove_dv_start_index - !self.deduplicator.is_log_batch(), // skip_removes if it's a log batch + FileActionExtractConfig::new(0, 5, 2, 6, !self.deduplicator.is_log_batch()), )? else { return Ok(false); From b7939610ebf92dbbc9825437bac45b99a3b221d1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 17:39:26 -0700 Subject: [PATCH 13/45] remove old code --- kernel/src/actions/visitors.rs | 109 +-------------------------------- 1 file changed, 1 insertion(+), 108 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index a93aa71ec..c0feb93eb 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -739,7 +739,7 @@ impl FileActionExtractConfig { /// Core implementation for deduplicating file actions in Delta log replay /// This struct extracts the common functionality from the CheckpointVisitor -/// and the ScanDataVisitor. +/// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication @@ -882,113 +882,6 @@ impl<'seen> FileActionDeduplicator<'seen> { } } -/// A visitor that selects non-file actions for a checkpoint file. Since log replay visits actions -/// in newest-first order, we only keep the first occurrence of: -/// - a protocol action, -/// - a metadata action, -/// - a transaction (txn) action for a given app ID. -/// -/// Any subsequent (older) actions of the same type are ignored. This visitor tracks which actions -/// have been seen and includes only the first occurrence of each in the selection vector. -#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] -pub(crate) struct CheckpointNonFileActionsVisitor<'seen> { - // Non-file actions state - pub(crate) seen_protocol: bool, - pub(crate) seen_metadata: bool, - pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) selection_vector: Vec, - pub(crate) total_actions: usize, -} - -#[allow(unused)] // TODO: Remove flag once used for checkpoint writing -impl CheckpointNonFileActionsVisitor<'_> { - /// Returns true if the row contains a protocol action, and we haven’t seen one yet. - fn is_valid_protocol_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_int(i, "protocol.minReaderVersion")?.is_some() && !self.seen_protocol { - self.seen_protocol = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a metadata action, and we haven’t seen one yet. - fn is_valid_metadata_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - if getter.get_str(i, "metaData.id")?.is_some() && !self.seen_metadata { - self.seen_metadata = true; - Ok(true) - } else { - Ok(false) - } - } - - /// Returns true if the row contains a txn action with an appId that we haven’t seen yet. 
- fn is_valid_txn_action<'a>( - &mut self, - i: usize, - getter: &'a dyn GetData<'a>, - ) -> DeltaResult { - let app_id = match getter.get_str(i, "txn.appId")? { - Some(id) => id, - None => return Ok(false), - }; - - Ok(self.seen_txns.insert(app_id.to_string())) - } -} - -impl RowVisitor for CheckpointNonFileActionsVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: - // 1. METADATA - // 2. PROTOCOL - // 3. TXN - static CHECKPOINT_NON_FILE_ACTION_COLUMNS: LazyLock = - LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("metaData.id")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (STRING, column_name!("txn.appId")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - CHECKPOINT_NON_FILE_ACTION_COLUMNS.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - require!( - getters.len() == 3, - Error::InternalError(format!( - "Wrong number of visitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - let should_select = self.is_valid_metadata_action(i, getters[0])? - || self.is_valid_protocol_action(i, getters[1])? - || self.is_valid_txn_action(i, getters[2])?; - - if should_select { - self.selection_vector[i] = true; - self.total_actions += 1; - } - } - Ok(()) - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( From 508976ff35a8e10da28222c4a33030eba468965e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 25 Mar 2025 18:10:05 -0700 Subject: [PATCH 14/45] move FileActionKey --- kernel/src/actions/visitors.rs | 15 ++++++++++++++- kernel/src/scan/log_replay.rs | 16 +--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index c0feb93eb..037dfdd42 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -6,7 +6,6 @@ use std::sync::LazyLock; use tracing::debug; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::scan::log_replay::FileActionKey; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -704,6 +703,20 @@ impl RowVisitor for CheckpointVisitor<'_> { } } +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + /// This struct contains indices and configuration options needed to /// extract file actions from action batches in the Delta log. 
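/// As a point of reference, the two visitors in this series wire the indices up as follows
/// (an illustrative sketch; `is_log_batch` stands in for `self.deduplicator.is_log_batch()`):
///
/// ```ignore
/// // CheckpointVisitor: add.path at getter 0, remove.path at 4, add DV columns start at 1,
/// // remove DV columns start at 6, and removes are never skipped.
/// let checkpoint_cfg = FileActionExtractConfig::new(0, 4, 1, 6, false);
///
/// // AddRemoveDedupVisitor: add.path at 0, remove.path at 5, add DV columns start at 2,
/// // remove DV columns start at 6, and removes are skipped for non-log (checkpoint) batches.
/// let scan_cfg = FileActionExtractConfig::new(0, 5, 2, 6, !is_log_batch);
/// ```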
pub(crate) struct FileActionExtractConfig { diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 392b1511c..d3287eb5d 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; +use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig, FileActionKey}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; @@ -16,20 +16,6 @@ use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructFie use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { - pub(crate) path: String, - pub(crate) dv_unique_id: Option, -} -impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, From 0160ef151185de1f2c10c2e0a866ebafe3e2eabb Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:23:57 -0700 Subject: [PATCH 15/45] fix whitespace --- kernel/src/actions/visitors.rs | 84 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 828fc4387..4d93d6fd3 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1182,13 +1182,13 @@ mod tests { #[test] fn test_checkpoint_visitor_boundary_cases_for_tombstone_expiration() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, - r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, - // Missing timestamp defaults to 0 - r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"remove":{"path":"exactly_at_threshold","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_below_threshold","deletionTimestamp":99,"dataChange":true,"partitionValues":{}}}"#, + r#"{"remove":{"path":"one_above_threshold","deletionTimestamp":101,"dataChange":true,"partitionValues":{}}}"#, + // Missing timestamp defaults to 0 + r#"{"remove":{"path":"missing_timestamp","dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1217,11 +1217,11 @@ mod tests { #[test] fn test_checkpoint_visitor_conflicting_file_actions_in_log_batch() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - 
r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"remove":{"path":"file1","deletionTimestamp":100,"dataChange":true,"partitionValues":{}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1252,11 +1252,11 @@ mod tests { // Note: this is NOT a valid checkpoint batch since it contains duplicate file actions! // However, we should still be able to parse it without errors, and the duplicates should be included. let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - // Duplicate path - r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + // Duplicate path + r#"{"add":{"path":"file1","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998137,"dataChange":true}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1285,13 +1285,13 @@ mod tests { #[test] fn test_checkpoint_visitor_with_deletion_vectors() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Same path but different DV - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - // Duplicate of first entry - r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] - .into(); + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Same path but different DV + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"two","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + // Duplicate of first entry + r#"{"add":{"path":"file1","partitionValues":{},"size":635,"modificationTime":100,"dataChange":true,"deletionVector":{"storageType":"one","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1320,10 +1320,10 @@ mod tests { #[test] fn test_checkpoint_visitor_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - 
r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - ].into(); + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + ].into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); @@ -1354,10 +1354,10 @@ mod tests { #[test] fn test_checkpoint_visitor_already_seen_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ].into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ].into(); let batch = parse_json_batch(json_strings); // Pre-populate with txn app1 @@ -1389,16 +1389,16 @@ mod tests { #[test] fn test_checkpoint_visitor_duplicate_non_file_actions() -> DeltaResult<()> { let json_strings: StringArray = vec![ - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, - r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn - r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // Different app ID - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - // Duplicate metadata - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, - ] - .into(); + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, + r#"{"txn":{"appId":"app1","version":1,"lastUpdated":123456789}}"#, // Duplicate txn + r#"{"txn":{"appId":"app2","version":1,"lastUpdated":123456789}}"#, // 
Different app ID + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7}}"#, // Duplicate protocol + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + // Duplicate metadata + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1677811175819}}"#, + ] + .into(); let batch = parse_json_batch(json_strings); let mut seen_file_keys = HashSet::new(); From aae7046782e1b9c98f26d4dd9d38d05c6be78fb0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 01:28:27 -0700 Subject: [PATCH 16/45] remove old code --- kernel/src/log_replay.rs | 114 +-------------------------------------- 1 file changed, 1 insertion(+), 113 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index cfd4a10c0..e98dd6f03 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,6 @@ use std::collections::HashSet; -use tracing::debug; -use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::engine_data::{GetData, TypedGetData as _}; -use crate::{DeltaResult, EngineData, Error}; +use crate::{DeltaResult, EngineData}; #[derive(Debug, Hash, Eq, PartialEq)] /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -36,112 +33,3 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; } - -/// Base trait for visitors that process file actions during log replay -pub trait FileActionVisitor { - /// Get a reference to the set of seen file keys - fn seen_file_keys(&mut self) -> &mut HashSet; - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey, is_log_batch: bool) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys().contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, is_log_batch - ); - if is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen_file_keys().insert(key); - } - false - } - } - - /// Index in getters array for add.path - fn add_path_index(&self) -> usize; - - /// Index in getters array for remove.path - fn remove_path_index(&self) -> Option; - - /// Starting index for add action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn add_dv_start_index(&self) -> usize; - - /// Starting index for remove action's deletion vector getters - /// (Assumes 3 consecutive items: storageType, pathOrInlineDv, offset) - fn remove_dv_start_index(&self) -> Option; - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - is_add: bool, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = if is_add { - self.add_dv_start_index() - } else if let Some(idx) = self.remove_dv_start_index() { - idx - } else { - return Err(Error::GenericError { - source: "DV getters should exist".into(), - }); - }; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extract file action key and determine if it's an add operation - fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[self.add_path_index()].get_str(i, "add.path")? { - let dv_unique_id = self.extract_dv_unique_id(i, getters, true)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, true))); - } - - // The AddRemoveDedupVisitor does not include remove action getters when - // dealing with non-log batches (since they are not needed for deduplication). - let Some(remove_idx) = self.remove_path_index() else { - return Ok(None); - }; - - // Try to extract a remove action path - if let Some(path) = getters[remove_idx].get_str(i, "remove.path")? 
{ - let dv_unique_id = self.extract_dv_unique_id(i, getters, false)?; - let file_key = FileActionKey::new(path, dv_unique_id); - return Ok(Some((file_key, false))); - } - - // No path found, not a file action - Ok(None) - } -} From f5743709a48c2ebf4e2c1086cbb85d486daac31c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 11:25:28 -0700 Subject: [PATCH 17/45] refactor more --- kernel/src/checkpoints/log_replay.rs | 47 ++++++++++++++++-------- kernel/src/log_replay.rs | 55 ++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 11 ++---- kernel/src/scan/mod.rs | 9 +++++ 4 files changed, 99 insertions(+), 23 deletions(-) diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 23c1f50d7..dc64b766c 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -4,9 +4,23 @@ use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{ + apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, +}; use crate::{DeltaResult, EngineData}; +pub struct CheckpointData { + #[allow(unused)] + data: Box, + selection_vector: Vec, +} + +impl HasSelectionVector for CheckpointData { + fn has_selected_rows(&self) -> bool { + self.selection_vector.contains(&true) + } +} + /// `CheckpointLogReplayProcessor` is responsible for filtering actions during log /// replay to include only those that should be included in a V1 checkpoint. #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented @@ -36,7 +50,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = (Box, Vec); + type ProcessingResult = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -90,7 +104,10 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { self.seen_protocol = visitor.seen_protocol; self.seen_metadata = visitor.seen_metadata; - Ok((batch, visitor.deduplicator.selection_vector())) + Ok(CheckpointData { + data: batch, + selection_vector: visitor.deduplicator.selection_vector(), + }) } // Get a reference to the set of seen file keys @@ -132,20 +149,14 @@ pub(crate) fn checkpoint_actions_iter( total_actions_counter: Arc, total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, -) -> impl Iterator, Vec)>> + Send + 'static { +) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( total_actions_counter, total_add_actions_counter, minimum_file_retention_timestamp, ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - // Only yield batches that have at least one selected row - .filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] @@ -212,12 +223,18 @@ mod tests { assert_eq!(results.len(), 2); // First batch should have all rows selected - let (_, selection_vector1) = &results[0]; - assert_eq!(selection_vector1, &vec![true, true, true, true]); + let checkpoint_data = &results[0]; + assert_eq!( + checkpoint_data.selection_vector, + vec![true, true, true, true] + ); // Second batch should 
have only new file and transaction selected - let (_, selection_vector2) = &results[1]; - assert_eq!(selection_vector2, &vec![false, false, true, false, true]); + let checkpoint_data = &results[1]; + assert_eq!( + checkpoint_data.selection_vector, + vec![false, false, true, false, true] + ); // Verify counters // 6 total actions (4 from batch1 + 2 from batch2 + 0 from batch3) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e98dd6f03..852c3fe0d 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -32,4 +32,59 @@ pub trait LogReplayProcessor { // Get a reference to the set of seen file keys fn seen_file_keys(&mut self) -> &mut HashSet; + + // Create a selection vector of appropriate length with all elements set to the given value + fn create_selection_vector( + &self, + batch: &Box, + default_value: bool, + ) -> Vec { + let selection_vector = vec![default_value; batch.len()]; + assert_eq!( + selection_vector.len(), + batch.len(), + "Selection vector length does not match actions length" + ); + selection_vector + } + + // Filter an iterator to only include results with at least one selected item + fn filter_non_empty_results(iter: I) -> impl Iterator> + where + I: Iterator>, + T: HasSelectionVector, + { + iter.filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector +pub trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} + +/// Applies the given processor to the given iterator of action results, +/// and filters out batches with no selected rows. +/// +/// This function abstracts the common pattern used by both checkpoint and scan iterators. +pub fn apply_processor_to_iterator

( + mut processor: P, + action_iter: impl Iterator, bool)>>, +) -> impl Iterator> +where + P: LogReplayProcessor, +{ + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 6a0fa929a..42a88af4f 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -10,7 +10,7 @@ use crate::actions::get_log_add_schema; use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionKey, LogReplayProcessor}; +use crate::log_replay::{apply_processor_to_iterator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -350,15 +350,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = + let log_scanner = ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_batch(batch, is_log_batch) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + apply_processor_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index a8e5da899..92fd00cea 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,14 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + let (_, sv, _) = self; + sv.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. 
pub struct Scan { From a618833af203866fc356cde62131cf3a1572c61a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:10:24 -0700 Subject: [PATCH 18/45] refactor --- kernel/src/lib.rs | 1 + kernel/src/log_replay.rs | 176 ++++++++++++++++++++++++++++++++++ kernel/src/scan/log_replay.rs | 132 +++++++++++-------------- 3 files changed, 231 insertions(+), 78 deletions(-) create mode 100644 kernel/src/log_replay.rs diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 2e4698658..bb21bb0f9 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -77,6 +77,7 @@ pub mod actions; pub mod engine_data; pub mod error; pub mod expressions; +pub mod log_replay; pub mod scan; pub mod schema; pub mod snapshot; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs new file mode 100644 index 000000000..7650183a0 --- /dev/null +++ b/kernel/src/log_replay.rs @@ -0,0 +1,176 @@ +use crate::{ + actions::deletion_vector::DeletionVectorDescriptor, + engine_data::{GetData, TypedGetData}, + DeltaResult, +}; +use std::collections::HashSet; +use tracing::debug; + +/// The subset of file action fields that uniquely identifies it in the log, used for deduplication +/// of adds and removes during log replay. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) struct FileActionKey { + pub(crate) path: String, + pub(crate) dv_unique_id: Option, +} +impl FileActionKey { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +/// Core implementation for deduplicating file actions in Delta log replay +/// This struct extracts the common functionality from the incoming CheckpointVisitor +/// and the AddRemoveDedupVisitor. +pub(crate) struct FileActionDeduplicator<'seen> { + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log for deduplication + seen_file_keys: &'seen mut HashSet, + /// Selection vector to track which rows should be included + selection_vector: Vec, + /// Whether we're processing a log batch (as opposed to a checkpoint) + is_log_batch: bool, + /// Index of the getter containing the add.path column + add_path_index: usize, + /// Index of the getter containing the remove.path column + remove_path_index: usize, + /// Starting index for add action deletion vector columns + add_dv_start_index: usize, + /// Starting index for remove action deletion vector columns + remove_dv_start_index: usize, +} + +impl<'seen> FileActionDeduplicator<'seen> { + pub(crate) fn new( + seen_file_keys: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, + add_path_index: usize, + remove_path_index: usize, + add_dv_start_index: usize, + remove_dv_start_index: usize, + ) -> Self { + Self { + seen_file_keys, + selection_vector, + is_log_batch, + add_path_index, + remove_path_index, + add_dv_start_index, + remove_dv_start_index, + } + } + + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it + /// and should process it. + pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. 
For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen_file_keys.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen_file_keys.insert(key); + } + false + } + } + + /// Extract the deletion vector unique ID if it exists. + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + dv_start_index: usize, + ) -> DeltaResult> { + match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => { + let path_or_inline = + getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?; + let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?; + + Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + path_or_inline, + offset, + ))) + } + None => Ok(None), + } + } + + /// Extracts a file action key and determines if it's an add operation. + /// This method examines the data at the given index using the provided getters + /// to identify whether a file action exists and what type it is. + /// + /// # Arguments + /// + /// * `i` - Index position in the data structure to examine + /// * `getters` - Collection of data getter implementations used to access the data + /// * `skip_removes` - Whether to skip remove actions when extracting file actions + /// + /// # Returns + /// + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(None)` - When no file action is found + /// * `Err(...)` - On any error during extraction + pub(crate) fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + skip_removes: bool, + ) -> DeltaResult> { + // Try to extract an add action by the required path column + if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + } + + // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. + if skip_removes { + return Ok(None); + } + + // Try to extract a remove action by the required path column + if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? { + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + } + + // No file action found + Ok(None) + } + + pub(crate) fn selection_vector(self) -> Vec { + self.selection_vector + } + + pub(crate) fn selection_vector_ref(&self) -> &Vec { + &self.selection_vector + } + + pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { + &mut self.selection_vector + } + + /// Returns whether we are currently processing a log batch. 
+ pub(crate) fn is_log_batch(&self) -> bool { + self.is_log_batch + } +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 0e26b610f..a2d65f1b0 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -3,33 +3,19 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use itertools::Itertools; -use tracing::debug; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; -use crate::scan::{DeletionVectorDescriptor, Scalar, TransformExpr}; +use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -/// The subset of file action fields that uniquely identifies it in the log, used for deduplication -/// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, -} -impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - struct LogReplayScanner { partition_filter: Option, data_skipping_filter: Option, @@ -45,43 +31,43 @@ struct LogReplayScanner { /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, + deduplicator: FileActionDeduplicator<'seen>, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - is_log_batch: bool, } impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. 
- self.seen.insert(key); - } - false + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_PARTITION_VALUES_INDEX: usize = 1; + const ADD_DV_START_INDEX: usize = 2; + const REMOVE_PATH_INDEX: usize = 5; + const REMOVE_DV_START_INDEX: usize = 6; + + fn new( + seen: &mut HashSet, + selection_vector: Vec, + logical_schema: SchemaRef, + transform: Option>, + partition_filter: Option, + is_log_batch: bool, + ) -> AddRemoveDedupVisitor<'_> { + AddRemoveDedupVisitor { + deduplicator: FileActionDeduplicator::new( + seen, + selection_vector, + is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, + ), + logical_schema, + transform, + partition_filter, + row_transform_exprs: Vec::new(), } } @@ -162,26 +148,13 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[2..5], true) - } else if !self.is_log_batch { + // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do + // not try to extract remove actions in that case. + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + else { return Ok(false); - } else if let Some(path) = getters[5].get_opt(i, "remove.path")? { - (path, &getters[6..9], false) - } else { - return Ok(false); - }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, }; // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory @@ -192,7 +165,8 @@ impl AddRemoveDedupVisitor<'_> { // encounter if the table's schema was replaced after the most recent checkpoint. 
let partition_values = match &self.transform { Some(transform) if is_add => { - let partition_values = getters[1].get(i, "add.partitionValues")?; + let partition_values = + getters[Self::ADD_PARTITION_VALUES_INDEX].get(i, "add.partitionValues")?; let partition_values = self.parse_partition_values(transform, &partition_values)?; if self.is_file_partition_pruned(&partition_values) { return Ok(false); @@ -203,8 +177,7 @@ impl AddRemoveDedupVisitor<'_> { }; // Check both adds and removes (skipping already-seen), but only transform and return adds - let file_key = FileActionKey::new(path, dv_unique_id); - if self.check_and_record_seen(file_key) || !is_add { + if self.deduplicator.check_and_record_seen(file_key) || !is_add { return Ok(false); } let transform = self @@ -243,7 +216,7 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { (names, types).into() }); let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { + if self.deduplicator.is_log_batch() { (names, types) } else { // All checkpoint actions are already reconciled and Remove actions in checkpoint files @@ -253,7 +226,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 9 } else { 5 }; + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( @@ -263,8 +240,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; + if self.deduplicator.selection_vector_ref()[i] { + self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -336,19 +313,18 @@ impl LogReplayScanner { }; assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, + let mut visitor = AddRemoveDedupVisitor::new( + &mut self.seen, selection_vector, logical_schema, transform, - partition_filter: self.partition_filter.clone(), - row_transform_exprs: Vec::new(), + self.partition_filter.clone(), is_log_batch, - }; + ); visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.selection_vector; + let selection_vector = visitor.deduplicator.selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From 7da74b268f38672c54651749c27777d7293cdbc3 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 14:14:15 -0700 Subject: [PATCH 19/45] more docs --- kernel/src/log_replay.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 7650183a0..ef9004ae1 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -25,7 +25,8 @@ impl FileActionKey { /// and the AddRemoveDedupVisitor. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication + /// far in the log for deduplication. This is a mutable reference to the set + /// of seen file keys that persists across multiple log batches. 
seen_file_keys: &'seen mut HashSet, /// Selection vector to track which rows should be included selection_vector: Vec, From 220a216a2968531943a0773a87b4e2fc702d08fe Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:05:59 -0700 Subject: [PATCH 20/45] invert is_log_batch logic --- kernel/src/scan/log_replay.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index a2d65f1b0..b6bdc1570 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -152,7 +152,7 @@ impl AddRemoveDedupVisitor<'_> { // not try to extract remove actions in that case. let Some((file_key, is_add)) = self.deduplicator - .extract_file_action(i, getters, self.deduplicator.is_log_batch())? + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; From 9d86911fadb6a6aa6267a82cc4aec9c3949ec0da Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:14:46 -0700 Subject: [PATCH 21/45] docs --- kernel/src/log_replay.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index ef9004ae1..d6b175f28 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -20,9 +20,12 @@ impl FileActionKey { } } -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the incoming CheckpointVisitor -/// and the AddRemoveDedupVisitor. +/// Maintains state and provides functionality for deduplicating file actions during log replay. +/// +/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track +/// which files have been seen across multiple log batches. Since logs are processed +/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path +/// and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From e5b0e32056b8ea12060fd48cb18b2eb63f3e537f Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:16:42 -0700 Subject: [PATCH 22/45] docs --- kernel/src/log_replay.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d6b175f28..e400c27d1 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -22,10 +22,9 @@ impl FileActionKey { /// Maintains state and provides functionality for deduplicating file actions during log replay. /// -/// This struct is embedded in visitors AddRemoveDedupVisitor and CheckpointVisitor to track -/// which files have been seen across multiple log batches. Since logs are processed -/// newest-to-oldest, this deduplicator ensures that each unique file (identified by path -/// and deletion vector ID) is processed only once. +/// This struct is embedded in visitors to track which files have been seen across multiple +/// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each +/// unique file (identified by path and deletion vector ID) is processed only once. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. 
This is a mutable reference to the set From a5393dcc896d05071b2a704b82e6f47b93f07bcc Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 15:48:28 -0700 Subject: [PATCH 23/45] docs and imports --- kernel/src/log_replay.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e400c27d1..521b6e81e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,9 +1,10 @@ -use crate::{ - actions::deletion_vector::DeletionVectorDescriptor, - engine_data::{GetData, TypedGetData}, - DeltaResult, -}; +//! This module provides structures and functionality to faciliate the log replay process. use std::collections::HashSet; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::engine_data::{GetData, TypedGetData}; +use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -24,7 +25,9 @@ impl FileActionKey { /// /// This struct is embedded in visitors to track which files have been seen across multiple /// log batches. Since logs are processed newest-to-oldest, this deduplicator ensures that each -/// unique file (identified by path and deletion vector ID) is processed only once. +/// unique file (identified by path and deletion vector ID) is processed only once. Performing +/// deduplication at the visitor level avoids having to load all actions into memory at once, +/// significantly reducing memory usage for large Delta tables with extensive history. pub(crate) struct FileActionDeduplicator<'seen> { /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log for deduplication. This is a mutable reference to the set From a23c651ad7aa3a8399f836eac5e6113bec2aafde Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 26 Mar 2025 20:04:05 -0700 Subject: [PATCH 24/45] improve mod doc --- kernel/src/log_replay.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 521b6e81e..e5854ca31 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -1,4 +1,20 @@ -//! This module provides structures and functionality to faciliate the log replay process. +//! This module provides log replay utilities. +//! +//! Log replay is the process of transforming an iterator of action batches (read from Delta +//! transaction logs) into an iterator of filtered/transformed actions for specific use cases. +//! The logs, which record all table changes as JSON entries, are processed batch by batch, +//! typically from newest to oldest. +//! +//! Log replay can be implemented in various ways: +//! - For table scans: Deduplicate file actions to identify the current set of valid files +//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! +//! This module provides structures for efficient batch processing, focusing on file action +//! deduplication with `FileActionDeduplicator` which tracks unique files across log batches +//! to minimize memory usage for tables with extensive history. +//! +//! Future extensions will support additional log replay processors beyond the current use cases. 
+ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; From d712d181204d068e9346cc0e3e6ee582c95a80a7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:43:14 -0700 Subject: [PATCH 25/45] improve doc --- kernel/src/log_replay.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index e5854ca31..3b2a84692 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,15 +5,14 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay can be implemented in various ways: -//! - For table scans: Deduplicate file actions to identify the current set of valid files -//! - For checkpointing: Filter actions to include only those needed to rebuild table state +//! Log replay is currently implemented for table scans, which filter and apply transofmations +//! to file actions to produce a view of the table state at a specific point in time. +//! Future extensions will support additional log replay processors beyond the current use case. +//! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! //! This module provides structures for efficient batch processing, focusing on file action //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. -//! -//! Future extensions will support additional log replay processors beyond the current use cases. use std::collections::HashSet; From e564ae17ca5f6b17659a6ac05867af7df0681621 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 09:46:10 -0700 Subject: [PATCH 26/45] docs' --- kernel/src/log_replay.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 3b2a84692..d9a906525 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -5,8 +5,8 @@ //! The logs, which record all table changes as JSON entries, are processed batch by batch, //! typically from newest to oldest. //! -//! Log replay is currently implemented for table scans, which filter and apply transofmations -//! to file actions to produce a view of the table state at a specific point in time. +//! Log replay is currently implemented for table scans, which filter and apply transformations +//! to produce file actions which builds the view of the table state at a specific point in time. //! Future extensions will support additional log replay processors beyond the current use case. //! (e.g. checkpointing: filter actions to include only those needed to rebuild table state) //! 
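The module documentation refined in the patches above describes the deduplication rule that FileActionDeduplicator enforces: batches are replayed newest to oldest, the first occurrence of a (path, dv_unique_id) key wins, and only commit (log) batches record keys because checkpoint batches are already the oldest actions and never replace anything. As a minimal, self-contained illustration of that rule (FileKey and the batch shape here are simplified stand-ins, not the kernel's EngineData or visitor API):

use std::collections::HashSet;

/// Simplified stand-in for the kernel's FileActionKey: (path, optional dv unique id).
type FileKey = (String, Option<String>);

/// Batches arrive newest first. The first time a key is seen its row is selected; any
/// later (older) occurrence of the same key is dropped. Only commit (log) batches
/// record keys, mirroring the rule that checkpoint batches never replace anything.
fn select_rows(batches: &[(Vec<FileKey>, bool)]) -> Vec<Vec<bool>> {
    let mut seen: HashSet<FileKey> = HashSet::new();
    let mut selections = Vec::new();
    for (batch, is_log_batch) in batches {
        let sv = batch
            .iter()
            .map(|key| {
                if seen.contains(key) {
                    false
                } else {
                    if *is_log_batch {
                        seen.insert(key.clone());
                    }
                    true
                }
            })
            .collect::<Vec<bool>>();
        selections.push(sv);
    }
    selections
}

fn main() {
    let newer: (Vec<FileKey>, bool) = (vec![("a.parquet".to_string(), None)], true);
    let older: (Vec<FileKey>, bool) = (
        vec![
            ("a.parquet".to_string(), None),
            ("b.parquet".to_string(), None),
        ],
        true,
    );
    // "a.parquet" was already seen in the newer batch, so only "b.parquet" survives.
    assert_eq!(select_rows(&[newer, older]), vec![vec![true], vec![false, true]]);
}

The real visitors apply the same per-row rule while also extracting deletion vector ids from the action getters and honoring data skipping and partition pruning, which is why the deduplicator is embedded in the visitors rather than run as a separate pass over all actions.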
From b14ff195c0655cfecc29d6b666af6f72c4bcd29d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 13:42:06 -0700 Subject: [PATCH 27/45] docs --- kernel/src/log_replay.rs | 2 +- kernel/src/scan/log_replay.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index d9a906525..0064a701a 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -178,7 +178,7 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn selection_vector(self) -> Vec { + pub(crate) fn into_selection_vector(self) -> Vec { self.selection_vector } diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index b6bdc1570..77a985125 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -148,6 +148,10 @@ impl AddRemoveDedupVisitor<'_> { /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { + // When processing file actions, we extract path and deletion vector information based on action type: + // - For Add actions: path is at index 0, followed by DV fields at indexes 2-4 + // - For Remove actions (in log batches only): path is at index 5, followed by DV fields at indexes 6-8 + // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. let Some((file_key, is_add)) = @@ -324,7 +328,7 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let selection_vector = visitor.deduplicator.selection_vector(); + let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; Ok((result, selection_vector, visitor.row_transform_exprs)) } From a52d484be0741e5e9d3e72336a0e65d8b86a3298 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 14:22:33 -0700 Subject: [PATCH 28/45] update --- kernel/src/log_replay.rs | 16 ---------------- kernel/src/scan/log_replay.rs | 14 +++++++++----- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 0064a701a..39aa4ab6e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -48,8 +48,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { /// far in the log for deduplication. This is a mutable reference to the set /// of seen file keys that persists across multiple log batches. 
seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, /// Whether we're processing a log batch (as opposed to a checkpoint) is_log_batch: bool, /// Index of the getter containing the add.path column @@ -65,7 +63,6 @@ pub(crate) struct FileActionDeduplicator<'seen> { impl<'seen> FileActionDeduplicator<'seen> { pub(crate) fn new( seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, is_log_batch: bool, add_path_index: usize, remove_path_index: usize, @@ -74,7 +71,6 @@ impl<'seen> FileActionDeduplicator<'seen> { ) -> Self { Self { seen_file_keys, - selection_vector, is_log_batch, add_path_index, remove_path_index, @@ -178,18 +174,6 @@ impl<'seen> FileActionDeduplicator<'seen> { Ok(None) } - pub(crate) fn into_selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - /// Returns whether we are currently processing a log batch. pub(crate) fn is_log_batch(&self) -> bool { self.is_log_batch diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 77a985125..3c6c2e845 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -36,6 +36,7 @@ struct AddRemoveDedupVisitor<'seen> { transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -57,7 +58,6 @@ impl AddRemoveDedupVisitor<'_> { AddRemoveDedupVisitor { deduplicator: FileActionDeduplicator::new( seen, - selection_vector, is_log_batch, Self::ADD_PATH_INDEX, Self::REMOVE_PATH_INDEX, @@ -68,6 +68,7 @@ impl AddRemoveDedupVisitor<'_> { transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -244,8 +245,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -328,9 +329,12 @@ impl LogReplayScanner { visitor.visit_rows_of(actions)?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
- let selection_vector = visitor.deduplicator.into_selection_vector(); let result = add_transform.evaluate(actions)?; - Ok((result, selection_vector, visitor.row_transform_exprs)) + Ok(( + result, + visitor.selection_vector, + visitor.row_transform_exprs, + )) } } From a243a989af2d99bfa3ab07ddc78ec236ec0fbb54 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 16:59:37 -0700 Subject: [PATCH 29/45] nits --- kernel/src/log_replay.rs | 31 ++++++++++++++++++++++++------- kernel/src/scan/log_replay.rs | 23 ++++++++++++----------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e..12528d296 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; +use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; - use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,14 +28,31 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, + pub(crate) action_type: FileActionKeyType, } + impl FileActionKey { - pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { + pub(crate) fn new( + path: impl Into, + dv_unique_id: Option, + action_type: FileActionKeyType, + ) -> Self { let path = path.into(); - Self { path, dv_unique_id } + Self { + path, + dv_unique_id, + action_type, + } } } +// File actions are either add or remove actions. +#[derive(Debug, Hash, Eq, PartialEq)] +pub(crate) enum FileActionKeyType { + Add, + Remove, +} + /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -144,7 +161,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation + /// * `Ok(Some((key))` - When a file action is found, returns the key /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -152,11 +169,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -167,7 +184,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? 
{ let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); + return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e845..2042ab090 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,12 +155,15 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some(file_key) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? else { return Ok(false); }; + let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. @@ -231,11 +234,9 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; + require!( getters.len() == expected_getters, Error::InternalError(format!( From 9f06382993af7c30b164cf3b452880141adc4dc1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:05:14 -0700 Subject: [PATCH 30/45] Revert "nits" This reverts commit a243a989af2d99bfa3ab07ddc78ec236ec0fbb54. 
--- kernel/src/log_replay.rs | 31 +++++++------------------------ kernel/src/scan/log_replay.rs | 23 +++++++++++------------ 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 12528d296..39aa4ab6e 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,8 +18,8 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::log_replay::FileActionKeyType::{Add, Remove}; use crate::DeltaResult; + use tracing::debug; /// The subset of file action fields that uniquely identifies it in the log, used for deduplication @@ -28,31 +28,14 @@ use tracing::debug; pub(crate) struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, - pub(crate) action_type: FileActionKeyType, } - impl FileActionKey { - pub(crate) fn new( - path: impl Into, - dv_unique_id: Option, - action_type: FileActionKeyType, - ) -> Self { + pub(crate) fn new(path: impl Into, dv_unique_id: Option) -> Self { let path = path.into(); - Self { - path, - dv_unique_id, - action_type, - } + Self { path, dv_unique_id } } } -// File actions are either add or remove actions. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) enum FileActionKeyType { - Add, - Remove, -} - /// Maintains state and provides functionality for deduplicating file actions during log replay. /// /// This struct is embedded in visitors to track which files have been seen across multiple @@ -161,7 +144,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// # Returns /// - /// * `Ok(Some((key))` - When a file action is found, returns the key + /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation /// * `Ok(None)` - When no file action is found /// * `Err(...)` - On any error during extraction pub(crate) fn extract_file_action<'a>( @@ -169,11 +152,11 @@ impl<'seen> FileActionDeduplicator<'seen> { i: usize, getters: &[&'a dyn GetData<'a>], skip_removes: bool, - ) -> DeltaResult> { + ) -> DeltaResult> { // Try to extract an add action by the required path column if let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? { let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Add))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); } // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint batch. @@ -184,7 +167,7 @@ impl<'seen> FileActionDeduplicator<'seen> { // Try to extract a remove action by the required path column if let Some(path) = getters[self.remove_path_index].get_str(i, "remove.path")? 
{ let dv_unique_id = self.extract_dv_unique_id(i, getters, self.remove_dv_start_index)?; - return Ok(Some(FileActionKey::new(path, dv_unique_id, Remove))); + return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); } // No file action found diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 2042ab090..3c6c2e845 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey, FileActionKeyType}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, - selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, + selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), - selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), + selection_vector, } } @@ -155,15 +155,12 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some(file_key) = self.deduplicator.extract_file_action( - i, - getters, - !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch - )? + let Some((file_key, is_add)) = + self.deduplicator + .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? else { return Ok(false); }; - let is_add = matches!(file_key.action_type, FileActionKeyType::Add); // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
@@ -234,9 +231,11 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let is_log_batch = self.deduplicator.is_log_batch(); - let expected_getters = if is_log_batch { 9 } else { 5 }; - + let expected_getters = if self.deduplicator.is_log_batch() { + 9 + } else { + 5 + }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 58f38c0345179ad11300fad2197953ac4adc61e0 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 17:07:08 -0700 Subject: [PATCH 31/45] nits --- kernel/src/scan/log_replay.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 3c6c2e845..37e504405 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -32,11 +32,11 @@ struct LogReplayScanner { /// first action for a given file is a remove, then that file does not show up in the result at all. struct AddRemoveDedupVisitor<'seen> { deduplicator: FileActionDeduplicator<'seen>, + selection_vector: Vec, logical_schema: SchemaRef, transform: Option>, partition_filter: Option, row_transform_exprs: Vec>, - selection_vector: Vec, } impl AddRemoveDedupVisitor<'_> { @@ -64,11 +64,11 @@ impl AddRemoveDedupVisitor<'_> { Self::ADD_DV_START_INDEX, Self::REMOVE_DV_START_INDEX, ), + selection_vector, logical_schema, transform, partition_filter, row_transform_exprs: Vec::new(), - selection_vector, } } @@ -155,9 +155,11 @@ impl AddRemoveDedupVisitor<'_> { // The file extraction logic selects the appropriate indexes based on whether we found a valid path. // Remove getters are not included when visiting a non-log batch (checkpoint batch), so do // not try to extract remove actions in that case. - let Some((file_key, is_add)) = - self.deduplicator - .extract_file_action(i, getters, !self.deduplicator.is_log_batch())? + let Some((file_key, is_add)) = self.deduplicator.extract_file_action( + i, + getters, + !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch + )? 
else { return Ok(false); }; @@ -231,11 +233,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.deduplicator.is_log_batch() { - 9 - } else { - 5 - }; + let is_log_batch = self.deduplicator.is_log_batch(); + let expected_getters = if is_log_batch { 9 } else { 5 }; require!( getters.len() == expected_getters, Error::InternalError(format!( From 628546c45bd10fc4b16501bc47bd5693f5c1b9f8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:39:05 -0700 Subject: [PATCH 32/45] refactor --- kernel/src/log_replay.rs | 54 ++++++++++++++++++- kernel/src/scan/log_replay.rs | 97 +++++++++++++++++++---------------- kernel/src/scan/mod.rs | 8 +++ 3 files changed, 115 insertions(+), 44 deletions(-) diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 39aa4ab6e..3b7e87524 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -18,7 +18,7 @@ use std::collections::HashSet; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::engine_data::{GetData, TypedGetData}; -use crate::DeltaResult; +use crate::{DeltaResult, EngineData}; use tracing::debug; @@ -179,3 +179,55 @@ impl<'seen> FileActionDeduplicator<'seen> { self.is_log_batch } } + +/// Trait defining log replay processors which implement custom filtering and transformation +/// logic for processing action batches from transaction logs. They receive batches in reverse +/// chronological order (newest to oldest) and typically: +/// +/// 1. Create or maintain a selection vector to track which actions to include +/// 2. Track already-seen file actions to deduplicate across batches +/// 3. Apply specialized filtering based on processor type (scan, checkpoint, etc.) +/// +pub(crate) trait LogReplayProcessor { + /// The type of results produced by this processor + type Output; + + /// Process a batch of actions and return the filtered result + fn process_actions_batch( + &mut self, + batch: Box, + is_log_batch: bool, + ) -> DeltaResult; + + // Get a reference to the set of seen file keys + fn seen_file_keys(&mut self) -> &mut HashSet; + + /// Applies a processor to an action iterator and filters out empty results. + /// + /// This is an associated function rather than an instance method because the + /// returned iterator needs to own the processor. + fn apply_to_iterator( + processor: impl LogReplayProcessor, + action_iter: impl Iterator, bool)>>, + ) -> impl Iterator> + where + Self::Output: HasSelectionVector, + { + let mut processor = processor; + action_iter + .map(move |action_res| { + let (batch, is_log_batch) = action_res?; + processor.process_actions_batch(batch, is_log_batch) + }) + .filter(|res| { + res.as_ref() + .map_or(true, |result| result.has_selected_rows()) + }) + } +} + +/// Trait for types that contain a selection vector used in log replay filtering. 
+pub(crate) trait HasSelectionVector { + /// Check if the selection vector contains at least one selected row + fn has_selected_rows(&self) -> bool; +} diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 37e504405..223c668ec 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,21 +9,23 @@ use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::log_replay::{FileActionDeduplicator, FileActionKey}; +use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; use crate::predicates::{DefaultPredicateEvaluator, PredicateEvaluator as _}; use crate::scan::{Scalar, TransformExpr}; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionEvaluator}; -struct LogReplayScanner { +struct ScanLogReplayProcessor { partition_filter: Option, data_skipping_filter: Option, - + add_transform: Arc, + logical_schema: SchemaRef, + transform: Option>, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. - seen: HashSet, + seen_file_keys: HashSet, } /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log @@ -291,41 +293,37 @@ fn get_add_transform_expr() -> Expression { ]) } -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } +impl LogReplayProcessor for ScanLogReplayProcessor { + type Output = ScanData; - fn process_scan_batch( + fn process_actions_batch( &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - logical_schema: SchemaRef, - transform: Option>, + batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Apply data skipping to get back a selection vector for actions that passed skipping. We // will update the vector below as log replay identifies duplicates that should be ignored. let selection_vector = match &self.data_skipping_filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], + Some(filter) => filter.apply(batch.as_ref())?, + None => vec![true; batch.len()], }; - assert_eq!(selection_vector.len(), actions.len()); + assert_eq!(selection_vector.len(), batch.len()); + + let logical_schema = self.logical_schema.clone(); + let transform = self.transform.clone(); + let partition_filter = self.partition_filter.clone(); + let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( - &mut self.seen, + self.seen_file_keys(), selection_vector, logical_schema, transform, - self.partition_filter.clone(), + partition_filter, is_log_batch, ); - visitor.visit_rows_of(actions)?; + + visitor.visit_rows_of(batch.as_ref())?; // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
let result = add_transform.evaluate(actions)?; @@ -335,6 +333,33 @@ impl LogReplayScanner { visitor.row_transform_exprs, )) } + + fn seen_file_keys(&mut self) -> &mut HashSet { + &mut self.seen_file_keys + } +} + +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } } /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of @@ -348,24 +373,10 @@ pub(crate) fn scan_action_iter( transform: Option>, physical_predicate: Option<(ExpressionRef, SchemaRef)>, ) -> impl Iterator> { - let mut log_scanner = LogReplayScanner::new(engine, physical_predicate); - let add_transform = engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ); - action_iter - .map(move |action_res| { - let (batch, is_log_batch) = action_res?; - log_scanner.process_scan_batch( - add_transform.as_ref(), - batch.as_ref(), - logical_schema.clone(), - transform.clone(), - is_log_batch, - ) - }) - .filter(|res| res.as_ref().map_or(true, |(_, sv, _)| sv.contains(&true))) + let log_scanner = + ScanLogReplayProcessor::new(engine, physical_predicate, logical_schema, transform); + + ScanLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0372bfd25..0b419f9a3 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -13,6 +13,7 @@ use crate::actions::deletion_vector::{ }; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar}; +use crate::log_replay::HasSelectionVector; use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver}; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{ @@ -324,6 +325,13 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); +// Implementation for the scan result type +impl HasSelectionVector for ScanData { + fn has_selected_rows(&self) -> bool { + self.1.contains(&true) + } +} + /// The result of building a scan over a table. This can be used to get the actual data from /// scanning the table. 
pub struct Scan { From 88cf9831c9ca486df0363213f00fee45cf47727e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 11:49:51 -0700 Subject: [PATCH 33/45] move --- kernel/src/scan/log_replay.rs | 46 +++++++++++++++++------------------ kernel/src/scan/mod.rs | 1 - 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 223c668ec..76459e892 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -28,6 +28,29 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } +impl ScanLogReplayProcessor { + /// Create a new [`ScanLogReplayProcessor`] instance + fn new( + engine: &dyn Engine, + physical_predicate: Option<(ExpressionRef, SchemaRef)>, + logical_schema: SchemaRef, + transform: Option>, + ) -> Self { + Self { + partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), + data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), + add_transform: engine.get_expression_handler().get_evaluator( + get_log_add_schema().clone(), + get_add_transform_expr(), + SCAN_ROW_DATATYPE.clone(), + ), + seen_file_keys: Default::default(), + logical_schema, + transform, + } + } +} + /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -339,29 +362,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { } } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// Given an iterator of (engine_data, bool) tuples and a predicate, returns an iterator of /// `(engine_data, selection_vec)`. Each row that is selected in the returned `engine_data` _must_ /// be processed to complete the scan. Non-selected rows _must_ be ignored. 
The boolean flag diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0b419f9a3..7fd1f9ea9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -325,7 +325,6 @@ pub(crate) enum TransformExpr { // (data, deletion_vec, transforms) pub type ScanData = (Box, Vec, Vec>); -// Implementation for the scan result type impl HasSelectionVector for ScanData { fn has_selected_rows(&self) -> bool { self.1.contains(&true) From 10bb7b56a65ee7f705ce1dbaa74826fcda0f092a Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 27 Mar 2025 23:05:03 -0700 Subject: [PATCH 34/45] fix rebase --- kernel/src/scan/log_replay.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 76459e892..896b18b8b 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -335,6 +335,7 @@ impl LogReplayProcessor for ScanLogReplayProcessor { let logical_schema = self.logical_schema.clone(); let transform = self.transform.clone(); let partition_filter = self.partition_filter.clone(); + // TODO: Teach expression eval to respect the selection vector we just computed so carefully! let result = self.add_transform.evaluate(batch.as_ref())?; let mut visitor = AddRemoveDedupVisitor::new( @@ -347,9 +348,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { ); visitor.visit_rows_of(batch.as_ref())?; - - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = add_transform.evaluate(actions)?; Ok(( result, visitor.selection_vector, From abc7e1fe4573d924372502ad26f1b5dcef4e5007 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 28 Mar 2025 16:14:53 -0700 Subject: [PATCH 35/45] merge fixes --- kernel/src/actions/visitors.rs | 228 ++++----------------------- kernel/src/checkpoints/log_replay.rs | 34 ++-- kernel/src/scan/log_replay.rs | 32 +--- 3 files changed, 45 insertions(+), 249 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 4d93d6fd3..aa7f14614 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -4,10 +4,8 @@ use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use tracing::debug; - use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::log_replay::FileActionKey; +use crate::log_replay::{FileActionDeduplicator, FileActionKey}; use crate::schema::{column_name, ColumnName, ColumnNamesAndTypes, DataType}; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -505,19 +503,25 @@ impl RowVisitor for SidecarVisitor { pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state pub(crate) deduplicator: FileActionDeduplicator<'seen>, - pub(crate) total_file_actions: usize, - pub(crate) total_add_actions: usize, + pub(crate) selection_vector: Vec, + pub(crate) total_file_actions: u64, + pub(crate) total_add_actions: u64, pub(crate) minimum_file_retention_timestamp: i64, // Non-file actions deduplication state pub(crate) seen_protocol: bool, pub(crate) seen_metadata: bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) total_non_file_actions: usize, + pub(crate) total_non_file_actions: u64, } #[allow(unused)] impl CheckpointVisitor<'_> { + // The index position in the row getters for the following columns + const ADD_PATH_INDEX: usize = 0; + const ADD_DV_START_INDEX: usize = 1; + const REMOVE_PATH_INDEX: usize = 4; + const REMOVE_DV_START_INDEX: usize = 6; /// Create a new CheckpointVisitor pub(crate) fn 
new<'seen>( seen_file_keys: &'seen mut HashSet, @@ -531,9 +535,13 @@ impl CheckpointVisitor<'_> { CheckpointVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, - selection_vector, is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, ), + selection_vector, total_file_actions: 0, total_add_actions: 0, minimum_file_retention_timestamp, @@ -567,17 +575,13 @@ impl CheckpointVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Extract file action key and determine if it's an add operation - let Some((file_key, is_add)) = self.deduplicator.extract_file_action( - i, - getters, - // Do not skip remove actions (even if we're processing a log batch) - FileActionExtractConfig::new(0, 4, 1, 6, false), - )? + // Never skip remove actions, as they may be unexpired tombstones. + let Some((file_key, is_add)) = self.deduplicator.extract_file_action(i, getters, false)? else { return Ok(false); }; + // Check if we've already seen this file action if self.deduplicator.check_and_record_seen(file_key) { return Ok(false); } @@ -700,191 +704,13 @@ impl RowVisitor for CheckpointVisitor<'_> { // Mark the row for selection if it's either a valid non-file or file action if is_non_file_action || is_file_action { - self.deduplicator.selection_vector_mut()[i] = true; + self.selection_vector[i] = true; } } Ok(()) } } -/// This struct contains indices and configuration options needed to -/// extract file actions from action batches in the Delta log. -pub(crate) struct FileActionExtractConfig { - /// Index of the getter containing the add.path column - pub add_path_index: usize, - /// Index of the getter containing the remove.path column - pub remove_path_index: usize, - /// Starting index for add action deletion vector columns - pub add_dv_start_index: usize, - /// Starting index for remove action deletion vector columns - pub remove_dv_start_index: usize, - /// Whether to skip remove actions when extracting file actions - pub skip_removes: bool, -} - -impl FileActionExtractConfig { - pub(crate) fn new( - add_path_index: usize, - remove_path_index: usize, - add_dv_start_index: usize, - remove_dv_start_index: usize, - skip_removes: bool, - ) -> Self { - Self { - add_path_index, - remove_path_index, - add_dv_start_index, - remove_dv_start_index, - skip_removes, - } - } -} - -/// Core implementation for deduplicating file actions in Delta log replay -/// This struct extracts the common functionality from the CheckpointVisitor -/// and the AddRemoveDedupVisitor. -pub(crate) struct FileActionDeduplicator<'seen> { - /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log for deduplication - seen_file_keys: &'seen mut HashSet, - /// Selection vector to track which rows should be included - selection_vector: Vec, - /// Whether we're processing a log batch (as opposed to a checkpoint) - is_log_batch: bool, -} - -impl<'seen> FileActionDeduplicator<'seen> { - pub(crate) fn new( - seen_file_keys: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, - ) -> Self { - Self { - seen_file_keys, - selection_vector, - is_log_batch, - } - } - - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it - /// and should process it. 
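A standalone sketch of the dedup rule described in the comment above (illustration only, not part of this patch): the decision reduces to a set-membership check keyed on the (path, dv_unique_id) pair, and new keys are only remembered for commit (log) batches, since checkpoint batches are already the oldest actions and can never supersede anything.

use std::collections::HashSet;

/// Hypothetical stand-in for FileActionKey: (path, dv_unique_id).
type FileKey = (String, Option<String>);

/// Returns true if this file action was already seen and should be ignored.
fn check_and_record_seen_sketch(
    seen: &mut HashSet<FileKey>,
    key: FileKey,
    is_log_batch: bool,
) -> bool {
    if seen.contains(&key) {
        return true; // a newer action for this file already won
    }
    if is_log_batch {
        // Only commit batches can be superseded by older batches we have yet to visit.
        seen.insert(key);
    }
    false
}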
- pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen_file_keys.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen_file_keys.insert(key); - } - false - } - } - - /// Extract deletion vector unique ID - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - add_dv_start_index: Option, - remove_dv_start_index: Option, - ) -> DeltaResult> { - // Get the starting index based on action type - let start_idx = add_dv_start_index - .or(remove_dv_start_index) - .ok_or_else(|| Error::GenericError { - source: "starting indices for add/remove DVs should have been passed".into(), - })?; - - // Extract the DV unique ID - match getters[start_idx].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - getters[start_idx + 1].get(i, "deletionVector.pathOrInlineDv")?, - getters[start_idx + 2].get_opt(i, "deletionVector.offset")?, - ))), - None => Ok(None), - } - } - - /// Extracts a file action key and determines if it's an add operation. - /// - /// This method examines the data at the given index using the provided getters and config - /// to identify whether a file action exists and what type it is. - /// - /// # Arguments - /// - /// * `i` - Index position in the data structure to examine - /// * `getters` - Collection of data getter implementations used to access the data - /// * `config` - Configuration specifying where to find add/remove operations - /// - /// # Returns - /// - /// * `Ok(Some((key, is_add)))` - When a file action is found, returns the key and whether it's an add operation - /// * `Ok(None)` - When no file action is found - /// * `Err(...)` - On any error during extraction - pub(crate) fn extract_file_action<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - config: FileActionExtractConfig, - ) -> DeltaResult> { - // Try to extract an add action path - if let Some(path) = getters[config.add_path_index].get_str(i, "add.path")? { - let dv_unique_id = - self.extract_dv_unique_id(i, getters, Some(config.add_dv_start_index), None)?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), true))); - } - - // The AddRemoveDedupVisitor skips remove actions when extracting file actions from a checkpoint file. - if config.skip_removes { - return Ok(None); - } - - // Try to extract a remove action path - if let Some(path) = getters[config.remove_path_index].get_str(i, "remove.path")? 
{ - let dv_unique_id = - self.extract_dv_unique_id(i, getters, None, Some(config.remove_dv_start_index))?; - return Ok(Some((FileActionKey::new(path, dv_unique_id), false))); - } - - // No file action found - Ok(None) - } - - pub(crate) fn selection_vector(self) -> Vec { - self.selection_vector - } - - pub(crate) fn selection_vector_ref(&self) -> &Vec { - &self.selection_vector - } - - pub(crate) fn selection_vector_mut(&mut self) -> &mut Vec { - &mut self.selection_vector - } - - /// Returns whether we are currently processing a log batch. - pub(crate) fn is_log_batch(&self) -> bool { - self.is_log_batch - } -} - /// Get a DV out of some engine data. The caller is responsible for slicing the `getters` slice such /// that the first element contains the `storageType` element of the deletion vector. pub(crate) fn visit_deletion_vector_at<'a>( @@ -1175,7 +1001,7 @@ mod tests { assert_eq!(visitor.seen_txns.len(), 1); assert_eq!(visitor.total_non_file_actions, 3); - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); Ok(()) } @@ -1207,7 +1033,7 @@ mod tests { // Only "one_above_threshold" should be kept let expected = vec![false, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 0); assert_eq!(visitor.total_non_file_actions, 0); @@ -1240,7 +1066,7 @@ mod tests { // First one should be included, second one skipped as a duplicate let expected = vec![true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 1); assert_eq!(visitor.total_add_actions, 1); assert_eq!(visitor.total_non_file_actions, 0); @@ -1275,7 +1101,7 @@ mod tests { // Both should be included since we don't track duplicates in checkpoint batches let expected = vec![true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1309,7 +1135,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, false]; // Third one is a duplicate - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_file_actions, 2); assert_eq!(visitor.total_add_actions, 2); assert_eq!(visitor.total_non_file_actions, 0); @@ -1341,7 +1167,7 @@ mod tests { visitor.visit_rows_of(batch.as_ref())?; let expected = vec![true, true, true]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert!(visitor.seen_protocol); assert!(visitor.seen_metadata); assert_eq!(visitor.seen_txns.len(), 1); @@ -1379,7 +1205,7 @@ mod tests { // All actions should be skipped as they have already been seen let expected = vec![false, false, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, expected); assert_eq!(visitor.total_non_file_actions, 0); assert_eq!(visitor.total_file_actions, 0); @@ -1417,7 +1243,7 @@ mod tests { // First occurrence of each type should be included let expected = vec![true, false, true, true, false, true, false]; - assert_eq!(visitor.deduplicator.selection_vector, expected); + assert_eq!(visitor.selection_vector, 
expected); assert_eq!(visitor.seen_txns.len(), 2); // Two different app IDs assert_eq!(visitor.total_non_file_actions, 4); assert_eq!(visitor.total_file_actions, 0); diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index dc64b766c..0a31bffc1 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,12 +1,10 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; use crate::engine_data::RowVisitor; -use crate::log_replay::{ - apply_processor_to_iterator, FileActionKey, HasSelectionVector, LogReplayProcessor, -}; +use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; pub struct CheckpointData { @@ -30,10 +28,10 @@ struct CheckpointLogReplayProcessor { seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: Arc, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. seen_protocol: bool, @@ -50,7 +48,7 @@ struct CheckpointLogReplayProcessor { impl LogReplayProcessor for CheckpointLogReplayProcessor { // Define the processing result type as a tuple of the data and selection vector - type ProcessingResult = CheckpointData; + type Output = CheckpointData; /// This function processes batches of actions in reverse chronological order /// (from most recent to least recent) and performs the necessary filtering @@ -65,11 +63,11 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { /// 2. For each app ID, only the most recent transaction action is included /// 3. File actions are deduplicated based on path and unique ID /// 4. 
Tombstones older than `minimum_file_retention_timestamp` are excluded - fn process_batch( + fn process_actions_batch( &mut self, batch: Box, is_log_batch: bool, - ) -> DeltaResult { + ) -> DeltaResult { // Initialize selection vector with all rows un-selected let selection_vector = vec![false; batch.len()]; assert_eq!( @@ -106,7 +104,7 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { Ok(CheckpointData { data: batch, - selection_vector: visitor.deduplicator.selection_vector(), + selection_vector: visitor.selection_vector, }) } @@ -119,8 +117,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -146,8 +144,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -156,12 +154,12 @@ pub(crate) fn checkpoint_actions_iter( minimum_file_retention_timestamp, ); - apply_processor_to_iterator(log_scanner, action_iter) + CheckpointLogReplayProcessor::apply_to_iterator(log_scanner, action_iter) } #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -175,8 +173,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicUsize::new(0)); - let total_add_actions_counter = Arc::new(AtomicUsize::new(0)); + let total_actions_counter = Arc::new(AtomicU64::new(0)); + let total_add_actions_counter = Arc::new(AtomicU64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 7782b1857..0959d9cf8 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -7,7 +7,6 @@ use itertools::Itertools; use super::data_skipping::DataSkippingFilter; use super::{ScanData, Transform}; use crate::actions::get_log_add_schema; -use crate::actions::visitors::{FileActionDeduplicator, FileActionExtractConfig}; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; use crate::log_replay::{FileActionDeduplicator, FileActionKey, LogReplayProcessor}; @@ -29,29 +28,6 @@ struct ScanLogReplayProcessor { seen_file_keys: HashSet, } -impl ScanLogReplayProcessor { - /// Create a new [`ScanLogReplayProcessor`] instance - fn new( - engine: &dyn Engine, - physical_predicate: Option<(ExpressionRef, SchemaRef)>, - logical_schema: SchemaRef, - transform: Option>, - ) -> Self { - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.get_expression_handler().get_evaluator( - 
get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), - logical_schema, - transform, - } - } -} - /// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the @@ -270,8 +246,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { ); for i in 0..row_count { - if self.deduplicator.selection_vector_ref()[i] { - self.deduplicator.selection_vector_mut()[i] = self.is_valid_add(i, getters)?; + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; } } Ok(()) @@ -359,10 +335,6 @@ impl LogReplayProcessor for ScanLogReplayProcessor { fn seen_file_keys(&mut self) -> &mut HashSet { &mut self.seen_file_keys } - - fn seen_file_keys(&mut self) -> &mut HashSet { - &mut self.seen_file_keys - } } impl ScanLogReplayProcessor { From 7fbfe29fb19698694f193212d9415915d6cd8a06 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 28 Mar 2025 20:19:47 -0700 Subject: [PATCH 36/45] mvp --- kernel/src/checkpoints/log_replay.rs | 5 +- kernel/src/checkpoints/mod.rs | 318 +++++++++++++++++++++++++++ kernel/src/checkpoints/tests.rs | 183 +++++++++++++++ kernel/src/path.rs | 78 +++++++ kernel/src/table.rs | 15 ++ kernel/src/table_configuration.rs | 19 ++ 6 files changed, 615 insertions(+), 3 deletions(-) create mode 100644 kernel/src/checkpoints/tests.rs diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index 0a31bffc1..cbad63305 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -8,9 +8,8 @@ use crate::log_replay::{FileActionKey, HasSelectionVector, LogReplayProcessor}; use crate::{DeltaResult, EngineData}; pub struct CheckpointData { - #[allow(unused)] - data: Box, - selection_vector: Vec, + pub data: Box, + pub selection_vector: Vec, } impl HasSelectionVector for CheckpointData { diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 826ff771f..f5a764779 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -1 +1,319 @@ +//! # Delta Kernel Checkpoint API +//! +//! This module provides functionality for writing single-file checkpoints in Delta tables. +//! +//! 1. Single-file Classic-named V1 Checkpoint - For legacy tables without v2Checkpoints feature +//! 2. Single-file Classic-named V2 Checkpoint - For backwards compatibility with v2Checkpoints feature +//! 3. Single-file UUID-named V2 Checkpoint - Recommended for small to medium tables with v2Checkpoints feature +//! +//! The API is designed with a builder pattern for configuring and creating checkpoint writers. +//! +//! # Example +//! ``` +//! let path = "./tests/data/app-txn-no-checkpoint"; +//! let engine = Arc::new(SyncEngine::new()); +//! let table = Table::try_from_uri(path)?; +//! // Create a checkpoint builder for the table at a specific version +//! let builder = table.checkpoint(&engine, Some(2))?; +//! // Configure the builder (optional) +//! let writer = builder.with_classic_naming(true); +//! // Build the checkpoint writer +//! let writer = builder.build(&engine)?; +//! // Get the checkpoint data and path +//! let checkpoint_data = writer.get_checkpoint_info()?; +//! /* Engine writes data to file path and collects metadata: (path, bytes, timestamp) */ +//! 
/* All checkpoint data must be written before calling .finalize_checkpoint() */ +//! writer.finalize_checkpoint()?; +//! ``` +use log_replay::{checkpoint_actions_iter, CheckpointData}; +use std::{ + sync::{atomic::AtomicU64, Arc, LazyLock}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; +use url::Url; + +use crate::{ + actions::{ + Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, + PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + }, + path::ParsedLogPath, + snapshot::Snapshot, + DeltaResult, Engine, EngineData, Error, +}; + +use crate::actions::schemas::GetStructField; +use crate::schema::{SchemaRef, StructType}; pub mod log_replay; +#[cfg(test)] +mod tests; + +/// Read schema definition for collecting checkpoint actions +static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { + StructType::new([ + Option::::get_struct_field(ADD_NAME), + Option::::get_struct_field(REMOVE_NAME), + Option::::get_struct_field(METADATA_NAME), + Option::::get_struct_field(PROTOCOL_NAME), + Option::::get_struct_field(SET_TRANSACTION_NAME), + Option::::get_struct_field(SIDECAR_NAME), + ]) + .into() +}); + +/// Returns the read schema to collect checkpoint actions +#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] +#[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] +fn get_checkpoint_read_schema() -> &'static SchemaRef { + &CHECKPOINT_READ_SCHEMA +} + +/// Contains the path and data for a single-file checkpoint. +/// +/// This struct holds all the necessary information for writing a checkpoint file, +/// including the destination path and the iterator over checkpoint actions. +pub struct SingleFileCheckpointData { + /// The target URL where the checkpoint file will be written + pub path: Url, + + /// Iterator over checkpoint actions to be written to the file + pub data: Box>>, +} + +/// Writer for creating checkpoint files in Delta tables. +/// +/// The CheckpointWriter orchestrates the process of writing checkpoint data to storage. +/// It manages the one-time consumption of checkpoint data and tracks statistics +/// about the actions included in the checkpoint. +pub struct CheckpointWriter { + /// Using Option to enforce single consumption at compile time + single_file_checkpoint_data: Option, + + /// Counter for the total number of actions in the checkpoint + total_actions_counter: Arc, + + /// Counter for add file actions specifically + total_add_actions_counter: Arc, +} + +impl CheckpointWriter { + /// Creates a new CheckpointWriter with the provided checkpoint data and counters + fn new( + single_file_checkpoint_data: Option, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + ) -> Self { + Self { + single_file_checkpoint_data, + total_actions_counter, + total_add_actions_counter, + } + } + + /// Retrieves the checkpoint data and path information + /// + /// This method takes ownership of the checkpoint data, ensuring it can + /// only be consumed once. It returns an error if the data has already + /// been consumed. + pub fn get_checkpoint_info(&mut self) -> DeltaResult { + self.single_file_checkpoint_data + .take() + .ok_or_else(|| Error::generic("Checkpoint data already consumed")) + } + + /// Finalizes the checkpoint writing process + /// + /// This method should be only called AFTER writing all checkpoint data to + /// ensure proper completion of the checkpoint operation, which includes + /// writing the _last_checkpoint file. 
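A usage sketch of the writer (illustration only, not part of this patch; the parquet write step is elided because it belongs to the engine, and the module's types are assumed to be in scope): the data is fetched exactly once, each selected row is persisted to the returned path, and only then is the checkpoint finalized.

fn write_single_file_checkpoint(mut writer: CheckpointWriter) -> DeltaResult<()> {
    // Take the data and target path; a second call would fail because the Option
    // holding the data has already been consumed.
    let info = writer.get_checkpoint_info()?;
    for batch in info.data {
        let _batch = batch?;
        // The engine writes only the rows flagged in `_batch.selection_vector` to
        // `info.path` using its own parquet writer (elided here).
    }
    // Finalize only once all checkpoint data has been written.
    writer.finalize_checkpoint()
}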
+ pub fn finalize_checkpoint(self) -> DeltaResult<()> { + Ok(()) + } +} + +/// Builder for configuring and creating CheckpointWriter instances +/// +/// The CheckpointBuilder provides an interface for configuring checkpoint +/// generation. It handles table feature detection and enforces compatibility +/// between configuration options and table features. +pub struct CheckpointBuilder { + /// The table snapshot from which to create the checkpoint + snapshot: Snapshot, + + /// Whether to use classic naming for the checkpoint file + with_classic_naming: bool, +} + +impl CheckpointBuilder { + /// Creates a new CheckpointBuilder with the given snapshot + pub(crate) fn new(snapshot: Snapshot) -> Self { + Self { + snapshot, + with_classic_naming: false, + } + } + + /// Configures the builder to use classic naming scheme + /// + /// Classic naming is required for V1 checkpoints and optional for V2 checkpoints. + /// For V2 checkpoints, the default is UUID naming unless this method is called. + pub fn with_classic_naming(mut self, with_classic_naming: bool) -> Self { + self.with_classic_naming = with_classic_naming; + self + } + + /// Builds a CheckpointWriter based on the configuration + /// + /// This method validates the configuration against table features and creates + /// a CheckpointWriter for the appropriate checkpoint type. It performs protocol + /// table feature checks to determine if v2Checkpoints are supported. + /// + /// # Arguments + /// * `engine` - The engine implementation for data operations + /// + /// # Returns + /// * `DeltaResult` - A configured checkpoint writer on success, + /// or an error if the configuration is incompatible with table features + pub fn build(self, engine: &dyn Engine) -> DeltaResult { + let v2_checkpoints_supported = self + .snapshot + .table_configuration() + .is_v2_checkpoint_supported(); + + let deleted_file_retention_timestamp = self.deleted_file_retention_timestamp()?; + + // Create counters for tracking actions + let total_actions_counter = Arc::new(AtomicU64::new(0)); + let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + + // Create iterator over actions for checkpoint data + let checkpoint_data = checkpoint_actions_iter( + self.replay_for_checkpoint_data(engine)?, + total_actions_counter.clone(), + total_add_actions_counter.clone(), + deleted_file_retention_timestamp, + ); + + // Generate checkpoint path based on builder configuration + // Classic naming is required for V1 checkpoints and optional for V2 checkpoints + let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported { + ParsedLogPath::new_classic_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + )? + } else { + ParsedLogPath::new_uuid_parquet_checkpoint( + self.snapshot.table_root(), + self.snapshot.version(), + )? + }; + + let data = SingleFileCheckpointData { + data: Box::new(checkpoint_data), + path: checkpoint_path.location, + }; + + Ok(CheckpointWriter::new( + Some(data), + total_actions_counter, + total_add_actions_counter, + )) + } + + /// Prepares the iterator over actions for checkpoint creation + /// + /// This method is factored out to facilitate testing and returns an iterator + /// over all actions to be included in the checkpoint. 
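For reference, a small sketch of the file names produced by the naming branch in `build()` above (illustration only, not part of this patch); the UUID shown is a placeholder, since `new_uuid_parquet_checkpoint` generates a fresh v4 UUID on every call.

// classic naming: _delta_log/00000000000000000002.checkpoint.parquet
// UUID naming:    _delta_log/00000000000000000002.checkpoint.<some-uuid-v4>.parquet

/// Minimal reproduction of the zero-padded classic checkpoint file name.
fn classic_checkpoint_filename(version: u64) -> String {
    format!("{:020}.checkpoint.parquet", version)
}

#[test]
fn classic_checkpoint_filename_is_zero_padded() {
    assert_eq!(
        classic_checkpoint_filename(2),
        "00000000000000000002.checkpoint.parquet"
    );
}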
+ fn replay_for_checkpoint_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult, bool)>> + Send> { + let read_schema = get_checkpoint_read_schema(); + + self.snapshot.log_segment().read_actions( + engine, + read_schema.clone(), + read_schema.clone(), + None, + ) + } + + /// Calculates the cutoff timestamp for deleted file cleanup. + /// + /// This function determines the minimum timestamp before which deleted files + /// will be permanently removed during VACUUM operations, based on the table's + /// deleted_file_retention_duration property. + /// + /// Returns the cutoff timestamp in milliseconds since epoch, matching + /// the remove action's deletion_timestamp format for comparison. + /// + /// The default retention period is 7 days, matching delta-spark's behavior. + pub(crate) fn deleted_file_retention_timestamp(&self) -> DeltaResult { + let retention_duration = self + .snapshot + .table_properties() + .deleted_file_retention_duration; + + deleted_file_retention_timestamp_with_time( + retention_duration, + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| Error::generic(format!("Failed to calculate system time: {}", e)))?, + ) + } +} + +/// Internal implementation with injectable time parameter for testing +fn deleted_file_retention_timestamp_with_time( + retention_duration: Option, + now_duration: Duration, +) -> DeltaResult { + // Use provided retention duration or default (7 days) + let retention_duration = + retention_duration.unwrap_or_else(|| Duration::from_secs(60 * 60 * 24 * 7)); + + // Convert to milliseconds for remove action deletion_timestamp comparison + let now_ms: i64 = now_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Current timestamp exceeds i64 millisecond range"))?; + + let retention_ms: i64 = retention_duration + .as_millis() + .try_into() + .map_err(|_| Error::generic("Retention duration exceeds i64 millisecond range"))?; + + // Simple subtraction - will produce negative values if retention > now + Ok(now_ms - retention_ms) +} + +#[cfg(test)] +mod unit_tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { + let now = Duration::from_secs(1000).as_millis() as i64; + + // Test cases + let test_cases = [ + // Default case (7 days) + (None, now - (7 * 24 * 60 * 60 * 1000)), + // Zero retention + (Some(Duration::from_secs(0)), now), + // Custom retention (2000 seconds) + // This results in a negative timestamp which is valid - as it just means that + // the retention window extends to before UNIX epoch. + (Some(Duration::from_secs(2000)), now - (2000 * 1000)), + ]; + + for (retention, expected) in test_cases { + let result = + deleted_file_retention_timestamp_with_time(retention, Duration::from_secs(1000))?; + assert_eq!(result, expected); + } + + Ok(()) + } +} diff --git a/kernel/src/checkpoints/tests.rs b/kernel/src/checkpoints/tests.rs new file mode 100644 index 000000000..6975e73bc --- /dev/null +++ b/kernel/src/checkpoints/tests.rs @@ -0,0 +1,183 @@ +use std::sync::Arc; + +use object_store::{memory::InMemory, path::Path, ObjectStore}; +use test_utils::delta_path_for_version; +use url::Url; + +use crate::{ + actions::{Add, Metadata, Protocol, Remove}, + engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, + utils::test_utils::Action, + DeltaResult, Table, +}; + +// Create an in-memory store and return the store and the URL for the store's _delta_log directory. 
+fn new_in_memory_store() -> (Arc, Url) { + ( + Arc::new(InMemory::new()), + Url::parse("memory:///") + .unwrap() + .join("_delta_log/") + .unwrap(), + ) +} + +/// Writes all actions to a _delta_log json commit file in the store. +/// This function formats the provided filename into the _delta_log directory. +fn write_commit_to_store( + store: &Arc, + actions: Vec, + version: u64, +) -> DeltaResult<()> { + let json_lines: Vec = actions + .into_iter() + .map(|action| serde_json::to_string(&action).expect("action to string")) + .collect(); + let content = json_lines.join("\n"); + + let commit_path = format!("_delta_log/{}", delta_path_for_version(version, "json")); + + tokio::runtime::Runtime::new() + .expect("create tokio runtime") + .block_on(async { store.put(&Path::from(commit_path), content.into()).await })?; + + Ok(()) +} + +#[test] +fn test_checkpoint_latest_version_by_default() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // 1st commit: adds `fake_path_1` + write_commit_to_store( + &store, + vec![Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + })], + 0, + )?; + + // 2nd commit: adds `fake_path_2` & removes `fake_path_1` + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "fake_path_2".into(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + }), + ], + 1, + )?; + + // 3rd commit: metadata & protocol actions + write_commit_to_store( + &store, + vec![ + Action::Metadata(Metadata { + id: "fake_path_1".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + ], + 2, + )?; + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?; + let checkpoint_data = checkpointer.get_checkpoint_info()?; + let mut data_iter = checkpoint_data.data; + assert_eq!( + checkpoint_data.path, + Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? + ); + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // The second batch should be the add action as the remove action is expired. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, false]); + + // The third batch should not be included as the selection vector does not + // contain any true values, as the add action is removed in a following commit. 
+ assert!(data_iter.next().is_none()); + + Ok(()) +} + +/// Test that `checkpoint` works with a specific version parameter +#[test] +fn test_checkpoint_specific_version() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + // Create test actions + // 1st commit (version 0) - metadata and protocol actions + write_commit_to_store( + &store, + vec![ + Action::Protocol(Protocol::try_new(3, 7, Vec::::new().into(), Vec::::new().into())?), + Action::Metadata(Metadata { + id: "test-table-v0".into(), + schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(), + ..Default::default() + }), + ], + 0, + )?; + + // 2nd commit (version 1) - add and remove actions + write_commit_to_store( + &store, + vec![ + Action::Add(Add { + path: "file1.parquet".into(), + data_change: true, + ..Default::default() + }), + Action::Add(Add { + path: "file2.parquet".into(), + data_change: true, + ..Default::default() + }), + ], + 1, + )?; + + // Initialize the table + let table_root = Url::parse("memory:///")?; + let table = Table::new(table_root); + + // Create the V1CheckpointFileIterator for version 1 specifically + let mut checkpointer = table.checkpoint(&engine, Some(0))?.build(&engine)?; + + // Get the file data iterator + let checkpoint_data = checkpointer.get_checkpoint_info()?; + + // Verify checkpoint file path is for version 0 + let expected_path = Url::parse("memory:///_delta_log/00000000000000000000.checkpoint.parquet")?; + assert_eq!(checkpoint_data.path, expected_path); + + let mut data_iter = checkpoint_data.data; + + // The first batch should be the metadata and protocol actions. + let checkpoint_data = data_iter.next().unwrap()?; + assert_eq!(checkpoint_data.selection_vector, [true, true]); + + // No more data should exist because we only requested version 0 + assert!(data_iter.next().is_none()); + + Ok(()) +} diff --git a/kernel/src/path.rs b/kernel/src/path.rs index df372f08e..1cdab91a9 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -2,6 +2,7 @@ use std::str::FromStr; use url::Url; +use uuid::Uuid; use crate::{DeltaResult, Error, FileMeta, Version}; @@ -196,6 +197,42 @@ impl ParsedLogPath { } Ok(path) } + + /// Create a new ParsedCommitPath for a classic-named parquet checkpoint file at the specified version + pub(crate) fn new_classic_parquet_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + let filename = format!("{:020}.checkpoint.parquet", version); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? + .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_classic_parquet_checkpoint created a non-checkpoint path", + )); + } + Ok(path) + } + + /// Create a new ParsedCommitPath for a uuid-named parquet checkpoint file at the specified version + pub(crate) fn new_uuid_parquet_checkpoint( + table_root: &Url, + version: Version, + ) -> DeltaResult> { + // Generate a random UUID v4 + let uuid = Uuid::new_v4().to_string(); + let filename = format!("{:020}.checkpoint.{}.parquet", version, uuid); + let location = table_root.join("_delta_log/")?.join(&filename)?; + let path = Self::try_from(location)? 
+ .ok_or_else(|| Error::internal_error("attempted to create invalid checkpoint path"))?; + if !path.is_checkpoint() { + return Err(Error::internal_error( + "ParsedLogPath::new_uuid_parquet_checkpoint created a non-checkpoint path", + )); + } + Ok(path) + } } #[cfg(test)] @@ -566,4 +603,45 @@ mod tests { assert!(matches!(log_path.file_type, LogPathFileType::Commit)); assert_eq!(log_path.filename, "00000000000000000010.json"); } + #[test] + fn test_new_uuid_parquet_checkpoint() { + let table_log_dir = table_log_dir_url(); + let log_path = ParsedLogPath::new_uuid_parquet_checkpoint(&table_log_dir, 10).unwrap(); + + // Basic properties + assert_eq!(log_path.version, 10); + assert!(log_path.is_checkpoint()); + assert_eq!(log_path.extension, "parquet"); + assert!(matches!( + log_path.file_type, + LogPathFileType::UuidCheckpoint(_) + )); + + // Filename structure + let parts: Vec<&str> = log_path.filename.split('.').collect(); + assert_eq!(parts.len(), 4); + assert_eq!(parts[0], "00000000000000000010"); + assert_eq!(parts[1], "checkpoint"); + assert_eq!(parts[3], "parquet"); + + // Validate UUID + assert!(!parts[2].is_empty()); + assert!(Uuid::parse_str(parts[2]).is_ok()); + } + + #[test] + fn test_new_classic_parquet_checkpoint() { + let table_log_dir = table_log_dir_url(); + let log_path = ParsedLogPath::new_classic_parquet_checkpoint(&table_log_dir, 10).unwrap(); + + // Basic properties + assert_eq!(log_path.version, 10); + assert!(log_path.is_checkpoint()); + assert_eq!(log_path.extension, "parquet"); + assert!(matches!( + log_path.file_type, + LogPathFileType::SinglePartCheckpoint + )); + assert_eq!(log_path.filename, "00000000000000000010.checkpoint.parquet"); + } } diff --git a/kernel/src/table.rs b/kernel/src/table.rs index 97e1596d7..36bdc4743 100644 --- a/kernel/src/table.rs +++ b/kernel/src/table.rs @@ -7,6 +7,7 @@ use std::path::PathBuf; use url::Url; +use crate::checkpoints::CheckpointBuilder; use crate::snapshot::Snapshot; use crate::table_changes::TableChanges; use crate::transaction::Transaction; @@ -98,6 +99,20 @@ impl Table { ) } + /// Creates a [`CheckpointBuilder`] for generating table checkpoints. + /// + /// Checkpoints are compact representations of the table state that improve reading performance. + /// Supports three checkpoint types: Classic V1 (legacy tables), Classic V2 (backwards + /// compatibility), and UUID V2 (recommended for small/medium tables with v2Checkpoints feature). + pub fn checkpoint( + &self, + engine: &dyn Engine, + version: Option, + ) -> DeltaResult { + let snapshot = self.snapshot(engine, version)?; + Ok(CheckpointBuilder::new(snapshot)) + } + /// Create a new write transaction for this table. pub fn new_transaction(&self, engine: &dyn Engine) -> DeltaResult { Transaction::try_new(self.snapshot(engine, None)?) diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index e2d287b60..3b659615d 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -238,6 +238,25 @@ impl TableConfiguration { version => (2..=6).contains(&version), } } + + /// Returns `true` if V2 checkpoint is supported on this table. To support V2 checkpoint, + /// a table must support reader version 3, writer version 7, and the v2Checkpoint feature in + /// both the protocol's readerFeatures and writerFeatures. 
+ /// + /// See: + #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] + #[allow(unused)] // needed to compile w/o default features + pub(crate) fn is_v2_checkpoint_supported(&self) -> bool { + let read_supported = self + .protocol() + .has_reader_feature(&ReaderFeatures::V2Checkpoint) + && self.protocol.min_reader_version() == 3; + let write_supported = self + .protocol() + .has_writer_feature(&WriterFeatures::V2Checkpoint) + && self.protocol.min_writer_version() == 7; + read_supported && write_supported + } } #[cfg(test)] From 5abba3daf12a4a934a35b8cb42e91e07a7bdfcca Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:42:46 -0700 Subject: [PATCH 37/45] usize & ulong type conversions --- ffi/src/expressions/kernel.rs | 15 ++++ ffi/src/schema.rs | 20 +++++ kernel/src/actions/schemas.rs | 2 + kernel/src/engine/arrow_conversion.rs | 14 +++ kernel/src/engine/arrow_expression/mod.rs | 29 ++++++ .../src/engine/parquet_row_group_skipping.rs | 89 +++++++++++++++++++ kernel/src/expressions/scalars.rs | 26 ++++++ kernel/src/schema/mod.rs | 10 +++ 8 files changed, 205 insertions(+) diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs index c8ce1b2d4..5abe69f77 100644 --- a/ffi/src/expressions/kernel.rs +++ b/ffi/src/expressions/kernel.rs @@ -65,6 +65,10 @@ pub struct EngineExpressionVisitor { pub visit_literal_int: VisitLiteralFn, /// Visit a 64bit `long` belonging to the list identified by `sibling_list_id`. pub visit_literal_long: VisitLiteralFn, + /// Visit a 64bit unsigned `long` belonging to the list identified by `sibling_list_id`. + pub visit_literal_ulong: VisitLiteralFn, + /// Visit a 32bit unsigned `integer` int belonging to the list identified by `sibling_list_id`. + pub visit_literal_uint: VisitLiteralFn, /// Visit a 16bit `short` belonging to the list identified by `sibling_list_id`. pub visit_literal_short: VisitLiteralFn, /// Visit an 8bit `byte` belonging to the list identified by `sibling_list_id`. @@ -292,6 +296,17 @@ fn visit_expression_internal( match scalar { Scalar::Integer(val) => call!(visitor, visit_literal_int, sibling_list_id, *val), Scalar::Long(val) => call!(visitor, visit_literal_long, sibling_list_id, *val), + Scalar::ULong(val) => call!(visitor, visit_literal_ulong, sibling_list_id, *val), // TODO: Fix typecast + Scalar::USize(val) => { + #[cfg(target_pointer_width = "32")] + { + call!(visitor, visit_literal_uint, sibling_list_id, *val as u64) + } + #[cfg(target_pointer_width = "64")] + { + call!(visitor, visit_literal_ulong, sibling_list_id, *val as u64) + } + } Scalar::Short(val) => call!(visitor, visit_literal_short, sibling_list_id, *val), Scalar::Byte(val) => call!(visitor, visit_literal_byte, sibling_list_id, *val), Scalar::Float(val) => call!(visitor, visit_literal_float, sibling_list_id, *val), diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index a474c80c3..0cd1ed423 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -102,6 +102,24 @@ pub struct EngineSchemaVisitor { metadata: &CStringMap, ), + /// Visit a `ulong` belonging to the list identified by `sibling_list_id`. + pub visit_ulong: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + name: KernelStringSlice, + is_nullable: bool, + metadata: &CStringMap, + ), + + /// Visit a `usize` belonging to the list identified by `sibling_list_id`. 
+ pub visit_usize: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + name: KernelStringSlice, + is_nullable: bool, + metadata: &CStringMap, + ), + /// Visit an `integer` belonging to the list identified by `sibling_list_id`. pub visit_integer: extern "C" fn( data: *mut c_void, @@ -308,6 +326,8 @@ fn visit_schema_impl(schema: &StructType, visitor: &mut EngineSchemaVisitor) -> } &DataType::STRING => call!(visit_string), &DataType::LONG => call!(visit_long), + &DataType::ULONG => call!(visit_ulong), + &DataType::USIZE => call!(visit_usize), &DataType::INTEGER => call!(visit_integer), &DataType::SHORT => call!(visit_short), &DataType::BYTE => call!(visit_byte), diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index aa3b3e47b..dfccbd028 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -36,6 +36,8 @@ macro_rules! impl_to_data_type { impl_to_data_type!( (String, DataType::STRING), + (u64, DataType::ULONG), + (usize, DataType::USIZE), (i64, DataType::LONG), (i32, DataType::INTEGER), (i16, DataType::SHORT), diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index a425cd143..6242d27bd 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -100,6 +100,20 @@ impl TryFrom<&DataType> for ArrowDataType { match p { PrimitiveType::String => Ok(ArrowDataType::Utf8), PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::ULong => Ok(ArrowDataType::UInt64), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct arrow type to use. + PrimitiveType::USize => { + #[cfg(target_pointer_width = "32")] + { + Ok(ArrowDataType::UInt32) + } + + #[cfg(target_pointer_width = "64")] + { + Ok(ArrowDataType::UInt64) + } + } PrimitiveType::Integer => Ok(ArrowDataType::Int32), PrimitiveType::Short => Ok(ArrowDataType::Int16), PrimitiveType::Byte => Ok(ArrowDataType::Int8), diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 6e15d10bd..621ff7755 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -19,6 +19,7 @@ use crate::expressions::{Expression, Scalar}; use crate::schema::{DataType, PrimitiveType, SchemaRef}; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; +use arrow_53::array::UInt64Array; use itertools::Itertools; use tracing::debug; @@ -40,6 +41,20 @@ impl Scalar { let arr: ArrayRef = match self { Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), + ULong(val) => Arc::new(UInt64Array::from_value(*val, num_rows)), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct array type to use. 
+ USize(val) => { + #[cfg(target_pointer_width = "32")] + { + Arc::new(UInt32Array::from_value(*val as u32, num_rows)) + } + + #[cfg(target_pointer_width = "64")] + { + Arc::new(UInt64Array::from_value(*val as u64, num_rows)) + } + } Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), @@ -88,6 +103,20 @@ impl Scalar { Null(DataType::SHORT) => Arc::new(Int16Array::new_null(num_rows)), Null(DataType::INTEGER) => Arc::new(Int32Array::new_null(num_rows)), Null(DataType::LONG) => Arc::new(Int64Array::new_null(num_rows)), + Null(DataType::ULONG) => Arc::new(UInt64Array::new_null(num_rows)), + // Since usize is platform dependent, we need to check the target_pointer_width + // to determine the correct array type to use. + Null(DataType::USIZE) => { + #[cfg(target_pointer_width = "32")] + { + Arc::new(UInt32Array::new_null(num_rows)) + } + + #[cfg(target_pointer_width = "64")] + { + Arc::new(UInt64Array::new_null(num_rows)) + } + } Null(DataType::FLOAT) => Arc::new(Float32Array::new_null(num_rows)), Null(DataType::DOUBLE) => Arc::new(Float64Array::new_null(num_rows)), Null(DataType::STRING) => Arc::new(StringArray::new_null(num_rows)), diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index fbce2f913..c9d78fbdd 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -105,6 +105,50 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.min_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), (Long, _) => return None, + (ULong, Statistics::Int64(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + // Handling ULong type with Int32 statistics + (ULong, Statistics::Int32(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + (ULong, _) => return None, + // Handling USize type on 64-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + // Handling USize type on 64-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int32(s)) => + // Attempt to convert value to usize, converting from u64 if needed + { + usize::try_from(*s.min_opt()? as u64).ok()?.into() + } + // Handling USize type on 32-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, ensuring it's cast to u32 first + { + usize::try_from(*s.min_opt()? as u32).ok()?.into() + } + + // Handling USize type on 32-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int32(s)) => + // Attempt to convert vvalue to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + (USize, _) => return None, (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.min_opt()? 
as i16).into(), @@ -147,6 +191,51 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.max_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), (Long, _) => return None, + (ULong, Statistics::Int64(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + // Handling ULong type with Int32 statistics + (ULong, Statistics::Int32(s)) => + // Attempt to convert value to u64, return None if conversion fails + { + u64::try_from(*s.min_opt()?).ok()?.into() + } + + (ULong, _) => return None, + // Handling USize type on 64-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + // Handling USize type on 64-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "64")] + (USize, Statistics::Int32(s)) => + // Attempt to convert value to usize, converting from u64 if needed + { + usize::try_from(*s.min_opt()? as u64).ok()?.into() + } + // Handling USize type on 32-bit architecture with Int64 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int64(s)) => + // Attempt to convert value to usize, ensuring it's cast to u32 first + { + usize::try_from(*s.min_opt()? as u32).ok()?.into() + } + + // Handling USize type on 32-bit architecture with Int32 statistics + #[cfg(target_pointer_width = "32")] + (USize, Statistics::Int32(s)) => + // Attempt to convert vvalue to usize, return None if conversion fails + { + usize::try_from(*s.min_opt()?).ok()?.into() + } + (USize, _) => return None, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.max_opt()? 
as i16).into(), diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 90f5358a6..f5e887fad 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -96,6 +96,10 @@ pub enum Scalar { Integer(i32), /// 64bit integer Long(i64), + // unsigned 64bit integer + ULong(u64), + // usize + USize(usize), /// 16bit integer Short(i16), /// 8bit integer @@ -131,6 +135,8 @@ impl Scalar { match self { Self::Integer(_) => DataType::INTEGER, Self::Long(_) => DataType::LONG, + Self::ULong(_) => DataType::ULONG, + Self::USize(_) => DataType::USIZE, Self::Short(_) => DataType::SHORT, Self::Byte(_) => DataType::BYTE, Self::Float(_) => DataType::FLOAT, @@ -169,6 +175,8 @@ impl Display for Scalar { match self { Self::Integer(i) => write!(f, "{}", i), Self::Long(i) => write!(f, "{}", i), + Self::ULong(i) => write!(f, "{}", i), + Self::USize(i) => write!(f, "{}", i), Self::Short(i) => write!(f, "{}", i), Self::Byte(i) => write!(f, "{}", i), Self::Float(fl) => write!(f, "{}", fl), @@ -241,6 +249,10 @@ impl PartialOrd for Scalar { (Integer(_), _) => None, (Long(a), Long(b)) => a.partial_cmp(b), (Long(_), _) => None, + (ULong(a), ULong(b)) => a.partial_cmp(b), + (ULong(_), _) => None, + (USize(a), USize(b)) => a.partial_cmp(b), + (USize(_), _) => None, (Short(a), Short(b)) => a.partial_cmp(b), (Short(_), _) => None, (Byte(a), Byte(b)) => a.partial_cmp(b), @@ -338,6 +350,18 @@ impl From<&[u8]> for Scalar { } } +impl From for Scalar { + fn from(u: u64) -> Self { + Self::ULong(u.into()) + } +} + +impl From for Scalar { + fn from(u: usize) -> Self { + Self::USize(u.into()) + } +} + // TODO: add more From impls impl PrimitiveType { @@ -378,6 +402,8 @@ impl PrimitiveType { Short => self.parse_str_as_scalar(raw, Scalar::Short), Integer => self.parse_str_as_scalar(raw, Scalar::Integer), Long => self.parse_str_as_scalar(raw, Scalar::Long), + ULong => self.parse_str_as_scalar(raw, Scalar::ULong), + USize => self.parse_str_as_scalar(raw, Scalar::USize), Float => self.parse_str_as_scalar(raw, Scalar::Float), Double => self.parse_str_as_scalar(raw, Scalar::Double), Boolean => { diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs index 3a5648b57..ede497305 100644 --- a/kernel/src/schema/mod.rs +++ b/kernel/src/schema/mod.rs @@ -493,6 +493,12 @@ pub enum PrimitiveType { String, /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 Long, + /// u64: 8-byte unsigned integer. Range: 0 to 18446744073709551615 + ULong, + /// usize: Platform-dependent unsigned integer. Typically used for indexing and memory sizes. + /// - 64-bit platforms: Range 0 to 18_446_744_073_709_551_615 + /// - 32-bit platforms: Range 0 to 4_294_967_295 + USize, /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 Integer, /// i16: 2-byte signed integer numbers. 
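The new variants also thread through the rest of the `Scalar` API shown above (the `From` impls, `data_type`, `Display`, and `PartialOrd`). A usage sketch, assuming the crate's public `expressions`/`schema` paths and the state of the code as of this patch (these additions are reverted again in patch 44 of the series):

```rust
use delta_kernel::expressions::Scalar;
use delta_kernel::schema::DataType;

fn main() {
    // From<u64> and From<usize> map onto the new variants.
    let a: Scalar = 7u64.into();
    let b: Scalar = 7usize.into();
    assert_eq!(a.data_type(), DataType::ULONG);
    assert_eq!(b.data_type(), DataType::USIZE);

    // PartialOrd is only defined within a variant; mixed comparisons yield None.
    assert!(a.partial_cmp(&b).is_none());

    // Display prints the raw numeric value.
    assert_eq!(format!("{a}"), "7");
}
```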
Range: -32768 to 32767 @@ -559,6 +565,8 @@ impl Display for PrimitiveType { match self { PrimitiveType::String => write!(f, "string"), PrimitiveType::Long => write!(f, "long"), + PrimitiveType::ULong => write!(f, "ulong"), + PrimitiveType::USize => write!(f, "usize"), PrimitiveType::Integer => write!(f, "integer"), PrimitiveType::Short => write!(f, "short"), PrimitiveType::Byte => write!(f, "byte"), @@ -624,6 +632,8 @@ impl From for DataType { impl DataType { pub const STRING: Self = DataType::Primitive(PrimitiveType::String); pub const LONG: Self = DataType::Primitive(PrimitiveType::Long); + pub const ULONG: Self = DataType::Primitive(PrimitiveType::ULong); + pub const USIZE: Self = DataType::Primitive(PrimitiveType::USize); pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer); pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short); pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte); From 80fc9360ee690fa6d6210eeb08cdab4b27065fa2 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:50:34 -0700 Subject: [PATCH 38/45] finalize_checkpoint API --- kernel/src/actions/visitors.rs | 6 +- kernel/src/checkpoints/log_replay.rs | 20 +- kernel/src/checkpoints/mod.rs | 336 ++++++++++++++++++++++++++- kernel/src/transaction.rs | 1 + 4 files changed, 338 insertions(+), 25 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 76c83b1c3..d46aa8b0a 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -504,15 +504,15 @@ pub(crate) struct CheckpointVisitor<'seen> { // File actions deduplication state pub(crate) deduplicator: FileActionDeduplicator<'seen>, pub(crate) selection_vector: Vec, - pub(crate) total_file_actions: u64, - pub(crate) total_add_actions: u64, + pub(crate) total_file_actions: i64, + pub(crate) total_add_actions: i64, pub(crate) minimum_file_retention_timestamp: i64, // Non-file actions deduplication state pub(crate) seen_protocol: bool, pub(crate) seen_metadata: bool, pub(crate) seen_txns: &'seen mut HashSet, - pub(crate) total_non_file_actions: u64, + pub(crate) total_non_file_actions: i64, } #[allow(unused)] diff --git a/kernel/src/checkpoints/log_replay.rs b/kernel/src/checkpoints/log_replay.rs index cbad63305..cab607324 100644 --- a/kernel/src/checkpoints/log_replay.rs +++ b/kernel/src/checkpoints/log_replay.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::actions::visitors::CheckpointVisitor; @@ -27,10 +27,10 @@ struct CheckpointLogReplayProcessor { seen_file_keys: HashSet, /// Counter for the total number of actions processed during log replay. - total_actions: Arc, + total_actions: Arc, /// Counter for the total number of add actions processed during log replay. - total_add_actions: Arc, + total_add_actions: Arc, /// Indicates whether a protocol action has been seen in the log. 
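Stepping back from the field list: the action and add-file counters threaded through this processor are plain `Arc<AtomicI64>` handles shared with the eventual checkpoint writer, signed presumably so they can be dropped straight into the `long`-typed `_last_checkpoint` fields. A minimal sketch of that sharing pattern (the increment site is not shown in this hunk, so the `fetch_add` below is only illustrative; `SeqCst` matches the load used later during finalize):

```rust
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;

fn main() {
    // One handle stays with the log-replay side, the other with the writer
    // that later builds the _last_checkpoint metadata.
    let total_actions = Arc::new(AtomicI64::new(0));
    let replay_side = Arc::clone(&total_actions);

    // Each processed batch bumps the counter by the number of selected actions.
    for selected_in_batch in [2_i64, 5, 3] {
        replay_side.fetch_add(selected_in_batch, Ordering::SeqCst);
    }

    assert_eq!(total_actions.load(Ordering::SeqCst), 10);
}
```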
seen_protocol: bool, @@ -116,8 +116,8 @@ impl LogReplayProcessor for CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented impl CheckpointLogReplayProcessor { pub(super) fn new( - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> Self { Self { @@ -143,8 +143,8 @@ impl CheckpointLogReplayProcessor { #[allow(unused)] // TODO: Remove once checkpoint_v1 API is implemented pub(crate) fn checkpoint_actions_iter( action_iter: impl Iterator, bool)>> + Send + 'static, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, minimum_file_retention_timestamp: i64, ) -> impl Iterator> + Send + 'static { let mut log_scanner = CheckpointLogReplayProcessor::new( @@ -158,7 +158,7 @@ pub(crate) fn checkpoint_actions_iter( #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; use crate::arrow::array::StringArray; @@ -172,8 +172,8 @@ mod tests { #[test] fn test_v1_checkpoint_actions_iter_multi_batch_integration() -> DeltaResult<()> { // Setup counters - let total_actions_counter = Arc::new(AtomicU64::new(0)); - let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create first batch with protocol, metadata, and some files let json_strings1: StringArray = vec![ diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index f5a764779..72311b259 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -27,11 +27,17 @@ //! 
``` use log_replay::{checkpoint_actions_iter, CheckpointData}; use std::{ - sync::{atomic::AtomicU64, Arc, LazyLock}, + sync::{ + atomic::{AtomicI64, Ordering}, + Arc, LazyLock, + }, time::{Duration, SystemTime, UNIX_EPOCH}, }; use url::Url; +use crate::actions::schemas::GetStructField; +use crate::expressions::column_expr; +use crate::schema::{SchemaRef, StructType}; use crate::{ actions::{ Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, @@ -39,15 +45,30 @@ use crate::{ }, path::ParsedLogPath, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, + DeltaResult, Engine, EngineData, Error, Expression, Version, }; - -use crate::actions::schemas::GetStructField; -use crate::schema::{SchemaRef, StructType}; pub mod log_replay; #[cfg(test)] mod tests; +/// Schema definition for the _last_checkpoint file +pub(crate) static CHECKPOINT_METADATA_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new(vec![ + ::get_struct_field("version"), + ::get_struct_field("size"), + Option::::get_struct_field("parts"), + Option::::get_struct_field("sizeInBytes"), + Option::::get_struct_field("numOfAddFiles"), + // Option::::get_struct_field("checkpoint_schema"), TODO: Schema + // Option::::get_struct_field("checksum"), TODO: Checksum + ])) +}); + +/// Get the expected schema for the _last_checkpoint file +pub fn get_checkpoint_metadata_schema() -> &'static SchemaRef { + &CHECKPOINT_METADATA_SCHEMA +} + /// Read schema definition for collecting checkpoint actions static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -90,23 +111,38 @@ pub struct CheckpointWriter { single_file_checkpoint_data: Option, /// Counter for the total number of actions in the checkpoint - total_actions_counter: Arc, + total_actions_counter: Arc, /// Counter for add file actions specifically - total_add_actions_counter: Arc, + total_add_actions_counter: Arc, + + /// Version of the checkpoint + version: Version, + + /// Number of parts of the checkpoint + parts: usize, + + /// Path to table's log + log_root: Url, } impl CheckpointWriter { /// Creates a new CheckpointWriter with the provided checkpoint data and counters fn new( single_file_checkpoint_data: Option, - total_actions_counter: Arc, - total_add_actions_counter: Arc, + total_actions_counter: Arc, + total_add_actions_counter: Arc, + version: Version, + parts: usize, + log_root: Url, ) -> Self { Self { single_file_checkpoint_data, total_actions_counter, total_add_actions_counter, + version, + parts, + log_root, } } @@ -126,9 +162,75 @@ impl CheckpointWriter { /// This method should be only called AFTER writing all checkpoint data to /// ensure proper completion of the checkpoint operation, which includes /// writing the _last_checkpoint file. - pub fn finalize_checkpoint(self) -> DeltaResult<()> { + /// + /// Metadata is a single-row EngineData batch with {size_in_bytes: i64} + /// Given the engine collected checkpoint metadata we want to extend + /// the EngineData batch with the remaining fields for the `_last_checkpoint` + /// file. 
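Ahead of the `finalize_checkpoint` signature below, a hedged sketch of the engine-side call: the writer type and the method come from this patch, while the module path, the earlier step that writes the checkpoint file itself, and the one-row `sizeInBytes` batch are assumptions made only for illustration.

```rust
use delta_kernel::{DeltaResult, Engine, EngineData};
// Assumed module path for the new checkpoints module.
use delta_kernel::checkpoints::CheckpointWriter;

/// Engine-side sketch: by the time this runs, the engine has already written the
/// checkpoint file and packed its size into `metadata`, a single-row batch whose
/// only column is `sizeInBytes`.
fn finish_checkpoint(
    engine: &dyn Engine,
    writer: CheckpointWriter,
    metadata: Box<dyn EngineData>,
) -> DeltaResult<()> {
    // Consumes the writer and emits _last_checkpoint.json under the log root.
    writer.finalize_checkpoint(engine, &*metadata)
}
```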
+ + pub fn finalize_checkpoint( + self, + engine: &dyn Engine, + metadata: &dyn EngineData, + ) -> DeltaResult<()> { + // Prepare the checkpoint metadata + let checkpoint_metadata = self.prepare_last_checkpoint_metadata(engine, metadata)?; + + // Write the metadata to _last_checkpoint.json + let last_checkpoint_path = self.log_root.join("_last_checkpoint.json")?; + + engine.get_json_handler().write_json_file( + &last_checkpoint_path, + Box::new(std::iter::once(Ok(checkpoint_metadata))), + true, // overwrite the last checkpoint file + )?; + Ok(()) } + + /// Prepares the _last_checkpoint metadata batch + /// + /// This method validates and transforms the engine-provided metadata into + /// the complete checkpoint metadata including counters and versioning information. + /// + /// Refactored into a separate method to facilitate testing. + fn prepare_last_checkpoint_metadata( + &self, + engine: &dyn Engine, + metadata: &dyn EngineData, + ) -> DeltaResult> { + // Validate metadata has exactly one row + if metadata.len() != 1 { + return Err(Error::Generic(format!( + "Engine checkpoint metadata should have exactly one row, found {}", + metadata.len() + ))); + } + + // Create expression for transforming the metadata + let last_checkpoint_exprs = [ + Expression::literal(self.version), + Expression::literal(self.total_actions_counter.load(Ordering::SeqCst)), + Expression::literal(self.parts), + column_expr!("sizeInBytes"), + Expression::literal(self.total_add_actions_counter.load(Ordering::SeqCst)), + ]; + let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); + + // Get schemas for transformation + let last_checkpoint_schema = get_checkpoint_metadata_schema(); + let engine_metadata_schema = last_checkpoint_schema.project_as_struct(&["sizeInBytes"])?; + + // Create the evaluator for the transformation + let last_checkpoint_metadata_evaluator = engine.get_expression_handler().get_evaluator( + engine_metadata_schema.into(), + last_checkpoint_expr, + last_checkpoint_schema.clone().into(), + ); + + // Transform the metadata + Ok(last_checkpoint_metadata_evaluator.evaluate(metadata)?) 
+ } } /// Builder for configuring and creating CheckpointWriter instances @@ -183,8 +285,8 @@ impl CheckpointBuilder { let deleted_file_retention_timestamp = self.deleted_file_retention_timestamp()?; // Create counters for tracking actions - let total_actions_counter = Arc::new(AtomicU64::new(0)); - let total_add_actions_counter = Arc::new(AtomicU64::new(0)); + let total_actions_counter = Arc::new(AtomicI64::new(0)); + let total_add_actions_counter = Arc::new(AtomicI64::new(0)); // Create iterator over actions for checkpoint data let checkpoint_data = checkpoint_actions_iter( @@ -217,6 +319,9 @@ impl CheckpointBuilder { Some(data), total_actions_counter, total_add_actions_counter, + self.snapshot.version(), + 1, + self.snapshot.log_segment().log_root.clone(), )) } @@ -290,7 +395,71 @@ fn deleted_file_retention_timestamp_with_time( #[cfg(test)] mod unit_tests { use super::*; + use crate::arrow::array::Int64Array; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; + use crate::arrow::record_batch::RecordBatch; + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::arrow_expression::ArrowExpressionHandler; + use crate::{ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler}; + use arrow_53::json::LineDelimitedWriter; + use std::sync::{atomic::AtomicI64, Arc}; use std::time::Duration; + use url::Url; + + // Helper to serialize and extract the _last_checkpoint JSON for verification + fn as_json(data: Box) -> serde_json::Value { + let record_batch: RecordBatch = data + .into_any() + .downcast::() + .unwrap() + .into(); + + let buf = Vec::new(); + let mut writer = LineDelimitedWriter::new(buf); + writer.write_batches(&[&record_batch]).unwrap(); + writer.finish().unwrap(); + let buf = writer.into_inner(); + + serde_json::from_slice(&buf).unwrap() + } + + // TODO(seb): Merge with other definitions and move to a common test module + pub(crate) struct ExprEngine(Arc); + + impl ExprEngine { + pub(crate) fn new() -> Self { + ExprEngine(Arc::new(ArrowExpressionHandler)) + } + } + + impl Engine for ExprEngine { + fn get_expression_handler(&self) -> Arc { + self.0.clone() + } + + fn get_json_handler(&self) -> Arc { + unimplemented!() + } + + fn get_parquet_handler(&self) -> Arc { + unimplemented!() + } + + fn get_file_system_client(&self) -> Arc { + unimplemented!() + } + } + + /// Creates a mock engine metadata batch with size_in_bytes field + fn create_engine_metadata(size_in_bytes: i64) -> Box { + // Create Arrow schema with size_in_bytes field + let schema = ArrowSchema::new(vec![Field::new("sizeInBytes", ArrowDataType::Int64, false)]); + + let size_array = Int64Array::from(vec![size_in_bytes]); + let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(size_array)]) + .expect("Failed to create record batch"); + Box::new(ArrowEngineData::new(record_batch)) + } #[test] fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { @@ -316,4 +485,147 @@ mod unit_tests { Ok(()) } + + #[test] + fn test_prepare_last_checkpoint_metadata() -> DeltaResult<()> { + // Setup test data + let size_in_bytes: i64 = 1024 * 1024; // 1MB + let version: Version = 10; + let parts: usize = 3; + let total_actions_counter = Arc::new(AtomicI64::new(100)); + let total_add_actions_counter = Arc::new(AtomicI64::new(75)); + + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + let engine = ExprEngine::new(); + + // Create engine metadata with size_in_bytes + let metadata = create_engine_metadata(size_in_bytes); + + // Create 
checkpoint writer + let writer = CheckpointWriter::new( + None, // We don't need checkpoint data for this test + total_actions_counter.clone(), + total_add_actions_counter.clone(), + version, + parts, + log_root, + ); + + // Call the method under test + let last_checkpoint_batch = writer.prepare_last_checkpoint_metadata(&engine, &*metadata)?; + + // Convert to JSON for easier verification + let json = as_json(last_checkpoint_batch); + + // Verify the values match our expectations + assert_eq!(json["version"], version); + assert_eq!(json["size"], total_actions_counter.load(Ordering::Relaxed)); + assert_eq!(json["parts"], parts as i64); + assert_eq!(json["sizeInBytes"], size_in_bytes); + assert_eq!( + json["numOfAddFiles"], + total_add_actions_counter.load(Ordering::Relaxed) + ); + + Ok(()) + } + + #[test] + fn test_prepare_last_checkpoint_metadata_with_empty_batch() { + // Setup test data + let version: Version = 10; + let parts: usize = 3; + let total_actions_counter = Arc::new(AtomicI64::new(100)); + let total_add_actions_counter = Arc::new(AtomicI64::new(75)); + + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + let engine = ExprEngine::new(); + + // Create empty metadata (no rows) + let empty_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "sizeInBytes", + ArrowDataType::Int64, + false, + )])); + let empty_batch = RecordBatch::try_new( + empty_schema, + vec![Arc::new(Int64Array::from(Vec::::new()))], + ) + .expect("Failed to create empty batch"); + let empty_metadata = ArrowEngineData::new(empty_batch); + + // Create checkpoint writer + let writer = CheckpointWriter::new( + None, + total_actions_counter, + total_add_actions_counter, + version, + parts, + log_root, + ); + + // Call the method under test - should fail with InvalidCommitInfo + let result = writer.prepare_last_checkpoint_metadata(&engine, &empty_metadata); + assert!(result.is_err()); + + match result { + Err(Error::Generic(e)) => { + assert_eq!( + e, + "Engine checkpoint metadata should have exactly one row, found 0" + ); + } + _ => panic!("Should have failed with error"), + } + } + + #[test] + fn test_prepare_last_checkpoint_metadata_with_multiple_rows() { + // Setup test data + let version: Version = 10; + let parts: usize = 1; + let total_actions_counter = Arc::new(AtomicI64::new(50)); + let total_add_actions_counter = Arc::new(AtomicI64::new(30)); + + // Create a log root URL + let log_root = Url::parse("memory://test-table/_delta_log/").unwrap(); + + // Create engine + let engine = ExprEngine::new(); + + // Create metadata with multiple rows + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "sizeInBytes", + ArrowDataType::Int64, + false, + )])); + let multi_row_batch = + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1024, 2048]))]) + .expect("Failed to create multi-row batch"); + let multi_row_metadata = ArrowEngineData::new(multi_row_batch); + + // Create checkpoint writer + let writer = CheckpointWriter::new( + None, + total_actions_counter, + total_add_actions_counter, + version, + parts, + log_root, + ); + + // Call the method under test - should fail with InvalidCommitInfo + let result = writer.prepare_last_checkpoint_metadata(&engine, &multi_row_metadata); + assert!(result.is_err()); + + match result { + Err(Error::Generic(e)) => { + assert_eq!( + e, + "Engine checkpoint metadata should have exactly one row, found 2" + ); + } + _ => panic!("Should have failed with error"), + } + } } diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs index 
138d4fdef..94abba4be 100644 --- a/kernel/src/transaction.rs +++ b/kernel/src/transaction.rs @@ -344,6 +344,7 @@ mod tests { use crate::arrow::json::writer::LineDelimitedWriter; use crate::arrow::record_batch::RecordBatch; + // TODO(seb): Merge with other definitions and move to a common test module struct ExprEngine(Arc); impl ExprEngine { From 5e4df58d91920692349c9b9ef4462b8cde57da5e Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 13:55:39 -0700 Subject: [PATCH 39/45] nits --- kernel/src/checkpoints/mod.rs | 3 +-- kernel/src/engine/arrow_expression/mod.rs | 6 +++++- kernel/src/expressions/scalars.rs | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 72311b259..1b8ebf1b1 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -167,7 +167,6 @@ impl CheckpointWriter { /// Given the engine collected checkpoint metadata we want to extend /// the EngineData batch with the remaining fields for the `_last_checkpoint` /// file. - pub fn finalize_checkpoint( self, engine: &dyn Engine, @@ -229,7 +228,7 @@ impl CheckpointWriter { ); // Transform the metadata - Ok(last_checkpoint_metadata_evaluator.evaluate(metadata)?) + last_checkpoint_metadata_evaluator.evaluate(metadata) } } diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 621ff7755..1ff1834b7 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -1,6 +1,11 @@ //! Expression handling based on arrow-rs compute kernels. use std::sync::Arc; +#[cfg(target_pointer_width = "32")] +use crate::arrow::array::UInt32Array; +#[cfg(target_pointer_width = "64")] +use crate::arrow::array::UInt64Array; + use crate::arrow::array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, RecordBatch, @@ -19,7 +24,6 @@ use crate::expressions::{Expression, Scalar}; use crate::schema::{DataType, PrimitiveType, SchemaRef}; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; -use arrow_53::array::UInt64Array; use itertools::Itertools; use tracing::debug; diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index f5e887fad..55b22c982 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -352,13 +352,13 @@ impl From<&[u8]> for Scalar { impl From for Scalar { fn from(u: u64) -> Self { - Self::ULong(u.into()) + Self::ULong(u) } } impl From for Scalar { fn from(u: usize) -> Self { - Self::USize(u.into()) + Self::USize(u) } } From c8bcc2e466f8cf009f0ff972c589e5f3e38aa2a1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 30 Mar 2025 14:01:49 -0700 Subject: [PATCH 40/45] ignore doc test --- kernel/src/checkpoints/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 1b8ebf1b1..813eefc24 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -9,7 +9,7 @@ //! The API is designed with a builder pattern for configuring and creating checkpoint writers. //! //! # Example -//! ``` +//! ```ignore //! let path = "./tests/data/app-txn-no-checkpoint"; //! let engine = Arc::new(SyncEngine::new()); //! 
let table = Table::try_from_uri(path)?; From c4ba531fef129aaa508362925d0d8a3849d37c98 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:06:31 -0700 Subject: [PATCH 41/45] rename and update struct types --- acceptance/tests/other.rs | 4 +-- kernel/src/actions/schemas.rs | 9 ++++++ kernel/src/log_segment.rs | 55 +++++++++++++++++++++++++-------- kernel/src/log_segment/tests.rs | 22 ++++++------- kernel/src/snapshot.rs | 15 ++++----- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/acceptance/tests/other.rs b/acceptance/tests/other.rs index 5a89f23de..daea7ac77 100644 --- a/acceptance/tests/other.rs +++ b/acceptance/tests/other.rs @@ -3,7 +3,7 @@ /// Since each new `.rs` file in this directory results in increased build and link time, it is /// important to only add new files if absolutely necessary for code readability or test /// performance. -use delta_kernel::snapshot::CheckpointMetadata; +use delta_kernel::snapshot::LastCheckpointHint; #[test] fn test_checkpoint_serde() { @@ -11,7 +11,7 @@ fn test_checkpoint_serde() { "./tests/dat/out/reader_tests/generated/with_checkpoint/delta/_delta_log/_last_checkpoint", ) .unwrap(); - let cp: CheckpointMetadata = serde_json::from_reader(file).unwrap(); + let cp: LastCheckpointHint = serde_json::from_reader(file).unwrap(); assert_eq!(cp.version, 2) } diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index aa3b3e47b..4ecb1d3fb 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -8,6 +8,15 @@ pub(crate) trait ToSchema { fn to_schema() -> StructType; } +/// Implement ToSchema for StructType to enable its use within Option fields +/// in schema-derived structs. This follows the system pattern where schema types +/// implement ToSchema rather than directly implementing ToDataType. +impl ToSchema for StructType { + fn to_schema() -> StructType { + StructType::new(vec![]) + } +} + pub(crate) trait ToDataType { fn to_data_type() -> DataType; } diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index c146f9eca..cbbd632f4 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -8,7 +8,7 @@ use crate::actions::{ }; use crate::path::{LogPathFileType, ParsedLogPath}; use crate::schema::SchemaRef; -use crate::snapshot::CheckpointMetadata; +use crate::snapshot::LastCheckpointHint; use crate::utils::require; use crate::{ DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileSystemClient, @@ -109,7 +109,7 @@ impl LogSegment { /// parts. All these parts will have the same checkpoint version. /// /// The options for constructing a LogSegment for Snapshot are as follows: - /// - `checkpoint_hint`: a `CheckpointMetadata` to start the log segment from (e.g. from reading the `last_checkpoint` file). + /// - `checkpoint_hint`: a `LastCheckpointHint` to start the log segment from (e.g. from reading the `last_checkpoint` file). /// - `time_travel_version`: The version of the log that the Snapshot will be at. /// /// [`Snapshot`]: crate::snapshot::Snapshot @@ -117,7 +117,7 @@ impl LogSegment { pub(crate) fn for_snapshot( fs_client: &dyn FileSystemClient, log_root: Url, - checkpoint_hint: impl Into>, + checkpoint_hint: impl Into>, time_travel_version: impl Into>, ) -> DeltaResult { let time_travel_version = time_travel_version.into(); @@ -127,7 +127,12 @@ impl LogSegment { (Some(cp), None) => { list_log_files_with_checkpoint(&cp, fs_client, &log_root, None)? 
} - (Some(cp), Some(end_version)) if cp.version <= end_version => { + // If type conversion fails, we skip the checkpoint hint and list all log files. + // Else, we check if the checkpoint hint's version is less than or equal to the + // time travel version. + (Some(cp), Some(end_version)) + if i64::try_from(end_version).map_or(false, |v| cp.version <= v) => + { list_log_files_with_checkpoint(&cp, fs_client, &log_root, Some(end_version))? } _ => list_log_files_with_version(fs_client, &log_root, None, time_travel_version)?, @@ -535,15 +540,26 @@ fn group_checkpoint_parts(parts: Vec) -> HashMap, ) -> DeltaResult<(Vec, Vec)> { + // Safely convert checkpoint_metadata.version (i64) to u64 for comparisons + let checkpoint_metadata_version = match u64::try_from(checkpoint_metadata.version) { + Ok(version) => version, + Err(e) => { + return Err(Error::InvalidCheckpoint(format!( + "Invalid checkpoint version (negative value): {}", + e + ))); + } + }; + let (commit_files, checkpoint_parts) = list_log_files_with_version( fs_client, log_root, - Some(checkpoint_metadata.version), + Some(checkpoint_metadata_version), end_version, )?; @@ -553,18 +569,31 @@ fn list_log_files_with_checkpoint( "Had a _last_checkpoint hint but didn't find any checkpoints", )); }; - if latest_checkpoint.version != checkpoint_metadata.version { + if latest_checkpoint.version != checkpoint_metadata_version { warn!( "_last_checkpoint hint is out of date. _last_checkpoint version: {}. Using actual most recent: {}", checkpoint_metadata.version, latest_checkpoint.version ); - } else if checkpoint_parts.len() != checkpoint_metadata.parts.unwrap_or(1) { - return Err(Error::InvalidCheckpoint(format!( - "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}", - checkpoint_metadata.parts.unwrap_or(1), - checkpoint_parts.len() - ))); + } else { + // Convert checkpoint_metadata.parts(i64) to usize for comparisons + let expected_parts = match usize::try_from(checkpoint_metadata.parts.unwrap_or(1)) { + Ok(parts) => parts, + Err(e) => { + return Err(Error::InvalidCheckpoint(format!( + "Invalid number of checkpoint parts (negative or too large): {}", + e + ))); + } + }; + + if checkpoint_parts.len() != expected_parts { + return Err(Error::InvalidCheckpoint(format!( + "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}", + expected_parts, + checkpoint_parts.len() + ))); + } } Ok((commit_files, checkpoint_parts)) } diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index d00ad235c..4d3e055e1 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -22,7 +22,7 @@ use crate::path::ParsedLogPath; use crate::scan::test_utils::{ add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, }; -use crate::snapshot::CheckpointMetadata; +use crate::snapshot::LastCheckpointHint; use crate::utils::test_utils::{assert_batch_matches, Action}; use crate::{ DeltaResult, Engine, EngineData, Expression, ExpressionRef, FileMeta, FileSystemClient, @@ -81,10 +81,10 @@ fn delta_path_for_multipart_checkpoint(version: u64, part_num: u32, num_parts: u } // Utility method to build a log using a list of log paths and an optional checkpoint hint. The -// CheckpointMetadata is written to `_delta_log/_last_checkpoint`. +// LastCheckpointHint is written to `_delta_log/_last_checkpoint`. 
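The guards above lean on std's fallible integer conversions, so malformed (for example negative) values read from `_last_checkpoint` surface as `Error::InvalidCheckpoint`, or make the hint comparison fall back to a full listing, instead of silently wrapping. The underlying behavior, as a self-contained check:

```rust
fn main() {
    // A negative version or part count from a corrupt hint fails the conversion...
    assert!(u64::try_from(-1_i64).is_err());
    assert!(usize::try_from(-3_i64).is_err());
    // ...while well-formed values pass through unchanged.
    assert_eq!(u64::try_from(5_i64).unwrap(), 5);
    // And an end_version that does not fit in i64 fails the is_ok_and guard,
    // so the hint is skipped and all log files are listed.
    assert!(i64::try_from(u64::MAX).is_err());
}
```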
fn build_log_with_paths_and_checkpoint( paths: &[Path], - checkpoint_metadata: Option<&CheckpointMetadata>, + checkpoint_metadata: Option<&LastCheckpointHint>, ) -> (Box, Url) { let store = Arc::new(InMemory::new()); @@ -269,7 +269,7 @@ fn build_snapshot_with_uuid_checkpoint_json() { #[test] fn build_snapshot_with_correct_last_uuid_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(1), @@ -347,7 +347,7 @@ fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { #[test] fn build_snapshot_with_out_of_date_last_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, @@ -384,7 +384,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { } #[test] fn build_snapshot_with_correct_last_multipart_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(3), @@ -427,7 +427,7 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { #[test] fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(3), @@ -462,7 +462,7 @@ fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { } #[test] fn build_snapshot_with_bad_checkpoint_hint_fails() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: Some(1), @@ -536,7 +536,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo // When the _last_checkpoint is out of date and the most recent checkpoint is incomplete, the // Snapshot should be made of the most recent complete checkpoint and the commit files that // follow it. - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, @@ -625,7 +625,7 @@ fn build_snapshot_without_checkpoints() { #[test] fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, parts: None, @@ -665,7 +665,7 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { #[test] fn build_snapshot_with_start_checkpoint_and_time_travel_version() { - let checkpoint_metadata = CheckpointMetadata { + let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, parts: None, diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 1d91efeeb..b27693877 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -1,6 +1,7 @@ //! In-memory representation of snapshots of tables (snapshot is a table at given point in time, it //! has schema etc.) +use delta_kernel_derive::Schema; use serde::{Deserialize, Serialize}; use std::sync::Arc; use tracing::{debug, warn}; @@ -142,22 +143,22 @@ impl Snapshot { } } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, Schema)] #[serde(rename_all = "camelCase")] #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] -struct CheckpointMetadata { +struct LastCheckpointHint { /// The version of the table when the last checkpoint was made. 
#[allow(unreachable_pub)] // used by acceptance tests (TODO make an fn accessor?) - pub version: Version, + pub version: i64, // TODO: use Version type instead of i64 /// The number of actions that are stored in the checkpoint. pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. - pub(crate) parts: Option, + pub(crate) parts: Option, // TODO: use u64 instead /// The number of bytes of the checkpoint. - pub(crate) size_in_bytes: Option, + pub(crate) size_in_bytes: Option, // TODO: use u64 instead /// The number of AddFile actions in the checkpoint. - pub(crate) num_of_add_files: Option, + pub(crate) num_of_add_files: Option, // TODO: use u64 instead /// The schema of the checkpoint file. pub(crate) checkpoint_schema: Option, /// The checksum of the last checkpoint JSON. @@ -175,7 +176,7 @@ struct CheckpointMetadata { fn read_last_checkpoint( fs_client: &dyn FileSystemClient, log_root: &Url, -) -> DeltaResult> { +) -> DeltaResult> { let file_path = log_root.join(LAST_CHECKPOINT_FILE_NAME)?; match fs_client .read_files(vec![(file_path, None)]) From f935ad7e29936b73a3dc86892c182ed42e00e4c4 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:24:35 -0700 Subject: [PATCH 42/45] doc update --- kernel/src/snapshot.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index b27693877..daa54a5ae 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -154,7 +154,7 @@ struct LastCheckpointHint { /// The number of actions that are stored in the checkpoint. pub(crate) size: i64, /// The number of fragments if the last checkpoint was written in multiple parts. - pub(crate) parts: Option, // TODO: use u64 instead + pub(crate) parts: Option, // TODO: use usize instead /// The number of bytes of the checkpoint. pub(crate) size_in_bytes: Option, // TODO: use u64 instead /// The number of AddFile actions in the checkpoint. From 2a78848f06fa526893a8b914dc7f08bbda20399c Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 15:31:16 -0700 Subject: [PATCH 43/45] fix build and docs --- kernel/src/log_segment.rs | 2 +- kernel/src/snapshot.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index cbbd632f4..389ff9cf8 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -131,7 +131,7 @@ impl LogSegment { // Else, we check if the checkpoint hint's version is less than or equal to the // time travel version. (Some(cp), Some(end_version)) - if i64::try_from(end_version).map_or(false, |v| cp.version <= v) => + if i64::try_from(end_version).is_ok_and(|v| cp.version <= v) => { list_log_files_with_checkpoint(&cp, fs_client, &log_root, Some(end_version))? } diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index daa54a5ae..6d87901be 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -98,7 +98,7 @@ impl Snapshot { self.table_configuration().version() } - /// Table [`Schema`] at this `Snapshot`s version. + /// Table [`type@Schema`] at this `Snapshot`s version. 
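Returning to `LastCheckpointHint` above: it keeps its serde derives alongside the new `Schema` derive, so `_last_checkpoint` still deserializes straight into it. A test-style sketch of that round trip, meant to sit inside the kernel crate (serde_json as a dev-dependency is assumed; absent optional fields deserialize to `None`):

```rust
use crate::snapshot::LastCheckpointHint;

#[test]
fn last_checkpoint_hint_from_json() {
    // Field names arrive camelCased, matching the rename_all attribute above.
    let raw = r#"{"version":2,"size":10,"parts":1,"sizeInBytes":100,"numOfAddFiles":5}"#;
    let hint: LastCheckpointHint = serde_json::from_str(raw).unwrap();
    assert_eq!(hint.version, 2);
    assert_eq!(hint.parts, Some(1));
    assert_eq!(hint.size_in_bytes, Some(100));
    assert_eq!(hint.num_of_add_files, Some(5));
}
```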
pub fn schema(&self) -> SchemaRef { self.table_configuration.schema() } From 6231d8419eacc24c484889ad11bbf71d0e21eeb8 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 31 Mar 2025 16:34:44 -0700 Subject: [PATCH 44/45] revert primtiive additions --- ffi/src/expressions/kernel.rs | 13 --- ffi/src/schema.rs | 20 ----- kernel/src/actions/schemas.rs | 2 - kernel/src/checkpoints/mod.rs | 55 +++++------- kernel/src/engine/arrow_conversion.rs | 14 --- kernel/src/engine/arrow_expression/mod.rs | 28 ------ .../src/engine/parquet_row_group_skipping.rs | 89 ------------------- kernel/src/expressions/scalars.rs | 26 ------ kernel/src/schema/mod.rs | 10 --- 9 files changed, 21 insertions(+), 236 deletions(-) diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs index 5abe69f77..9914758d5 100644 --- a/ffi/src/expressions/kernel.rs +++ b/ffi/src/expressions/kernel.rs @@ -66,8 +66,6 @@ pub struct EngineExpressionVisitor { /// Visit a 64bit `long` belonging to the list identified by `sibling_list_id`. pub visit_literal_long: VisitLiteralFn, /// Visit a 64bit unsigned `long` belonging to the list identified by `sibling_list_id`. - pub visit_literal_ulong: VisitLiteralFn, - /// Visit a 32bit unsigned `integer` int belonging to the list identified by `sibling_list_id`. pub visit_literal_uint: VisitLiteralFn, /// Visit a 16bit `short` belonging to the list identified by `sibling_list_id`. pub visit_literal_short: VisitLiteralFn, @@ -296,17 +294,6 @@ fn visit_expression_internal( match scalar { Scalar::Integer(val) => call!(visitor, visit_literal_int, sibling_list_id, *val), Scalar::Long(val) => call!(visitor, visit_literal_long, sibling_list_id, *val), - Scalar::ULong(val) => call!(visitor, visit_literal_ulong, sibling_list_id, *val), // TODO: Fix typecast - Scalar::USize(val) => { - #[cfg(target_pointer_width = "32")] - { - call!(visitor, visit_literal_uint, sibling_list_id, *val as u64) - } - #[cfg(target_pointer_width = "64")] - { - call!(visitor, visit_literal_ulong, sibling_list_id, *val as u64) - } - } Scalar::Short(val) => call!(visitor, visit_literal_short, sibling_list_id, *val), Scalar::Byte(val) => call!(visitor, visit_literal_byte, sibling_list_id, *val), Scalar::Float(val) => call!(visitor, visit_literal_float, sibling_list_id, *val), diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index 0cd1ed423..a474c80c3 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -102,24 +102,6 @@ pub struct EngineSchemaVisitor { metadata: &CStringMap, ), - /// Visit a `ulong` belonging to the list identified by `sibling_list_id`. - pub visit_ulong: extern "C" fn( - data: *mut c_void, - sibling_list_id: usize, - name: KernelStringSlice, - is_nullable: bool, - metadata: &CStringMap, - ), - - /// Visit a `usize` belonging to the list identified by `sibling_list_id`. - pub visit_usize: extern "C" fn( - data: *mut c_void, - sibling_list_id: usize, - name: KernelStringSlice, - is_nullable: bool, - metadata: &CStringMap, - ), - /// Visit an `integer` belonging to the list identified by `sibling_list_id`. 
pub visit_integer: extern "C" fn( data: *mut c_void, @@ -326,8 +308,6 @@ fn visit_schema_impl(schema: &StructType, visitor: &mut EngineSchemaVisitor) -> } &DataType::STRING => call!(visit_string), &DataType::LONG => call!(visit_long), - &DataType::ULONG => call!(visit_ulong), - &DataType::USIZE => call!(visit_usize), &DataType::INTEGER => call!(visit_integer), &DataType::SHORT => call!(visit_short), &DataType::BYTE => call!(visit_byte), diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index a0ec4b5be..4ecb1d3fb 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -45,8 +45,6 @@ macro_rules! impl_to_data_type { impl_to_data_type!( (String, DataType::STRING), - (u64, DataType::ULONG), - (usize, DataType::USIZE), (i64, DataType::LONG), (i32, DataType::INTEGER), (i16, DataType::SHORT), diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs index 813eefc24..6429df3fa 100644 --- a/kernel/src/checkpoints/mod.rs +++ b/kernel/src/checkpoints/mod.rs @@ -35,9 +35,12 @@ use std::{ }; use url::Url; -use crate::actions::schemas::GetStructField; use crate::expressions::column_expr; use crate::schema::{SchemaRef, StructType}; +use crate::{ + actions::schemas::{GetStructField, ToSchema}, + snapshot::LastCheckpointHint, +}; use crate::{ actions::{ Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME, @@ -45,30 +48,12 @@ use crate::{ }, path::ParsedLogPath, snapshot::Snapshot, - DeltaResult, Engine, EngineData, Error, Expression, Version, + DeltaResult, Engine, EngineData, Error, Expression, }; pub mod log_replay; #[cfg(test)] mod tests; -/// Schema definition for the _last_checkpoint file -pub(crate) static CHECKPOINT_METADATA_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new(StructType::new(vec![ - ::get_struct_field("version"), - ::get_struct_field("size"), - Option::::get_struct_field("parts"), - Option::::get_struct_field("sizeInBytes"), - Option::::get_struct_field("numOfAddFiles"), - // Option::::get_struct_field("checkpoint_schema"), TODO: Schema - // Option::::get_struct_field("checksum"), TODO: Checksum - ])) -}); - -/// Get the expected schema for the _last_checkpoint file -pub fn get_checkpoint_metadata_schema() -> &'static SchemaRef { - &CHECKPOINT_METADATA_SCHEMA -} - /// Read schema definition for collecting checkpoint actions static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| { StructType::new([ @@ -117,10 +102,10 @@ pub struct CheckpointWriter { total_add_actions_counter: Arc, /// Version of the checkpoint - version: Version, + version: i64, /// Number of parts of the checkpoint - parts: usize, + parts: i64, /// Path to table's log log_root: Url, @@ -132,8 +117,8 @@ impl CheckpointWriter { single_file_checkpoint_data: Option, total_actions_counter: Arc, total_add_actions_counter: Arc, - version: Version, - parts: usize, + version: i64, + parts: i64, log_root: Url, ) -> Self { Self { @@ -205,7 +190,6 @@ impl CheckpointWriter { metadata.len() ))); } - // Create expression for transforming the metadata let last_checkpoint_exprs = [ Expression::literal(self.version), @@ -214,11 +198,14 @@ impl CheckpointWriter { column_expr!("sizeInBytes"), Expression::literal(self.total_add_actions_counter.load(Ordering::SeqCst)), ]; + let last_checkpoint_expr = Expression::struct_from(last_checkpoint_exprs); // Get schemas for transformation - let last_checkpoint_schema = get_checkpoint_metadata_schema(); + let last_checkpoint_schema = LastCheckpointHint::to_schema(); + 
println!("last_checkpoint_schema: {:?}", last_checkpoint_schema); let engine_metadata_schema = last_checkpoint_schema.project_as_struct(&["sizeInBytes"])?; + println!("engine_metadata_schema: {:?}", engine_metadata_schema); // Create the evaluator for the transformation let last_checkpoint_metadata_evaluator = engine.get_expression_handler().get_evaluator( @@ -318,7 +305,7 @@ impl CheckpointBuilder { Some(data), total_actions_counter, total_add_actions_counter, - self.snapshot.version(), + self.snapshot.version() as i64, 1, self.snapshot.log_segment().log_root.clone(), )) @@ -489,8 +476,8 @@ mod unit_tests { fn test_prepare_last_checkpoint_metadata() -> DeltaResult<()> { // Setup test data let size_in_bytes: i64 = 1024 * 1024; // 1MB - let version: Version = 10; - let parts: usize = 3; + let version = 10; + let parts = 3; let total_actions_counter = Arc::new(AtomicI64::new(100)); let total_add_actions_counter = Arc::new(AtomicI64::new(75)); @@ -519,7 +506,7 @@ mod unit_tests { // Verify the values match our expectations assert_eq!(json["version"], version); assert_eq!(json["size"], total_actions_counter.load(Ordering::Relaxed)); - assert_eq!(json["parts"], parts as i64); + assert_eq!(json["parts"], parts); assert_eq!(json["sizeInBytes"], size_in_bytes); assert_eq!( json["numOfAddFiles"], @@ -532,8 +519,8 @@ mod unit_tests { #[test] fn test_prepare_last_checkpoint_metadata_with_empty_batch() { // Setup test data - let version: Version = 10; - let parts: usize = 3; + let version = 10; + let parts = 3; let total_actions_counter = Arc::new(AtomicI64::new(100)); let total_add_actions_counter = Arc::new(AtomicI64::new(75)); @@ -581,8 +568,8 @@ mod unit_tests { #[test] fn test_prepare_last_checkpoint_metadata_with_multiple_rows() { // Setup test data - let version: Version = 10; - let parts: usize = 1; + let version = 10; + let parts = 1; let total_actions_counter = Arc::new(AtomicI64::new(50)); let total_add_actions_counter = Arc::new(AtomicI64::new(30)); diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index 6242d27bd..a425cd143 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -100,20 +100,6 @@ impl TryFrom<&DataType> for ArrowDataType { match p { PrimitiveType::String => Ok(ArrowDataType::Utf8), PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type - PrimitiveType::ULong => Ok(ArrowDataType::UInt64), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct arrow type to use. - PrimitiveType::USize => { - #[cfg(target_pointer_width = "32")] - { - Ok(ArrowDataType::UInt32) - } - - #[cfg(target_pointer_width = "64")] - { - Ok(ArrowDataType::UInt64) - } - } PrimitiveType::Integer => Ok(ArrowDataType::Int32), PrimitiveType::Short => Ok(ArrowDataType::Int16), PrimitiveType::Byte => Ok(ArrowDataType::Int8), diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 1ff1834b7..432844d21 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -45,20 +45,6 @@ impl Scalar { let arr: ArrayRef = match self { Integer(val) => Arc::new(Int32Array::from_value(*val, num_rows)), Long(val) => Arc::new(Int64Array::from_value(*val, num_rows)), - ULong(val) => Arc::new(UInt64Array::from_value(*val, num_rows)), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct array type to use. 
- USize(val) => { - #[cfg(target_pointer_width = "32")] - { - Arc::new(UInt32Array::from_value(*val as u32, num_rows)) - } - - #[cfg(target_pointer_width = "64")] - { - Arc::new(UInt64Array::from_value(*val as u64, num_rows)) - } - } Short(val) => Arc::new(Int16Array::from_value(*val, num_rows)), Byte(val) => Arc::new(Int8Array::from_value(*val, num_rows)), Float(val) => Arc::new(Float32Array::from_value(*val, num_rows)), @@ -107,20 +93,6 @@ impl Scalar { Null(DataType::SHORT) => Arc::new(Int16Array::new_null(num_rows)), Null(DataType::INTEGER) => Arc::new(Int32Array::new_null(num_rows)), Null(DataType::LONG) => Arc::new(Int64Array::new_null(num_rows)), - Null(DataType::ULONG) => Arc::new(UInt64Array::new_null(num_rows)), - // Since usize is platform dependent, we need to check the target_pointer_width - // to determine the correct array type to use. - Null(DataType::USIZE) => { - #[cfg(target_pointer_width = "32")] - { - Arc::new(UInt32Array::new_null(num_rows)) - } - - #[cfg(target_pointer_width = "64")] - { - Arc::new(UInt64Array::new_null(num_rows)) - } - } Null(DataType::FLOAT) => Arc::new(Float32Array::new_null(num_rows)), Null(DataType::DOUBLE) => Arc::new(Float64Array::new_null(num_rows)), Null(DataType::STRING) => Arc::new(StringArray::new_null(num_rows)), diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index c9d78fbdd..fbce2f913 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -105,50 +105,6 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.min_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), (Long, _) => return None, - (ULong, Statistics::Int64(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - // Handling ULong type with Int32 statistics - (ULong, Statistics::Int32(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - (ULong, _) => return None, - // Handling USize type on 64-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - // Handling USize type on 64-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int32(s)) => - // Attempt to convert value to usize, converting from u64 if needed - { - usize::try_from(*s.min_opt()? as u64).ok()?.into() - } - // Handling USize type on 32-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, ensuring it's cast to u32 first - { - usize::try_from(*s.min_opt()? as u32).ok()?.into() - } - - // Handling USize type on 32-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int32(s)) => - // Attempt to convert vvalue to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - (USize, _) => return None, (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.min_opt()? 
as i16).into(), @@ -191,51 +147,6 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { (Long, Statistics::Int64(s)) => s.max_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), (Long, _) => return None, - (ULong, Statistics::Int64(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - // Handling ULong type with Int32 statistics - (ULong, Statistics::Int32(s)) => - // Attempt to convert value to u64, return None if conversion fails - { - u64::try_from(*s.min_opt()?).ok()?.into() - } - - (ULong, _) => return None, - // Handling USize type on 64-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - // Handling USize type on 64-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "64")] - (USize, Statistics::Int32(s)) => - // Attempt to convert value to usize, converting from u64 if needed - { - usize::try_from(*s.min_opt()? as u64).ok()?.into() - } - // Handling USize type on 32-bit architecture with Int64 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int64(s)) => - // Attempt to convert value to usize, ensuring it's cast to u32 first - { - usize::try_from(*s.min_opt()? as u32).ok()?.into() - } - - // Handling USize type on 32-bit architecture with Int32 statistics - #[cfg(target_pointer_width = "32")] - (USize, Statistics::Int32(s)) => - // Attempt to convert vvalue to usize, return None if conversion fails - { - usize::try_from(*s.min_opt()?).ok()?.into() - } - (USize, _) => return None, - (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.max_opt()? 
diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs
index 55b22c982..90f5358a6 100644
--- a/kernel/src/expressions/scalars.rs
+++ b/kernel/src/expressions/scalars.rs
@@ -96,10 +96,6 @@ pub enum Scalar {
     Integer(i32),
     /// 64bit integer
     Long(i64),
-    // unsigned 64bit integer
-    ULong(u64),
-    // usize
-    USize(usize),
     /// 16bit integer
     Short(i16),
     /// 8bit integer
@@ -135,8 +131,6 @@ impl Scalar {
         match self {
             Self::Integer(_) => DataType::INTEGER,
             Self::Long(_) => DataType::LONG,
-            Self::ULong(_) => DataType::ULONG,
-            Self::USize(_) => DataType::USIZE,
             Self::Short(_) => DataType::SHORT,
             Self::Byte(_) => DataType::BYTE,
             Self::Float(_) => DataType::FLOAT,
@@ -175,8 +169,6 @@ impl Display for Scalar {
         match self {
             Self::Integer(i) => write!(f, "{}", i),
             Self::Long(i) => write!(f, "{}", i),
-            Self::ULong(i) => write!(f, "{}", i),
-            Self::USize(i) => write!(f, "{}", i),
             Self::Short(i) => write!(f, "{}", i),
             Self::Byte(i) => write!(f, "{}", i),
             Self::Float(fl) => write!(f, "{}", fl),
@@ -249,10 +241,6 @@ impl PartialOrd for Scalar {
             (Integer(_), _) => None,
             (Long(a), Long(b)) => a.partial_cmp(b),
             (Long(_), _) => None,
-            (ULong(a), ULong(b)) => a.partial_cmp(b),
-            (ULong(_), _) => None,
-            (USize(a), USize(b)) => a.partial_cmp(b),
-            (USize(_), _) => None,
             (Short(a), Short(b)) => a.partial_cmp(b),
             (Short(_), _) => None,
             (Byte(a), Byte(b)) => a.partial_cmp(b),
@@ -350,18 +338,6 @@ impl From<&[u8]> for Scalar {
     }
 }
 
-impl From<u64> for Scalar {
-    fn from(u: u64) -> Self {
-        Self::ULong(u)
-    }
-}
-
-impl From<usize> for Scalar {
-    fn from(u: usize) -> Self {
-        Self::USize(u)
-    }
-}
-
 // TODO: add more From impls
 
 impl PrimitiveType {
@@ -402,8 +378,6 @@ impl PrimitiveType {
             Short => self.parse_str_as_scalar(raw, Scalar::Short),
             Integer => self.parse_str_as_scalar(raw, Scalar::Integer),
             Long => self.parse_str_as_scalar(raw, Scalar::Long),
-            ULong => self.parse_str_as_scalar(raw, Scalar::ULong),
-            USize => self.parse_str_as_scalar(raw, Scalar::USize),
             Float => self.parse_str_as_scalar(raw, Scalar::Float),
             Double => self.parse_str_as_scalar(raw, Scalar::Double),
             Boolean => {
diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs
index ede497305..3a5648b57 100644
--- a/kernel/src/schema/mod.rs
+++ b/kernel/src/schema/mod.rs
@@ -493,12 +493,6 @@ pub enum PrimitiveType {
     String,
     /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807
     Long,
-    /// u64: 8-byte unsigned integer. Range: 0 to 18446744073709551615
-    ULong,
-    /// usize: Platform-dependent unsigned integer. Typically used for indexing and memory sizes.
-    /// - 64-bit platforms: Range 0 to 18_446_744_073_709_551_615
-    /// - 32-bit platforms: Range 0 to 4_294_967_295
-    USize,
     /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647
     Integer,
     /// i16: 2-byte signed integer numbers. Range: -32768 to 32767
@@ -565,8 +559,6 @@ impl Display for PrimitiveType {
         match self {
             PrimitiveType::String => write!(f, "string"),
             PrimitiveType::Long => write!(f, "long"),
-            PrimitiveType::ULong => write!(f, "ulong"),
-            PrimitiveType::USize => write!(f, "usize"),
             PrimitiveType::Integer => write!(f, "integer"),
             PrimitiveType::Short => write!(f, "short"),
             PrimitiveType::Byte => write!(f, "byte"),
@@ -632,8 +624,6 @@ impl From for DataType {
 impl DataType {
     pub const STRING: Self = DataType::Primitive(PrimitiveType::String);
     pub const LONG: Self = DataType::Primitive(PrimitiveType::Long);
-    pub const ULONG: Self = DataType::Primitive(PrimitiveType::ULong);
-    pub const USIZE: Self = DataType::Primitive(PrimitiveType::USize);
     pub const INTEGER: Self = DataType::Primitive(PrimitiveType::Integer);
     pub const SHORT: Self = DataType::Primitive(PrimitiveType::Short);
     pub const BYTE: Self = DataType::Primitive(PrimitiveType::Byte);

From 3f2c0f9ac5ea7a2d5d145b4e13a50fef3ccb1e11 Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Mon, 31 Mar 2025 20:04:43 -0700
Subject: [PATCH 45/45] add the CheckpointMetadata action batch for v2 checkpoints

---
 kernel/Cargo.toml               |   4 +-
 kernel/src/actions/mod.rs       |  13 +++
 kernel/src/checkpoints/mod.rs   |  84 +++++++++++++++++--
 kernel/src/checkpoints/tests.rs | 139 ++++++++++++++++++++++++++++++++
 kernel/src/lib.rs               |   2 +-
 5 files changed, 230 insertions(+), 12 deletions(-)

diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml
index 5bc607c2a..6aa1052df 100644
--- a/kernel/Cargo.toml
+++ b/kernel/Cargo.toml
@@ -49,7 +49,7 @@ thiserror = "1"
 # only for structured logging
 tracing = { version = "0.1", features = ["log"] }
 url = "2"
-uuid = "1.10.0"
+uuid = { version = "1.10.0", features = ["v4", "fast-rng"] }
 z85 = "3.0.5"
 
 # bring in our derive macros
@@ -118,8 +118,6 @@ default-engine-base = [
     "need_arrow",
     "object_store",
     "tokio",
-    "uuid/v4",
-    "uuid/fast-rng",
 ]
 
 # the default-engine use the reqwest crate with default features which uses native-tls. if you want
diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs
index e30d3033b..a0f9fdecd 100644
--- a/kernel/src/actions/mod.rs
+++ b/kernel/src/actions/mod.rs
@@ -544,6 +544,19 @@ pub(crate) struct SetTransaction {
     pub(crate) last_updated: Option<i64>,
 }
 
+/// The CheckpointMetadata action describes details about a checkpoint following the V2 specification.
+///
+/// [More info]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-metadata
+#[derive(Schema, Debug, PartialEq)]
+#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
+pub(crate) struct CheckpointMetadata {
+    /// The version of the V2 spec checkpoint.
+    pub(crate) version: i64,
+
+    /// Map containing any additional metadata about the V2 spec checkpoint.
+    pub(crate) tags: Option<HashMap<String, String>>,
+}
+
 /// The sidecar action references a sidecar file which provides some of the checkpoint's
 /// file actions. This action is only allowed in checkpoints following the V2 spec.
 ///
diff --git a/kernel/src/checkpoints/mod.rs b/kernel/src/checkpoints/mod.rs
index 6429df3fa..66b2aced3 100644
--- a/kernel/src/checkpoints/mod.rs
+++ b/kernel/src/checkpoints/mod.rs
@@ -35,12 +35,12 @@ use std::{
 };
 use url::Url;
 
-use crate::expressions::column_expr;
-use crate::schema::{SchemaRef, StructType};
 use crate::{
     actions::schemas::{GetStructField, ToSchema},
+    expressions::Scalar,
     snapshot::LastCheckpointHint,
 };
+use crate::{actions::CheckpointMetadata, expressions::column_expr};
 use crate::{
     actions::{
         Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, METADATA_NAME,
@@ -50,6 +50,10 @@ use crate::{
     snapshot::Snapshot,
     DeltaResult, Engine, EngineData, Error, Expression,
 };
+use crate::{
+    schema::{SchemaRef, StructType},
+    ExpressionHandlerExtension,
+};
 pub mod log_replay;
 #[cfg(test)]
 mod tests;
@@ -282,6 +286,13 @@ impl CheckpointBuilder {
             deleted_file_retention_timestamp,
         );
 
+        // Chain the result of create_checkpoint_metadata_batch to the checkpoint data
+        let chained = checkpoint_data.chain(create_checkpoint_metadata_batch(
+            self.snapshot.version() as i64,
+            engine,
+            v2_checkpoints_supported,
+        )?);
+
         // Generate checkpoint path based on builder configuration
         // Classic naming is required for V1 checkpoints and optional for V2 checkpoints
         let checkpoint_path = if self.with_classic_naming || !v2_checkpoints_supported {
@@ -296,13 +307,11 @@ impl CheckpointBuilder {
             )?
         };
 
-        let data = SingleFileCheckpointData {
-            data: Box::new(checkpoint_data),
-            path: checkpoint_path.location,
-        };
-
         Ok(CheckpointWriter::new(
-            Some(data),
+            Some(SingleFileCheckpointData {
+                data: Box::new(chained),
+                path: checkpoint_path.location,
+            }),
             total_actions_counter,
             total_add_actions_counter,
             self.snapshot.version() as i64,
@@ -378,6 +387,37 @@ fn deleted_file_retention_timestamp_with_time(
     Ok(now_ms - retention_ms)
 }
 
+/// Create a batch with a single row containing the [`CheckpointMetadata`] action
+/// for the V2 spec checkpoint.
+///
+/// This method calls the create_one method on the expression handler to create
+/// a single-row batch with the checkpoint metadata action. The method returns:
+/// - None if the checkpoint is not a V2 checkpoint
+/// - Some(Ok(batch)) if the batch was successfully created
+fn create_checkpoint_metadata_batch(
+    version: i64,
+    engine: &dyn Engine,
+    is_v2_checkpoint: bool,
+) -> DeltaResult<Option<DeltaResult<CheckpointData>>> {
+    if is_v2_checkpoint {
+        let values: &[Scalar] = &[version.into()];
+        let checkpoint_metadata_batch = engine.get_expression_handler().create_one(
+            // TODO: Include checkpointMetadata.tags when maps are supported
+            Arc::new(CheckpointMetadata::to_schema().project_as_struct(&["version"])?),
+            &values,
+        )?;
+
+        let result = CheckpointData {
+            data: checkpoint_metadata_batch,
+            selection_vector: vec![true],
+        };
+
+        Ok(Some(Ok(result)))
+    } else {
+        Ok(None)
+    }
+}
+
 #[cfg(test)]
 mod unit_tests {
     use super::*;
@@ -614,4 +654,32 @@ mod unit_tests {
             _ => panic!("Should have failed with error"),
         }
     }
+
+    #[test]
+    fn test_create_checkpoint_metadata() -> DeltaResult<()> {
+        let engine = ExprEngine::new();
+        let version = 10;
+        let is_v2_checkpoint = true;
+
+        // Call the method under test
+        let result = create_checkpoint_metadata_batch(version, &engine, is_v2_checkpoint)?;
+
+        assert!(result.is_some());
+        let checkpoint_data = result.unwrap()?;
+        assert!(checkpoint_data.selection_vector == vec![true]);
+
+        // Extract the batch and verify the version field
+        let arrow_data = ArrowEngineData::try_from_engine_data(checkpoint_data.data)?;
+        assert!(arrow_data.len() == 1);
+
+        // Verify the version field
+        let version_field = arrow_data
+            .record_batch()
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .expect("Failed to downcast to Int64Array");
+        assert_eq!(version_field.value(0), version);
+        Ok(())
+    }
 }
diff --git a/kernel/src/checkpoints/tests.rs b/kernel/src/checkpoints/tests.rs
index 6975e73bc..8d5e87092 100644
--- a/kernel/src/checkpoints/tests.rs
+++ b/kernel/src/checkpoints/tests.rs
@@ -117,6 +117,145 @@ fn test_checkpoint_latest_version_by_default() -> DeltaResult<()> {
     Ok(())
 }
 
+#[test]
+fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> {
+    let (store, _) = new_in_memory_store();
+    let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new()));
+
+    // 1st commit: adds `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![Action::Add(Add {
+            path: "fake_path_1".into(),
+            data_change: true,
+            ..Default::default()
+        })],
+        0,
+    )?;
+
+    // 2nd commit: adds `fake_path_2` & removes `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Add(Add {
+                path: "fake_path_2".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+            Action::Remove(Remove {
+                path: "fake_path_1".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+        ],
+        1,
+    )?;
+
+    // 3rd commit: metadata & protocol actions
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Metadata(Metadata {
+                id: "fake_path_1".into(),
+                schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(),
+                ..Default::default()
+            }),
+            Action::Protocol(Protocol::try_new(3, 7, Vec::<String>::new().into(), Vec::<String>::new().into())?),
+        ],
+        2,
+    )?;
+    let table_root = Url::parse("memory:///")?;
+    let table = Table::new(table_root);
+    let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?;
+    let checkpoint_data = checkpointer.get_checkpoint_info()?;
+    let mut data_iter = checkpoint_data.data;
+    assert_eq!(
+        checkpoint_data.path,
+        Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")?
+    );
+
+    // The first batch should be the metadata and protocol actions.
+    let checkpoint_data = data_iter.next().unwrap()?;
+
+    assert_eq!(checkpoint_data.selection_vector, [true, true]);
+
+    // The second batch should be the add action as the remove action is expired.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true, false]);
+
+    // The third batch should not be included as the selection vector does not
+    // contain any true values, as the add action is removed in a following commit.
+    assert!(data_iter.next().is_none());
+
+    Ok(())
+}
+
+#[test]
+fn test_uuid_v2_checkpoint() -> DeltaResult<()> {
+    let (store, _) = new_in_memory_store();
+    let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new()));
+
+    // 1st commit: adds `fake_path_2` & removes `fake_path_1`
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Add(Add {
+                path: "fake_path_2".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+            Action::Remove(Remove {
+                path: "fake_path_1".into(),
+                data_change: true,
+                ..Default::default()
+            }),
+        ],
+        1,
+    )?;
+
+    // 2nd commit: metadata & protocol actions
+    // Protocol action includes the v2Checkpoint reader/writer feature.
+    write_commit_to_store(
+        &store,
+        vec![
+            Action::Metadata(Metadata {
+                id: "fake_path_1".into(),
+                schema_string: "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}".to_string(),
+                ..Default::default()
+            }),
+            Action::Protocol(Protocol::try_new(3, 7, vec!["v2Checkpoint"].into(), vec!["v2Checkpoint"].into())?),
+        ],
+        2,
+    )?;
+    let table_root = Url::parse("memory:///")?;
+    let table = Table::new(table_root);
+    let mut checkpointer = table.checkpoint(&engine, None)?.build(&engine)?;
+    let checkpoint_data = checkpointer.get_checkpoint_info()?;
+    let mut data_iter = checkpoint_data.data;
+
+    // TODO: Assert that the checkpoint file path is UUID-based
+    let path = checkpoint_data.path;
+    let parts = path.as_str().split(".");
+    assert_eq!(parts.clone().count(), 4);
+
+    // The first batch should be the metadata and protocol actions.
+    let checkpoint_data = data_iter.next().unwrap()?;
+
+    assert_eq!(checkpoint_data.selection_vector, [true, true]);
+
+    // The second batch should be the add action as the remove action is expired.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true, false]);
+
+    // The third batch should be the CheckpointMetadata action.
+    let checkpoint_data = data_iter.next().unwrap()?;
+    assert_eq!(checkpoint_data.selection_vector, [true]);
+
+    assert!(data_iter.next().is_none());
+
+    Ok(())
+}
+
 /// Test that `checkpoint` works with a specific version parameter
 #[test]
 fn test_checkpoint_specific_version() -> DeltaResult<()> {
diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs
index dd99fda99..91979d41f 100644
--- a/kernel/src/lib.rs
+++ b/kernel/src/lib.rs
@@ -382,7 +382,7 @@ trait ExpressionHandlerExtension: ExpressionHandler {
 }
 
 // Auto-implement the extension trait for all ExpressionHandlers
-impl ExpressionHandlerExtension for T {}
+impl ExpressionHandlerExtension for T {}
 
 /// Provides file system related functionalities to Delta Kernel.
 ///
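
A note on the chaining pattern used in CheckpointBuilder::build in patch 45: because Option<T> implements IntoIterator, Iterator::chain can append either zero batches (V1 checkpoint) or exactly one trailing CheckpointMetadata batch (V2 checkpoint) to the stream of file-action batches. The sketch below only illustrates that pattern under simplified assumptions; Batch and make_metadata_batch are hypothetical stand-ins, not the kernel's CheckpointData or create_checkpoint_metadata_batch.

// Minimal sketch of chaining an optional trailing batch onto an iterator.
// `Batch` and `make_metadata_batch` are illustrative stand-ins only.
#[derive(Debug)]
struct Batch {
    selection_vector: Vec<bool>,
}

fn make_metadata_batch(version: i64, is_v2_checkpoint: bool) -> Option<Batch> {
    // For a V2 checkpoint, emit exactly one single-row batch (selection vector [true]);
    // for a V1 checkpoint, emit nothing. The real code writes `version` into the batch.
    is_v2_checkpoint.then(|| {
        let _ = version;
        Batch { selection_vector: vec![true] }
    })
}

fn main() {
    let file_action_batches = vec![
        Batch { selection_vector: vec![true, true] },
        Batch { selection_vector: vec![true, false] },
    ];

    // `Option<T>: IntoIterator` yields zero or one item, so `chain` conditionally
    // appends the checkpoint metadata batch after all file-action batches.
    let all_batches: Vec<Batch> = file_action_batches
        .into_iter()
        .chain(make_metadata_batch(2, true))
        .collect();

    assert_eq!(all_batches.len(), 3);
    assert_eq!(all_batches.last().unwrap().selection_vector, vec![true]);
    println!("{all_batches:?}");
}

Swapping the final argument of make_metadata_batch to false drops the trailing batch, which mirrors how a classic-named V1 checkpoint in the patch contains only the file-action and protocol/metadata batches.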