fix: incomplete multi-part checkpoint handling when no hint is provided #641

Open · wants to merge 6 commits into base: main
Changes from 3 commits
84 changes: 62 additions & 22 deletions kernel/src/log_segment.rs
@@ -2,15 +2,14 @@
//! files.

use crate::actions::{get_log_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME};
use crate::path::ParsedLogPath;
use crate::path::{LogPathFileType, ParsedLogPath};
use crate::schema::SchemaRef;
use crate::snapshot::CheckpointMetadata;
use crate::utils::require;
use crate::{
DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileSystemClient, Version,
};
use itertools::Itertools;
use std::cmp::Ordering;
use std::convert::identity;
use std::sync::{Arc, LazyLock};
use tracing::warn;
@@ -313,28 +312,33 @@ fn list_log_files_with_version(
let mut checkpoint_parts = vec![];
let mut max_checkpoint_version = start_version;

for parsed_path in list_log_files(fs_client, log_root, start_version, end_version)? {
let parsed_path = parsed_path?;
if parsed_path.is_commit() {
commit_files.push(parsed_path);
} else if parsed_path.is_checkpoint() {
let path_version = parsed_path.version;
match max_checkpoint_version {
None => {
checkpoint_parts.push(parsed_path);
max_checkpoint_version = Some(path_version);
}
Some(checkpoint_version) => match path_version.cmp(&checkpoint_version) {
Ordering::Greater => {
max_checkpoint_version = Some(path_version);
checkpoint_parts.clear();
checkpoint_parts.push(parsed_path);
}
Ordering::Equal => checkpoint_parts.push(parsed_path),
Ordering::Less => {}
},
let log_files = list_log_files(fs_client, log_root, start_version, end_version)?;

for (version, files) in &log_files
.filter_map(|res| match res {
Ok(path) => Some(path),
Err(e) => {
warn!("Error processing path: {:?}", e);
Collaborator Author:

skipping ParsedLogPath::try_from() errors here as we were already filtering them out

Collaborator:

Hmm, I think it's dangerous to skip the errors here. It's best to try to return the error somehow. Does it work if you do something like chunk_by(|path| path.map(|x| x.version))?

The proposed approach tries chunking by DeltaResult<Version> instead of chunking by Version. The hope is to return the Err if we encounter it.

Collaborator Author:

The chunk_by function requires that the keys it uses for grouping implement the PartialEq trait so it can compare them, but the Error in DeltaResult does not implement that trait, so it doesn't work...
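
For illustration (not part of the PR), a minimal sketch of the constraint: chunk_by compares adjacent keys with PartialEq to decide group boundaries, so the key type must implement it. A plain Version (u64) key satisfies the bound; a DeltaResult<Version> key does not, because Error has no PartialEq impl.

use itertools::Itertools;

// Counts how many consecutive entries share each version; `chunk_by` accepts
// the u64 key because u64 implements PartialEq.
fn count_files_per_version(versions: &[u64]) -> Vec<(u64, usize)> {
    let chunks = versions.iter().copied().chunk_by(|v| *v);
    let mut counts = Vec::new();
    for (version, group) in &chunks {
        counts.push((version, group.count()));
    }
    counts
}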

Collaborator Author:

We could use try_collect to handle errors before grouping.

let log_files = list_log_files(fs_client, log_root, start_version, end_version)?;

let log_files: Vec<ParsedLogPath> = log_files.try_collect()?;

for (version, files) in &log_files.into_iter().chunk_by(|path| path.version)

Collaborator (@OussamaSaoudi, Jan 14, 2025):

This was really tricky and I got nerdsniped thinking about the iterator stuff lol. But look into process_results.

I think I might've adapted your code right, but double check:

    let mut checkpoint_parts = vec![];
    let mut max_checkpoint_version = start_version;
    let mut commit_files = Vec::with_capacity(10);
    process_results(log_files, |iter| {
        let log_files = iter.chunk_by(move |x| x.version);

        for (version, files) in &log_files {
            let mut new_checkpoint_parts = vec![];

            for file in files {
                if file.is_commit() {
                    commit_files.push(file);
                } else if file.is_checkpoint() {
                    new_checkpoint_parts.push(file);
                }
            }
            if validate_checkpoint_parts(version, &new_checkpoint_parts) {
                max_checkpoint_version = Some(version);
                checkpoint_parts = new_checkpoint_parts;
            }
        }
    })?;
    Ok((commit_files, checkpoint_parts))

Collaborator:

Note: With ordered listing, checkpoint files always come before commit files. So in theory, we know whether we have a complete checkpoint by the time we encounter the commit file of a given version. Not sure if that allows simpler code or not tho.

Collaborator:

I would use try_collect and propagate the errors.

Collaborator:

Definitely agreed that we want to propagate errors. I wanted to avoid try_collect on all the log files because there could be a lot of them. process_results + chunk_by should only have 1 commit at a time in memory.

Collaborator Author:

nice find with process_results, it works nicely!

Collaborator Author (@sebastiantia, Jan 14, 2025):

clearing the commit files when encountering complete checkpoints makes sense, and thank you for the helpful context @scovich

None
}
})
.chunk_by(|path| path.version)
{
let mut new_checkpoint_parts = vec![];

for file in files {
if file.is_commit() {
commit_files.push(file);
} else if file.is_checkpoint() {
new_checkpoint_parts.push(file);
}
}
if validate_checkpoint_parts(version, &new_checkpoint_parts)
&& (max_checkpoint_version.is_none() || Some(version) >= max_checkpoint_version)
Collaborator:

I wonder if we can just validate. iirc the listing should be in order. So the latest checkpoint should always be greater.

Collaborator Author (@sebastiantia, Jan 14, 2025):

Hmm right, looks like we were assuming that to begin with as we returned commit files in the order given from list_log_files, and soon after require that they are ascending in version # here...

Collaborator Author:

This gets rid of max_checkpoint_version entirely :)
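
A minimal sketch (adapting the process_results snippet above; not the merged code) of what the loop looks like once the ordered-listing guarantee is relied on: versions arrive in ascending order, so any later complete checkpoint simply replaces the earlier candidate and no max_checkpoint_version bookkeeping is needed.

process_results(log_files, |iter| {
    let log_files = iter.chunk_by(|file| file.version);
    for (version, files) in &log_files {
        let mut new_checkpoint_parts = vec![];
        for file in files {
            if file.is_commit() {
                commit_files.push(file);
            } else if file.is_checkpoint() {
                new_checkpoint_parts.push(file);
            }
        }
        // Listing is ordered, so a complete checkpoint found here is the
        // newest one seen so far and can overwrite the previous candidate.
        if validate_checkpoint_parts(version, &new_checkpoint_parts) {
            checkpoint_parts = new_checkpoint_parts;
        }
    }
})?;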

{
max_checkpoint_version = Some(version);
checkpoint_parts = new_checkpoint_parts;
}
}

Ok((commit_files, checkpoint_parts))
@@ -377,3 +381,39 @@ fn list_log_files_with_checkpoint(
}
Ok((commit_files, checkpoint_parts))
}

/// Validates that all the checkpoint parts belong to the same checkpoint version and that all parts
/// are present. Returns `true` if we have a complete checkpoint, `false` otherwise.
fn validate_checkpoint_parts(version: u64, checkpoint_parts: &Vec<ParsedLogPath>) -> bool {
if checkpoint_parts.is_empty() {
return false;
}

match checkpoint_parts.last().map(|file| &file.file_type) {
Some(LogPathFileType::MultiPartCheckpoint { num_parts, .. }) => {
if *num_parts as usize != checkpoint_parts.len() {
Collaborator:

I think it may be worth checking that:

  1. all the checkpoint parts are indeed of type MultiPartCheckpoint
  2. that the set of multi-part checkpoints has parts 0..n.

@zachschuermann what do you think?

Collaborator (@scovich, Jan 14, 2025):

nit: it's actually 1..=n. And yes, we should check. Also have to be careful because technically there could be two incomplete checkpoints with different num_parts for the same version. Also, we MUST accept at most one checkpoint -- even if multiple complete checkpoints are available -- so this function needs to filter, not just check.

Unfortunately, the poorly-chosen naming convention for multi-part checkpoint files means they interleave:

00000000000000000010.checkpoint.0000000001.0000000003.parquet
00000000000000000010.checkpoint.0000000001.0000000004.parquet
00000000000000000010.checkpoint.0000000002.0000000003.parquet
00000000000000000010.checkpoint.0000000002.0000000004.parquet
00000000000000000010.checkpoint.0000000003.0000000003.parquet
00000000000000000010.checkpoint.0000000003.0000000004.parquet
00000000000000000010.checkpoint.0000000004.0000000004.parquet

... which makes it a lot harder to identify the files of a given checkpoint and also means we can't just return a subslice in case there were multiple checkpoints to choose from.

We'd probably need to build a hash map keyed by number of parts:

use std::collections::HashMap;

let mut checkpoints = HashMap::new();
for part_file in checkpoint_parts {
    use LogPathFileType::*;
    match &part_file.file_type {
        SinglePartCheckpoint
            | UuidCheckpoint(_)
            | MultiPartCheckpoint { part_num: 1, num_parts: 1 } =>
        {
            // All single-file checkpoints are equivalent, just keep one
            checkpoints.insert(1, vec![part_file]);
        }
        MultiPartCheckpoint { part_num: 1, num_parts } => {
            // Start a new multi-part checkpoint with at least 2 parts
            checkpoints.insert(*num_parts, vec![part_file]);
        }
        MultiPartCheckpoint { part_num, num_parts } => {
            // Continue a multi-part checkpoint with at least 2 parts
            if let Some(part_files) = checkpoints.get_mut(num_parts) {
                if *part_num as usize == 1 + part_files.len() {
                    // Safe to append because all previous parts exist
                    part_files.push(part_file);
                }
            }
        }
        Commit | CompactedCommit { .. } | Unknown => {} // invalid file type => do nothing
    }
}
checkpoints
    .into_iter()
    .find(|(num_parts, part_files)| part_files.len() == *num_parts as usize)
    .map_or(vec![], |(_, part_files)| part_files)

Collaborator:

this reminds me to use match statements to their full power in the future. Thx for the example Ryan!

Collaborator Author:

ah yes, did not consider the multiple incomplete checkpoints. I'll introduce tests to cover some of these scenarios. And thanks a lot for the example!

warn!(
"Found a multi-part checkpoint at version {}. Found {} parts, expected {}",
version,
checkpoint_parts.len(),
num_parts
);
return false;
}
}
Some(LogPathFileType::SinglePartCheckpoint) => {
if checkpoint_parts.len() != 1 {
warn!(
"Found a single-part checkpoint at version {}. Found {} parts",
version,
checkpoint_parts.len()
);
return false;
}
}
// TODO: Include UuidCheckpoint once we actually support v2 checkpoints
_ => {}
Collaborator:

I think UuidCheckpoint should return false, since we can't read that checkpoint. In general, beware catchall cases in match statements.

I also wonder if we should panic/error if we ever get a commit file here, since that should not be happening.

Collaborator Author:

Right, good catch. I think returning an Error in the catchall case would be a good idea as we really should not get anything other than LogPathFileType::SinglePartCheckpoint or LogPathFileType::MultiPartCheckpoint here.

Collaborator:

I'd say handle the Uuid case separately from the catchall. Leave a comment that says this case will be supported in CheckpointV2.

Collaborator:

+10 avoid catchall in match statements. Better to enumerate the known-invalid cases, so that when a new case shows up the compiler forces us to categorize it.
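
As a rough sketch of that suggestion (helper name is hypothetical; variant names are taken from the review comment above, not from the merged implementation), enumerating the variants instead of using a catchall could look like:

use crate::path::LogPathFileType;
use tracing::warn;

// Only single- and multi-part checkpoints are readable today; UUID-named (v2)
// checkpoints are rejected until supported, and commit-like or unknown file
// types should never reach this point.
fn is_supported_checkpoint_file_type(file_type: &LogPathFileType, version: u64) -> bool {
    use LogPathFileType::*;
    match file_type {
        SinglePartCheckpoint | MultiPartCheckpoint { .. } => true,
        // TODO: accept once v2 (UUID-named) checkpoints are supported
        UuidCheckpoint(_) => false,
        Commit | CompactedCommit { .. } | Unknown => {
            warn!(
                "Unexpected file type among checkpoint parts at version {}",
                version
            );
            false
        }
    }
}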

}

true
}
47 changes: 44 additions & 3 deletions kernel/src/log_segment/tests.rs
@@ -257,11 +257,8 @@ fn build_snapshot_with_bad_checkpoint_hint_fails() {
assert!(log_segment.is_err())
}

#[ignore]
#[test]
fn build_snapshot_with_missing_checkpoint_part_no_hint() {
// TODO: Handle checkpoints correctly so that this test passes: https://github.com/delta-io/delta-kernel-rs/issues/497

// Part 2 of 3 is missing from checkpoint 5. The Snapshot should be made of checkpoint
// number 3 and commit files 4 to 7.
let (client, log_root) = build_log_with_paths_and_checkpoint(
@@ -296,6 +293,50 @@ fn build_snapshot_with_missing_checkpoint_part_no_hint() {
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpoint() {
// When the _last_checkpoint is out of date and the most recent checkpoint is incomplete, the
// Snapshot should be made of the most recent complete checkpoint and the commit files that
// follow it.
let checkpoint_metadata = CheckpointMetadata {
version: 3,
size: 10,
parts: None,
size_in_bytes: None,
num_of_add_files: None,
checkpoint_schema: None,
checksum: None,
};

let (client, log_root) = build_log_with_paths_and_checkpoint(
&[
delta_path_for_version(0, "json"),
delta_path_for_version(1, "checkpoint.parquet"),
delta_path_for_version(2, "json"),
delta_path_for_version(3, "checkpoint.parquet"),
delta_path_for_version(4, "json"),
delta_path_for_multipart_checkpoint(5, 1, 3),
// Part 2 is missing!
delta_path_for_multipart_checkpoint(5, 3, 3),
delta_path_for_version(5, "json"),
delta_path_for_version(6, "json"),
delta_path_for_version(7, "json"),
],
Some(&checkpoint_metadata),
);

let log_segment =
LogSegment::for_snapshot(client.as_ref(), log_root, checkpoint_metadata, None).unwrap();
let commit_files = log_segment.ascending_commit_files;
let checkpoint_parts = log_segment.checkpoint_parts;

assert_eq!(checkpoint_parts.len(), 1);

let versions = commit_files.into_iter().map(|x| x.version).collect_vec();
let expected_versions = vec![4, 5, 6, 7];
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_without_checkpoints() {
let (client, log_root) = build_log_with_paths_and_checkpoint(