
feat: Snapshot::try_new_from() API #549


Merged · 32 commits · Apr 7, 2025
Changes from 3 commits
f8bc074
new Snapshot::new_from() API
zachschuermann Nov 27, 2024
3d37288
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Mar 18, 2025
5480711
incremental snapshot update, log segment, table config
zachschuermann Mar 18, 2025
2686fb3
incremental log segment
zachschuermann Mar 20, 2025
0daf1e9
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Mar 20, 2025
692c9d5
tests
zachschuermann Mar 21, 2025
f18e380
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Mar 21, 2025
8d3357b
refactor
zachschuermann Mar 21, 2025
3bf3d67
nits
zachschuermann Mar 21, 2025
453db1b
not pleased with these tests
zachschuermann Mar 25, 2025
ce31322
fix
zachschuermann Mar 25, 2025
f1578fb
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Mar 25, 2025
ee61b75
docs
zachschuermann Mar 25, 2025
fea0f76
incremental table config test
zachschuermann Mar 26, 2025
81f61ae
comment
zachschuermann Mar 26, 2025
7b0bd1c
fix to include old checkpoint parts
zachschuermann Mar 26, 2025
691b23a
Merge branch 'main' into snapshot-from-snapshot
zachschuermann Mar 26, 2025
27c4a95
Merge branch 'main' into snapshot-from-snapshot
zachschuermann Mar 27, 2025
66e44d2
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Mar 28, 2025
46ab944
do list_from last checkpoint version
zachschuermann Mar 28, 2025
5a582d6
quick nit
zachschuermann Mar 28, 2025
592667b
few nits
zachschuermann Apr 4, 2025
75b6178
fix
zachschuermann Apr 4, 2025
5cdde76
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Apr 4, 2025
a94ddef
clippy
zachschuermann Apr 4, 2025
e8e327d
redo the 'no new commits' check
zachschuermann Apr 7, 2025
d5bcb67
match instead of if chain
zachschuermann Apr 7, 2025
be88b88
nits and use tableconfig::try_new in try_new_from
zachschuermann Apr 7, 2025
d396953
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Apr 7, 2025
6e49bc2
fix merging main
zachschuermann Apr 7, 2025
032d72b
fix deref
zachschuermann Apr 7, 2025
cb06927
Merge remote-tracking branch 'upstream/main' into snapshot-from-snapshot
zachschuermann Apr 7, 2025
52 changes: 49 additions & 3 deletions kernel/src/log_segment.rs
@@ -146,6 +146,36 @@ impl LogSegment
)
}

pub(crate) fn for_versions(
fs_client: &dyn FileSystemClient,
log_root: Url,
start_version: Version,
end_version: impl Into<Option<Version>>,
) -> DeltaResult<Self> {
let end_version = end_version.into();
if let Some(end_version) = end_version {
if start_version > end_version {
return Err(Error::generic(
"Failed to build LogSegment: start_version cannot be greater than end_version",
));
}
}
let (mut ascending_commit_files, checkpoint_parts) =
list_log_files_with_version(fs_client, &log_root, Some(start_version), end_version)?;
Collaborator:

What happens if the table has not changed? I somehow doubt LogSegment::try_new would like the empty file listing that results?

Collaborator (author):

New EmptyLogSegment error that we can explicitly leverage (instead of having to change LogSegment semantics). I just dislike it being pub... see other comment above.


// Commit file versions must be greater than the most recent checkpoint version if it exists
if let Some(checkpoint_file) = checkpoint_parts.first() {
ascending_commit_files.retain(|log_path| checkpoint_file.version < log_path.version);
}

LogSegment::try_new(
ascending_commit_files,
checkpoint_parts,
log_root,
end_version,
)
}
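The version-bounds check and the checkpoint filtering in `for_versions` can be sketched in isolation. This is a simplified illustration with plain integer versions and with the file listing itself omitted; the function name and types are hypothetical, not the kernel's API:

```rust
// Simplified sketch of for_versions' validation and filtering (hypothetical
// plain types): reject an inverted version range, then drop commit versions
// at or below the newest checkpoint part, mirroring the `retain` above.
fn segment_commit_versions(
    mut commit_versions: Vec<u64>,
    checkpoint_version: Option<u64>,
    start_version: u64,
    end_version: Option<u64>,
) -> Result<Vec<u64>, String> {
    if let Some(end) = end_version {
        if start_version > end {
            return Err(
                "Failed to build LogSegment: start_version cannot be greater than end_version"
                    .to_string(),
            );
        }
    }
    // Commit file versions must be greater than the most recent checkpoint version.
    if let Some(cp) = checkpoint_version {
        commit_versions.retain(|v| cp < *v);
    }
    Ok(commit_versions)
}
```

With a checkpoint at version 4, commits 3 and 4 are already covered by the checkpoint, so only commit 5 survives the filter.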

/// Constructs a [`LogSegment`] to be used for `TableChanges`. For a TableChanges between versions
/// `start_version` and `end_version`: Its LogSegment is made of zero checkpoints and all commits
/// between versions `start_version` (inclusive) and `end_version` (inclusive). If no `end_version`
@@ -186,6 +216,7 @@ impl LogSegment
);
LogSegment::try_new(ascending_commit_files, vec![], log_root, end_version)
}

/// Read a stream of log data from this log segment.
///
/// The log files will be read from most recent to oldest.
@@ -360,8 +391,12 @@ impl LogSegment
)?))
}

// Get the most up-to-date Protocol and Metadata actions
pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> {
// Do a lightweight protocol+metadata log replay to find the latest Protocol and Metadata in
// the LogSegment
pub(crate) fn protocol_and_metadata(
&self,
engine: &dyn Engine,
) -> DeltaResult<(Option<Metadata>, Option<Protocol>)> {
let data_batches = self.replay_for_metadata(engine)?;
let (mut metadata_opt, mut protocol_opt) = (None, None);
for batch in data_batches {
@@ -377,7 +412,12 @@
break;
}
}
match (metadata_opt, protocol_opt) {
Ok((metadata_opt, protocol_opt))
}
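The newest-first, first-wins scan in `protocol_and_metadata` can be sketched on its own. This sketch uses hypothetical `&str` stand-ins for the Metadata and Protocol actions, not the kernel's real action types:

```rust
// Sketch of the first-wins replay scan: batches arrive newest-to-oldest, the
// first Metadata and the first Protocol seen are kept, and the walk stops
// early once both are found (hypothetical &str stand-ins for real actions).
fn latest_protocol_and_metadata(
    batches: &[(Option<&str>, Option<&str>)],
) -> (Option<String>, Option<String>) {
    let (mut metadata_opt, mut protocol_opt) = (None, None);
    for (metadata, protocol) in batches.iter().copied() {
        if metadata_opt.is_none() {
            metadata_opt = metadata.map(str::to_string);
        }
        if protocol_opt.is_none() {
            protocol_opt = protocol.map(str::to_string);
        }
        if metadata_opt.is_some() && protocol_opt.is_some() {
            break; // found both, stop the replay early
        }
    }
    (metadata_opt, protocol_opt)
}
```

Because newer batches are visited first, an action found earlier in the walk always shadows older occurrences, which is exactly why returning `(Option, Option)` lets `read_metadata` decide which missing action is an error.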

// Get the most up-to-date Protocol and Metadata actions
pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> {
match self.protocol_and_metadata(engine)? {
(Some(m), Some(p)) => Ok((m, p)),
(None, Some(_)) => Err(Error::MissingMetadata),
(Some(_), None) => Err(Error::MissingProtocol),
@@ -401,6 +441,11 @@ impl LogSegment
// read the same protocol and metadata schema for both commits and checkpoints
self.replay(engine, schema.clone(), schema, META_PREDICATE.clone())
}

/// Return whether or not the LogSegment contains a checkpoint.
pub(crate) fn has_checkpoint(&self) -> bool {
!self.checkpoint_parts.is_empty()
Collaborator:

nit

Suggested change
!self.checkpoint_parts.is_empty()
self.checkpoint_version.is_some()

Collaborator (@scovich, Mar 29, 2025):

(or, just do that at the one call site, instead of defining a helper at all)

Collaborator (author):

yea, realized it's probably not necessary, though I've kept the `!is_empty()` since it's a vec

}
}

/// Returns a fallible iterator of [`ParsedLogPath`] that are between the provided `start_version` (inclusive)
@@ -430,6 +475,7 @@ fn list_log_files(
Err(_) => true,
}))
}

/// List all commit and checkpoint files with versions above the provided `start_version` (inclusive).
/// If successful, this returns a tuple `(ascending_commit_files, checkpoint_parts)` of type
/// `(Vec<ParsedLogPath>, Vec<ParsedLogPath>)`. The commit files are guaranteed to be sorted in
94 changes: 93 additions & 1 deletion kernel/src/snapshot.rs
@@ -43,13 +43,21 @@ impl std::fmt::Debug for Snapshot
}

impl Snapshot {
fn new(log_segment: LogSegment, table_configuration: TableConfiguration) -> Self {
Self {
log_segment,
table_configuration,
}
}

/// Create a new [`Snapshot`] instance for the given version.
///
/// # Parameters
///
/// - `table_root`: url pointing at the table root (where `_delta_log` folder is located)
/// - `engine`: Implementation of [`Engine`] apis.
/// - `version`: target version of the [`Snapshot`]
/// - `version`: target version of the [`Snapshot`]. None will create a snapshot at the latest
/// version of the table.
pub fn try_new(
table_root: Url,
engine: &dyn Engine,
@@ -67,6 +75,70 @@ impl Snapshot
Self::try_new_from_log_segment(table_root, log_segment, engine)
}

/// Create a new [`Snapshot`] instance from an existing [`Snapshot`]. This is useful when you
/// already have a [`Snapshot`] lying around and want to do the minimal work to 'update' the
/// snapshot to a later version.
Collaborator:

Just to clarify, is this api only for versions later than the existing snapshot?

Collaborator (author):

yep, for now proposing that we allow an old snapshot but just return an entirely new snapshot (no incrementalization). Maybe `warn!` in that case? Or I suppose we could disallow that?

Collaborator:

Is there any valid scenario where a caller could legitimately pass a newer snapshot than the one they're asking for? I guess time travel? But if they know they're time traveling why would they pass a newer snapshot in the first place?

Either way, we should publicly document whether a too-new starting snapshot is an error or merely a useless hint, so callers don't have to wonder.

Collaborator (author):

I can't think of any optimization that's available (let alone useful) if the caller passes in a new snapshot as the hint.

If that's true, then the question is: do we prohibit this behavior or just let it degenerate to the usual try_new the client should have done anyways?

Collaborator:

I would vote for returning an error in that case. It's unlikely the engine meant to get into that situation, so let's let them know they are doing something wrong

Collaborator (author):

updated to be an error now! i agree :)

///
/// # Parameters
///
/// - `existing_snapshot`: reference to an existing [`Snapshot`]
/// - `engine`: Implementation of [`Engine`] apis.
/// - `version`: target version of the [`Snapshot`]. None will create a snapshot at the latest
/// version of the table.
pub fn new_from(
existing_snapshot: Arc<Snapshot>,
engine: &dyn Engine,
version: Option<Version>,
) -> DeltaResult<Arc<Self>> {
// simple heuristic for now:
// 1. if the new version < existing version, just return an entirely new snapshot
// 2. if the new version == existing version, just return the existing snapshot
// 3. list from existing snapshot version
// 4a. if new checkpoint is found: just create a new snapshot from that checkpoint (and
// commits after it)
// 4b. if no new checkpoint is found: do lightweight P+M replay on the latest commits
match version {
Some(v) if v < existing_snapshot.version() => {
Self::try_new(existing_snapshot.table_root().clone(), engine, version).map(Arc::new)
}
Some(v) if v == existing_snapshot.version() => Ok(existing_snapshot.clone()),
Collaborator:

Tiny nit: I'd put this one first?

Collaborator:

Actually, I wonder if a match is really that helpful here, especially given that LogSegment::for_versions needs to handle the no-change case?

let old_version = existing_snapshot.version();
if let Some(new_version) = version {
    if new_version == old_version {
        // Re-requesting the same version
        return Ok(existing_snapshot.clone());
    }
    if new_version < old_version {
        // Hint is too new, just create a new snapshot the normal way
        return Self::try_new(...).map(Arc::new);
    }
    }
}
    
// Check for new commits
let (mut new_ascending_commit_files, checkpoint_parts) =
    list_log_files_with_version(fs_client, &log_root, Some(start_version), end_version)?;

if new_ascending_commit_files.is_empty() {
    // No new commits, just return the same snapshot
    return Ok(existing_snapshot.clone());
}

if !checkpoint_parts.is_empty() {
    // We found a checkpoint, so just create a new snapshot the normal way
    return Self::try_new(...).map(Arc::new);
}    

// Append the new commits to the existing LogSegment
let checkpoint_parts = existing_snapshot.log_segment.checkpoint_parts.clone();
let mut ascending_commit_files = existing_snapshot.log_segment.ascending_commit_files.clone();
ascending_commit_files.extend(new_ascending_commit_files);
let new_log_segment = LogSegment::try_new(
    ascending_commit_files, 
    checkpoint_parts, 
    log_root,
    version,
);

Avoids the indirection and complexity of building a suffix log segment... but then we don't have an easy way to do the incremental P&M :(

Collaborator (author):

I played around with this some and refactored. critically, I'm still leveraging a LogSegment, but we have a new Error::EmptyLogSegment that we can specifically check for. I like the idea of (1) still using LogSegment and (2) having this error capture the empty case without having to modify semantics of LogSegment. BUT i dislike having to introduce a new pub Error variant. I didn't do the leg work to have a private error here - wanted to gather some feedback on overall approach first

new_version => {
debug!(
"new version: {new_version:?}, existing version: {}",
existing_snapshot.version()
);
let log_root = existing_snapshot.log_segment.log_root.clone();
let fs_client = engine.get_file_system_client();

// create a log segment just from existing_snapshot.version -> new_version
let log_segment = LogSegment::for_versions(
fs_client.as_ref(),
log_root,
existing_snapshot.version(),
new_version,
)?;

if log_segment.has_checkpoint() {
Self::try_new_from_log_segment(
existing_snapshot.table_root().clone(),
log_segment,
engine,
)
.map(Arc::new)
} else {
let (new_metadata, new_protocol) = log_segment.protocol_and_metadata(engine)?;
let table_configuration = TableConfiguration::new_from(
existing_snapshot.table_configuration(),
new_metadata,
new_protocol,
log_segment.end_version,
)?;
Ok(Arc::new(Snapshot::new(log_segment, table_configuration)))
}
}
}
}
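The four-step heuristic in the comment at the top of `new_from` can be boiled down to a standalone decision function. This sketch uses plain `u64` versions and a hypothetical enum; note it mirrors this revision, where an older requested version falls back to a full rebuild (the review thread later turns that case into an error):

```rust
// Standalone sketch of new_from's update heuristic (hypothetical enum and
// plain u64 versions; an older requested version falls back to a full
// rebuild in this revision rather than erroring).
#[derive(Debug, PartialEq)]
enum UpdatePlan {
    ReuseExisting,            // same version requested: return the hint as-is
    FullRebuild,              // requested version older than the hint
    RebuildFromNewCheckpoint, // listing found a newer checkpoint
    IncrementalReplay,        // no new checkpoint: lightweight P+M replay
}

fn plan_update(existing: u64, requested: Option<u64>, found_new_checkpoint: bool) -> UpdatePlan {
    match requested {
        Some(v) if v < existing => UpdatePlan::FullRebuild,
        Some(v) if v == existing => UpdatePlan::ReuseExisting,
        // None (latest) or a newer version: list from the existing version
        _ if found_new_checkpoint => UpdatePlan::RebuildFromNewCheckpoint,
        _ => UpdatePlan::IncrementalReplay,
    }
}
```

The guard order matters: the strictly-less and equal cases are decided purely from the versions, and only the remaining cases pay for a log listing to learn whether a new checkpoint exists.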

/// Create a new [`Snapshot`] instance.
pub(crate) fn try_new_from_log_segment(
location: Url,
@@ -241,6 +313,26 @@ mod tests {
assert_eq!(snapshot.schema(), &expected);
}

// TODO(zach)
#[test]
fn test_snapshot_new_from() {
let path =
std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap();
let url = url::Url::from_directory_path(path).unwrap();

let engine = SyncEngine::new();
let old_snapshot = Arc::new(Snapshot::try_new(url, &engine, Some(0)).unwrap());
let snapshot = Snapshot::new_from(old_snapshot, &engine, Some(0)).unwrap();

let expected =
Protocol::try_new(3, 7, Some(["deletionVectors"]), Some(["deletionVectors"])).unwrap();
assert_eq!(snapshot.protocol(), &expected);

let schema_string = r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#;
let expected: StructType = serde_json::from_str(schema_string).unwrap();
assert_eq!(snapshot.schema(), &expected);
}

#[test]
fn test_read_table_with_last_checkpoint() {
let path = std::fs::canonicalize(PathBuf::from(
11 changes: 7 additions & 4 deletions kernel/src/table_changes/mod.rs
@@ -111,7 +111,7 @@ static CDF_FIELDS: LazyLock<[StructField; 3]> = LazyLock::new(|| {
pub struct TableChanges {
pub(crate) log_segment: LogSegment,
table_root: Url,
end_snapshot: Snapshot,
end_snapshot: Arc<Snapshot>,
start_version: Version,
schema: Schema,
}
@@ -149,9 +149,12 @@ impl TableChanges {
// Both snapshots ensure that reading is supported at the start and end version using
// `ensure_read_supported`. Note that we must still verify that reading is
// supported for every protocol action in the CDF range.
let start_snapshot =
Snapshot::try_new(table_root.as_url().clone(), engine, Some(start_version))?;
let end_snapshot = Snapshot::try_new(table_root.as_url().clone(), engine, end_version)?;
let start_snapshot = Arc::new(Snapshot::try_new(
table_root.as_url().clone(),
engine,
Some(start_version),
)?);
let end_snapshot = Snapshot::new_from(start_snapshot.clone(), engine, end_version)?;

// Verify CDF is enabled at the beginning and end of the interval using
// [`check_cdf_table_properties`] to fail early. This also ensures that column mapping is
54 changes: 53 additions & 1 deletion kernel/src/table_configuration.rs
@@ -33,7 +33,7 @@ use crate::{DeltaResult, Error, Version};
/// `try_new` successfully returns `TableConfiguration`, it is also guaranteed that reading the
/// table is supported.
#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
#[derive(Debug)]
#[derive(Debug, Clone)]
pub(crate) struct TableConfiguration {
metadata: Metadata,
protocol: Protocol,
@@ -89,6 +89,58 @@ impl TableConfiguration {
})
}

pub(crate) fn new_from(
table_configuration: &Self,
new_metadata: Option<Metadata>,
new_protocol: Option<Protocol>,
new_version: Version,
) -> DeltaResult<Self> {
// simplest case: no new P/M, just return the existing table configuration with new version
if new_metadata.is_none() && new_protocol.is_none() {
return Ok(Self {
version: new_version,
..(*table_configuration).clone()
});
}

// if there's new metadata: have to parse schema, table properties
let (metadata, schema, table_properties) = match new_metadata {
Some(metadata) => {
let schema = Arc::new(metadata.parse_schema()?);
let table_properties = metadata.parse_table_properties();
(metadata, schema, table_properties)
}
None => (
table_configuration.metadata.clone(),
table_configuration.schema.clone(),
table_configuration.table_properties.clone(),
),
};

// if there's new protocol: have to ensure read supported
let protocol = match new_protocol {
Some(protocol) => {
protocol.ensure_read_supported()?;
Collaborator:

Given that we're "supporting" some table features (like check constraints) only if not actually used... I think we also need to ensure_read_supported if metadata changes?

Collaborator (author):

Actually our ensure_read_supported only depends on the protocol right now it looks like. We have TableConfiguration.ensure_write_supported which is a function of Protocol and Metadata but we don't yet have that for 'read supported' - I wonder if we should go ahead and introduce that abstraction and for now just pass through to protocol.ensure_read_supported?

Collaborator:

Oh, because column constraints are only a writer feature... do we not have any reader-writer features whose validity checks depend on metadata? Seems like column mapping needs to validate the schema annotations for example?

Collaborator (author):

not that I can tell: even for column mapping we just check that the feature is enabled to say 'reads are supported'; I think if there are incorrect schema annotations it would fail downstream.

Collaborator:

I'm pretty sure that validation happens in the TableConfiguration constructor? At least, that's where we originally planned to put it.

Collaborator (author):

yep you're right - my mistake, lol i already included that in the new code (just forgot about it oops)

in both TableConfiguration::try_new and try_new_from (new code) we do a protocol.ensure_read_supported and a validate_schema_column_mapping - I wonder if we could do better here by modifying the constructor so that we can (1) have someone upstream do the parsing leg work and (2) leverage the constructor directly in try_new_from?

protocol
}
None => table_configuration.protocol.clone(),
};

// if either change, have to validate column mapping mode
let column_mapping_mode = column_mapping_mode(&protocol, &table_properties);
validate_schema_column_mapping(&schema, column_mapping_mode)?;

Ok(Self {
schema,
metadata,
protocol,
table_properties,
column_mapping_mode,
table_root: table_configuration.table_root.clone(),
version: new_version,
})
}
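The fast path at the top of `TableConfiguration::new_from` relies on `Clone` (newly derived in this diff) plus struct update syntax. The pattern can be shown with a toy type; the fields here are hypothetical stand-ins, not the real `TableConfiguration`:

```rust
// Toy illustration of the "no new P/M" fast path: clone the old configuration
// and bump only the version via struct update syntax (hypothetical fields).
#[derive(Clone, Debug, PartialEq)]
struct ToyConfig {
    version: u64,
    table_name: String,
}

impl ToyConfig {
    fn new_from(old: &Self, new_version: u64) -> Self {
        Self {
            version: new_version,
            ..old.clone()
        }
    }
}
```

This is why the `#[derive(Debug, Clone)]` change above is load-bearing: `..old.clone()` needs an owned value to move the remaining fields from, leaving the original configuration untouched.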

/// The [`Metadata`] for this table at this version.
#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
pub(crate) fn metadata(&self) -> &Metadata {