1
- use error:: LogHistoryError ;
2
- use error:: TimestampOutOfRangeError ;
1
+ //! This module defines the [`LogHistoryManager`], which can be used to perform timestamp queries
2
+ //! over the Delta Log, translating from timestamps to Delta versions.
3
+
4
+ use error:: { LogHistoryError , TimestampOutOfRangeError } ;
3
5
use search:: { binary_search_by_key_with_bounds, Bound , SearchError } ;
6
+ use std:: cell:: RefCell ;
4
7
use std:: cmp:: Ordering ;
8
+ use std:: collections:: HashMap ;
5
9
use std:: fmt:: Debug ;
6
10
use std:: sync:: Arc ;
11
+ use url:: Url ;
7
12
8
13
use crate :: log_segment:: LogSegment ;
9
14
use crate :: path:: ParsedLogPath ;
@@ -33,8 +38,8 @@ type Timestamp = i64;
33
38
///
34
39
/// Use this manager to:
35
40
/// - Convert timestamps or timestamp ranges into Delta versions or version ranges
36
- /// - Perform time travel queries using [ `Table::snapshot`]
37
- /// - Execute timestamp-based change data feed queries using [ `Table::table_changes`]
41
+ /// - Perform time travel queries using `Table::snapshot`
42
+ /// - Execute timestamp-based change data feed queries using `Table::table_changes`
38
43
///
39
44
/// The [`LogHistoryManager`] works with tables regardless of whether they have In-Commit
40
45
/// Timestamps enabled.
@@ -44,14 +49,13 @@ type Timestamp = i64;
44
49
/// Once created, the [`LogHistoryManager`] does not automatically update with newer versions
45
50
/// of the table. All timestamp queries are limited to the state captured in the [`Snapshot`]
46
51
/// provided during construction.
47
- #[ allow( unused) ]
48
52
#[ derive( Debug ) ]
49
- pub ( crate ) struct LogHistoryManager {
53
+ pub struct LogHistoryManager {
50
54
log_segment : LogSegment ,
51
55
snapshot : Arc < Snapshot > ,
56
+ commit_to_timestamp_cache : RefCell < HashMap < Url , Timestamp > > ,
52
57
}
53
58
54
- #[ allow( unused) ]
55
59
#[ derive( Debug ) ]
56
60
enum TimestampSearchBounds {
57
61
ExactMatch ( Version ) ,
@@ -97,25 +101,41 @@ impl LogHistoryManager {
97
101
Ok ( Self {
98
102
log_segment,
99
103
snapshot,
104
+ commit_to_timestamp_cache : Default :: default ( ) ,
100
105
} )
101
106
}
107
+ fn update_cache_with_timestamp ( & self , commit_file : & ParsedLogPath , value : Timestamp ) {
108
+ self . commit_to_timestamp_cache
109
+ . borrow_mut ( )
110
+ . insert ( commit_file. location . location . clone ( ) , value) ;
111
+ }
112
+ fn get_cached_timestamp ( & self , commit_file : & ParsedLogPath ) -> Option < Timestamp > {
113
+ self . commit_to_timestamp_cache
114
+ . borrow ( )
115
+ . get ( & commit_file. location . location )
116
+ . copied ( )
117
+ }
102
118
103
119
/// Gets the timestamp for the `commit_file`. If `read_ict` is false, this returns the file's
104
120
/// modification timestamp. If `read_ict` is true, this reads the file's In-commit timestamp.
105
- #[ allow( unused) ]
106
121
fn commit_file_to_timestamp (
107
122
& self ,
108
123
engine : & dyn Engine ,
109
124
commit_file : & ParsedLogPath ,
110
125
read_ict : bool ,
111
126
) -> Result < Timestamp , LogHistoryError > {
127
+ if let Some ( cached) = self . get_cached_timestamp ( commit_file) {
128
+ return Ok ( cached) ;
129
+ }
112
130
let commit_timestamp = if read_ict {
113
131
Self :: read_in_commit_timestamp ( engine, commit_file) ?
114
132
} else {
115
133
// By default, the timestamp of a commit is its modification time
116
134
commit_file. location . last_modified
117
135
} ;
118
136
137
+ self . update_cache_with_timestamp ( commit_file, commit_timestamp) ;
138
+
119
139
Ok ( commit_timestamp)
120
140
}
121
141
@@ -127,7 +147,6 @@ impl LogHistoryManager {
127
147
/// This returns a [`LogHistoryError::InCommitTimestampNotFoundError`] if the in-commit timestamp
128
148
/// is not present in the commit file, or if the CommitInfo is not the first action in the
129
149
/// commit.
130
- #[ allow( unused) ]
131
150
fn read_in_commit_timestamp (
132
151
engine : & dyn Engine ,
133
152
commit_file : & ParsedLogPath ,
@@ -165,11 +184,170 @@ impl LogHistoryManager {
165
184
visitor. in_commit_timestamp . ok_or_else ( not_found)
166
185
}
167
186
187
+ /// Gets the latest version that occurs before or at the given `timestamp`.
188
+ ///
189
+ /// This finds the version whose timestamp is less than or equal to `timestamp`.
190
+ /// If no such version exists, returns [`LogHistoryError::TimestampOutOfRange`].
191
+ ///
192
+ /// # Examples
193
+ /// ```rust
194
+ /// # use delta_kernel::history_manager::error::LogHistoryError;
195
+ /// # use delta_kernel::engine::sync::SyncEngine;
196
+ /// # use delta_kernel::Table;
197
+ /// # use std::sync::Arc;
198
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
199
+ /// # let engine = Arc::new(SyncEngine::new());
200
+ /// let table = Table::try_from_uri(path)?;
201
+ /// let manager = table.history_manager(engine.as_ref(), None)?;
202
+ ///
203
+ /// // Get the latest version as of January 1, 2023
204
+ /// let timestamp = 1672531200000; // Milliseconds since epoch for 2023-01-01
205
+ /// let version_res = manager.latest_version_as_of(engine.as_ref(), timestamp);
206
+ /// # Ok::<(), delta_kernel::Error>(())
207
+ /// ```
208
+ pub fn latest_version_as_of (
209
+ & self ,
210
+ engine : & dyn Engine ,
211
+ timestamp : Timestamp ,
212
+ ) -> DeltaResult < Version > {
213
+ Ok ( self . timestamp_to_version ( engine, timestamp, Bound :: GreatestLower ) ?)
214
+ }
215
+
216
+ /// Gets the first version that occurs at or after the given `timestamp`.
217
+ ///
218
+ /// This finds the version whose timestamp is greater than or equal to `timestamp`.
219
+ /// If no such version exists, returns [`LogHistoryError::TimestampOutOfRange`].
220
+ /// # Examples
221
+ /// ```rust
222
+ /// # use delta_kernel::engine::sync::SyncEngine;
223
+ /// # use delta_kernel::Table;
224
+ /// # use std::sync::Arc;
225
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
226
+ /// # let engine = Arc::new(SyncEngine::new());
227
+ /// let table = Table::try_from_uri(path)?;
228
+ /// let manager = table.history_manager(engine.as_ref(), None)?;
229
+ ///
230
+ /// // Find the first version that occurred after January 1, 2023
231
+ /// let timestamp = 1672531200000; // Milliseconds since epoch for 2023-01-01
232
+ /// let version_res = manager.first_version_after(engine.as_ref(), timestamp);
233
+ /// # Ok::<(), delta_kernel::Error>(())
234
+ /// ```
235
+ pub fn first_version_after (
236
+ & self ,
237
+ engine : & dyn Engine ,
238
+ timestamp : Timestamp ,
239
+ ) -> DeltaResult < Version > {
240
+ Ok ( self . timestamp_to_version ( engine, timestamp, Bound :: LeastUpper ) ?)
241
+ }
242
+
243
+ /// Converts a timestamp range to a corresponding version range.
244
+ ///
245
+ /// This function finds the version range that corresponds to the given timestamp range.
246
+ /// The returned tuple contains:
247
+ /// - The first (earliest) version with a timestamp greater than or equal to `start_timestamp`
248
+ /// - If `end_timestamp` is provided, the version with a timestamp less than or equal to `end_timestamp`.
249
+ ///
250
+ /// # Arguments
251
+ /// * `engine` - The engine used to access version history
252
+ /// * `start_timestamp` - The lower bound timestamp (inclusive)
253
+ /// * `end_timestamp` - The optional upper bound timestamp (inclusive), or `None` to indicate no upper bound
254
+ ///
255
+ /// # Returns
256
+ /// A tuple containing the start version and optional end version (inclusive)
257
+ ///
258
+ /// # Errors
259
+ /// Returns [`LogHistoryError::TimestampOutOfRange`] if:
260
+ /// - No version exists at or after `start_timestamp`
261
+ /// - `end_timestamp` is provided and no version exists at or before it
262
+ ///
263
+ /// Returns [`LogHistoryError::InvalidTimestampRange`] if `start_timestamp > end_timestamp`, or
264
+ /// [`LogHistoryError::EmptyTimestampRange`] if the whole range falls between two adjacent commits.
265
+ ///
266
+ /// # Examples
267
+ /// ```rust
268
+ /// # use delta_kernel::engine::sync::SyncEngine;
269
+ /// # use delta_kernel::Table;
270
+ /// # use std::sync::Arc;
271
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
272
+ /// # let engine = Arc::new(SyncEngine::new());
273
+ ///
274
+ /// let table = Table::try_from_uri(path)?;
275
+ /// let manager = table.history_manager(engine.as_ref(), None)?;
276
+ ///
277
+ /// // Find versions between January 1, 2023 and March 1, 2023
278
+ /// let start_timestamp = 1672531200000; // Jan 1, 2023 (milliseconds since epoch)
279
+ /// let end_timestamp = 1677628800000; // Mar 1, 2023 (milliseconds since epoch)
280
+ ///
281
+ /// let version_range_res =
282
+ /// manager.timestamp_range_to_versions(engine.as_ref(), start_timestamp, end_timestamp);
283
+ /// # Ok::<(), delta_kernel::Error>(())
284
+ /// ```
285
+ pub fn timestamp_range_to_versions (
286
+ & self ,
287
+ engine : & dyn Engine ,
288
+ start_timestamp : Timestamp ,
289
+ end_timestamp : impl Into < Option < Timestamp > > ,
290
+ ) -> DeltaResult < ( Version , Option < Version > ) > {
291
+ // Check that the start and end timestamps are valid. Timestamps must be non-negative.
292
+ let end_timestamp = end_timestamp. into ( ) ;
293
+ require ! (
294
+ 0 <= start_timestamp,
295
+ LogHistoryError :: InvalidTimestamp ( start_timestamp) . into( )
296
+ ) ;
297
+ if let Some ( end_timestamp) = end_timestamp {
298
+ require ! (
299
+ 0 <= end_timestamp,
300
+ LogHistoryError :: InvalidTimestamp ( end_timestamp) . into( )
301
+ ) ;
302
+ // The `start_timestamp` must be no greater than the `end_timestamp`.
303
+ require ! (
304
+ start_timestamp <= end_timestamp,
305
+ LogHistoryError :: InvalidTimestampRange {
306
+ start_timestamp,
307
+ end_timestamp
308
+ }
309
+ . into( )
310
+ ) ;
311
+ }
312
+
313
+ // Convert the start timestamp to version
314
+ let start_version = self . first_version_after ( engine, start_timestamp) ?;
315
+
316
+ // If the end timestamp is present, convert it to an end version
317
+ let end_version = end_timestamp
318
+ . map ( |end| {
319
+ let end_version = self . latest_version_as_of ( engine, end) ?;
320
+
321
+ // Verify that the start version is no greater than the end version. This can
322
+ // happen in the case that the entire timestamp range falls between two commits.
323
+ // Consider the following history:
324
+ // |-------------|--------------------|---------------|
325
+ // v4 start_timestamp end_timestamp v5
326
+ //
327
+ // The latest version as of the end_timestamp is 4. The first version after the
328
+ // start_timestamp is 5. Thus in the case where end_version < start_version, we
329
+ // return a [`LogHistoryError::EmptyTimestampRange`] error.
330
+ require ! (
331
+ start_version <= end_version,
332
+ DeltaError :: from( LogHistoryError :: EmptyTimestampRange {
333
+ end_timestamp: end,
334
+ start_timestamp,
335
+ between_left: end_version,
336
+ between_right: start_version
337
+ } )
338
+ ) ;
339
+
340
+ Ok ( end_version)
341
+ } )
342
+ . transpose ( ) ?;
343
+
344
+ Ok ( ( start_version, end_version) )
345
+ }
346
+
168
347
/// Given a timestamp, this function determines the commit range that timestamp conversion
169
348
/// should search. A timestamp search may be conducted over one of two version ranges:
170
349
/// 1) A range of commits whose timestamp is the file modification timestamp
171
350
/// 2) A range of commits whose timestamp is an in-commit timestamp.
172
- #[ allow( unused) ]
173
351
fn get_timestamp_search_bounds (
174
352
& self ,
175
353
timestamp : Timestamp ,
@@ -237,7 +415,6 @@ impl LogHistoryManager {
237
415
/// happen in the following cases based on the bound:
238
416
/// - `Bound::GreatestLower`: There is no commit whose timestamp is lower than the given `timestamp`.
239
417
/// - `Bound::LeastUpper`: There is no commit whose timestamp is greater than the given `timestamp`.
240
- #[ allow( unused) ]
241
418
fn timestamp_to_version (
242
419
& self ,
243
420
engine : & dyn Engine ,
0 commit comments