1
- use crate :: internal_mod;
1
+ //! This module defines the [`LogHistoryManager`], which can be used to perform timestamp queries
2
+ //! over the Delta Log, translating from timestamps to Delta versions.
2
3
3
- use error :: LogHistoryError ;
4
- use error:: TimestampOutOfRangeError ;
4
+ use crate :: internal_mod ;
5
+ use error:: { LogHistoryError , TimestampOutOfRangeError } ;
5
6
use search:: { binary_search_by_key_with_bounds, Bound , SearchError } ;
7
+ use std:: cell:: RefCell ;
6
8
use std:: cmp:: Ordering ;
9
+ use std:: collections:: HashMap ;
7
10
use std:: fmt:: Debug ;
8
11
use std:: sync:: Arc ;
12
+ use url:: Url ;
9
13
10
14
use crate :: log_segment:: LogSegment ;
11
15
use crate :: path:: ParsedLogPath ;
@@ -14,8 +18,8 @@ use crate::utils::require;
14
18
use crate :: { DeltaResult , Engine , Error as DeltaError , RowVisitor , Version } ;
15
19
use timestamp_visitor:: InCommitTimestampVisitor ;
16
20
17
- pub ( crate ) mod error;
18
21
internal_mod ! ( pub ( crate ) mod search) ;
22
+ pub mod error;
19
23
mod timestamp_visitor;
20
24
21
25
type Timestamp = i64 ;
@@ -27,8 +31,8 @@ type Timestamp = i64;
27
31
///
28
32
/// Use this manager to:
29
33
/// - Convert timestamps or timestamp ranges into Delta versions or version ranges
30
- /// - Perform time travel queries using [ `Table::snapshot`]
31
- /// - Execute timestamp-based change data feed queries using [ `Table::table_changes`]
34
+ /// - Perform time travel queries using `Table::snapshot`
35
+ /// - Execute timestamp-based change data feed queries using `Table::table_changes`
32
36
///
33
37
/// The [`LogHistoryManager`] works with tables regardless of whether they have In-Commit
34
38
/// Timestamps enabled.
@@ -38,14 +42,13 @@ type Timestamp = i64;
38
42
/// Once created, the [`LogHistoryManager`] does not automatically update with newer versions
39
43
/// of the table. All timestamp queries are limited to the state captured in the [`Snapshot`]
40
44
/// provided during construction.
41
- #[ allow( unused) ]
42
45
#[ derive( Debug ) ]
43
- pub ( crate ) struct LogHistoryManager {
46
+ pub struct LogHistoryManager {
44
47
log_segment : LogSegment ,
45
48
snapshot : Arc < Snapshot > ,
49
+ commit_to_timestamp_cache : RefCell < HashMap < Url , Timestamp > > ,
46
50
}
47
51
48
- #[ allow( unused) ]
49
52
#[ derive( Debug ) ]
50
53
enum TimestampSearchBounds {
51
54
ExactMatch ( Version ) ,
@@ -92,25 +95,41 @@ impl LogHistoryManager {
92
95
Ok ( Self {
93
96
log_segment,
94
97
snapshot,
98
+ commit_to_timestamp_cache : Default :: default ( ) ,
95
99
} )
96
100
}
101
+ fn update_cache_with_timestamp ( & self , commit_file : & ParsedLogPath , value : Timestamp ) {
102
+ self . commit_to_timestamp_cache
103
+ . borrow_mut ( )
104
+ . insert ( commit_file. location . location . clone ( ) , value) ;
105
+ }
106
+ fn get_cached_timestamp ( & self , commit_file : & ParsedLogPath ) -> Option < Timestamp > {
107
+ self . commit_to_timestamp_cache
108
+ . borrow ( )
109
+ . get ( & commit_file. location . location )
110
+ . copied ( )
111
+ }
97
112
98
113
/// Gets the timestamp for the `commit_file`. If `read_ict` is false ,this returns the file's
99
114
/// modification timestamp. If `read_ict` is true, this reads the file's In-commit timestamp.
100
- #[ allow( unused) ]
101
115
fn commit_file_to_timestamp (
102
116
& self ,
103
117
engine : & dyn Engine ,
104
118
commit_file : & ParsedLogPath ,
105
119
read_ict : bool ,
106
120
) -> Result < Timestamp , LogHistoryError > {
121
+ if let Some ( cached) = self . get_cached_timestamp ( commit_file) {
122
+ return Ok ( cached) ;
123
+ }
107
124
let commit_timestamp = if read_ict {
108
125
Self :: read_in_commit_timestamp ( engine, commit_file) ?
109
126
} else {
110
127
// By default, the timestamp of a commit is its modification time
111
128
commit_file. location . last_modified
112
129
} ;
113
130
131
+ self . update_cache_with_timestamp ( commit_file, commit_timestamp) ;
132
+
114
133
Ok ( commit_timestamp)
115
134
}
116
135
@@ -122,7 +141,6 @@ impl LogHistoryManager {
122
141
/// This returns a [`LogHistoryError::InCommitTimestampNotFoundError`] if the in-commit timestamp
123
142
/// is not present in the commit file, or if the CommitInfo is not the first action in the
124
143
/// commit.
125
- #[ allow( unused) ]
126
144
fn read_in_commit_timestamp (
127
145
engine : & dyn Engine ,
128
146
commit_file : & ParsedLogPath ,
@@ -134,7 +152,7 @@ impl LogHistoryManager {
134
152
} ;
135
153
136
154
// Get an iterator over the actions in the commit file
137
- let mut action_iter = engine
155
+ let action_iter = engine
138
156
. json_handler ( )
139
157
. read_json_files (
140
158
& [ commit_file. location . clone ( ) ] ,
@@ -143,7 +161,7 @@ impl LogHistoryManager {
143
161
)
144
162
. map_err ( wrap_err) ?;
145
163
146
- // Take the first non-empty engine data batch
164
+ // Take the first engine data batch
147
165
let batch = action_iter
148
166
. map ( |res| res. map_err ( wrap_err) )
149
167
. next ( )
@@ -164,11 +182,170 @@ impl LogHistoryManager {
164
182
} )
165
183
}
166
184
185
+ /// Gets the latest version that occurs before or at the given `timestamp`.
186
+ ///
187
+ /// This finds the version whose timestamp is less than or equal to `timestamp`.
188
+ /// If no such version exists, returns [`LogHistoryError::TimestampOutOfRange`].
189
+ ///
190
+ ////// # Examples
191
+ /// ```rust
192
+ /// # use delta_kernel::history_manager::error::LogHistoryError;
193
+ /// # use delta_kernel::engine::sync::SyncEngine;
194
+ /// # use delta_kernel::Table;
195
+ /// # use std::sync::Arc;
196
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
197
+ /// # let engine = Arc::new(SyncEngine::new());
198
+ /// let table = Table::try_from_uri(path)?;
199
+ /// let manager = table.history_manager(engine.as_ref())?;
200
+ ///
201
+ /// // Get the latest version as of January 1, 2023
202
+ /// let timestamp = 1672531200000; // Milliseconds since epoch for 2023-01-01
203
+ /// let version_res = manager.latest_version_as_of(engine.as_ref(), timestamp);
204
+ /// # Ok::<(), delta_kernel::Error>(())
205
+ /// ```
206
+ pub fn latest_version_as_of (
207
+ & self ,
208
+ engine : & dyn Engine ,
209
+ timestamp : Timestamp ,
210
+ ) -> DeltaResult < Version > {
211
+ Ok ( self . timestamp_to_version ( engine, timestamp, Bound :: GreatestLower ) ?)
212
+ }
213
+
214
+ /// Gets the first version that occurs after the given `timestamp` (inclusive).
215
+ ///
216
+ /// This finds the version whose timestamp is greater than or equal to `timestamp`.
217
+ /// If no such version exists, returns [`LogHistoryError::TimestampOutOfRange`].
218
+ /// # Examples
219
+ /// ```rust
220
+ /// # use delta_kernel::engine::sync::SyncEngine;
221
+ /// # use delta_kernel::Table;
222
+ /// # use std::sync::Arc;
223
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
224
+ /// # let engine = Arc::new(SyncEngine::new());
225
+ /// let table = Table::try_from_uri(path)?;
226
+ /// let manager = table.history_manager(engine.as_ref())?;
227
+ ///
228
+ /// // Find the first version that occurred after January 1, 2023
229
+ /// let timestamp = 1672531200000; // Milliseconds since epoch for 2023-01-01
230
+ /// let version_res = manager.first_version_after(engine.as_ref(), timestamp);
231
+ /// # Ok::<(), delta_kernel::Error>(())
232
+ /// ```
233
+ pub fn first_version_after (
234
+ & self ,
235
+ engine : & dyn Engine ,
236
+ timestamp : Timestamp ,
237
+ ) -> DeltaResult < Version > {
238
+ Ok ( self . timestamp_to_version ( engine, timestamp, Bound :: LeastUpper ) ?)
239
+ }
240
+
241
+ /// Converts a timestamp range to a corresponding version range.
242
+ ///
243
+ /// This function finds the version range that corresponds to the given timestamp range.
244
+ /// The returned tuple contains:
245
+ /// - The first (earliest) version with a timestamp greater than or equal to `start_timestamp`
246
+ /// - If `end_timestamp` is provided, the version with a timestamp less than or equal to `end_timestamp`.
247
+ ///
248
+ /// # Arguments
249
+ /// * `engine` - The engine used to access version history
250
+ /// * `start_timestamp` - The lower bound timestamp (inclusive)
251
+ /// * `end_timestamp` - The optional upper bound timestamp (inclusive), or `None` to indicate no upper bound
252
+ ///
253
+ /// # Returns
254
+ /// A tuple containing the start version and optional end version (inclusive)
255
+ ///
256
+ /// # Errors
257
+ /// Returns [`LogHistoryError::TimestampOutOfRange`] if:
258
+ /// - No version exists at or after `start_timestamp`
259
+ /// - `end_timestamp` is provided and no version exists at or before it
260
+ ///
261
+ /// Returns [`LogHistoryError::InvalidTimestampRange`] if the entire range [start_timestamp,
262
+ /// end_timestamp]
263
+ ///
264
+ /// # Examples
265
+ /// ```rust
266
+ /// # use delta_kernel::engine::sync::SyncEngine;
267
+ /// # use delta_kernel::Table;
268
+ /// # use std::sync::Arc;
269
+ /// # let path = "./tests/data/with_checkpoint_no_last_checkpoint";
270
+ /// # let engine = Arc::new(SyncEngine::new());
271
+ ///
272
+ /// let table = Table::try_from_uri(path)?;
273
+ /// let manager = table.history_manager(engine.as_ref())?;
274
+ ///
275
+ /// // Find versions between January 1, 2023 and March 1, 2023
276
+ /// let start_timestamp = 1672531200000; // Jan 1, 2023 (milliseconds since epoch)
277
+ /// let end_timestamp = 1677628800000; // Mar 1, 2023 (milliseconds since epoch)
278
+ ///
279
+ /// let version_range_res =
280
+ /// manager.timestamp_range_to_versions(engine.as_ref(), start_timestamp, end_timestamp);
281
+ /// # Ok::<(), delta_kernel::Error>(())
282
+ /// ```
283
+ pub fn timestamp_range_to_versions (
284
+ & self ,
285
+ engine : & dyn Engine ,
286
+ start_timestamp : Timestamp ,
287
+ end_timestamp : impl Into < Option < Timestamp > > ,
288
+ ) -> DeltaResult < ( Version , Option < Version > ) > {
289
+ // Check that the start and end timestamps are valid. Timestamps must be positive
290
+ let end_timestamp = end_timestamp. into ( ) ;
291
+ require ! (
292
+ 0 <= start_timestamp,
293
+ LogHistoryError :: InvalidTimestamp ( start_timestamp) . into( )
294
+ ) ;
295
+ if let Some ( end_timestamp) = end_timestamp {
296
+ require ! (
297
+ 0 <= end_timestamp,
298
+ LogHistoryError :: InvalidTimestamp ( end_timestamp) . into( )
299
+ ) ;
300
+ // The `start_timestamp` must be no greater than the `end_timestamp`.
301
+ require ! (
302
+ start_timestamp <= end_timestamp,
303
+ LogHistoryError :: InvalidTimestampRange {
304
+ start_timestamp,
305
+ end_timestamp
306
+ }
307
+ . into( )
308
+ ) ;
309
+ }
310
+
311
+ // Convert the start timestamp to version
312
+ let start_version = self . first_version_after ( engine, start_timestamp) ?;
313
+
314
+ // If the end timestamp is present, convert it to an end version
315
+ let end_version = end_timestamp
316
+ . map ( |end| {
317
+ let end_version = self . latest_version_as_of ( engine, end) ?;
318
+
319
+ // Verify that the start version is no greater than the end version. This can
320
+ // happen in the case that the entire timestamp range falls between two commits.
321
+ // Consider the following history:
322
+ // |-------------|--------------------|---------------|
323
+ // v4 start_timestamp end_timestamp v5
324
+ //
325
+ // The latest version as of the end_timestamp is 4. The first version after the
326
+ // start_timestamp is 5. Thus in the case where end_version < start_version, we
327
+ // return and [`LogHistoryError::EmptyTimestampRange`].
328
+ require ! (
329
+ start_version <= end_version,
330
+ DeltaError :: from( LogHistoryError :: EmptyTimestampRange {
331
+ end_timestamp: end,
332
+ start_timestamp,
333
+ between_left: end_version,
334
+ between_right: start_version
335
+ } )
336
+ ) ;
337
+
338
+ Ok ( end_version)
339
+ } )
340
+ . transpose ( ) ?;
341
+
342
+ Ok ( ( start_version, end_version) )
343
+ }
344
+
167
345
/// Given a timestamp, this function determines the commit range that timestamp conversion
168
346
/// should search. A timestamp search may be conducted over one of two version ranges:
169
347
/// 1) A range of commits whose timestamp is the file modification timestamp
170
348
/// 2) A range of commits whose timestamp is an in-commit timestamp.
171
- #[ allow( unused) ]
172
349
fn get_timestamp_search_bounds (
173
350
& self ,
174
351
timestamp : Timestamp ,
@@ -236,7 +413,6 @@ impl LogHistoryManager {
236
413
/// happen in the following cases based on the bound:
237
414
/// - `Bound::GreatestLower`: There is no commit whose timestamp is lower than the given `timestamp`.
238
415
/// - `Bound::LeastUpper`: There is no commit whose timestamp is greater than the given `timestamp`.
239
- #[ allow( unused) ]
240
416
fn timestamp_to_version (
241
417
& self ,
242
418
engine : & dyn Engine ,
0 commit comments