Skip to content

Commit 10bdee7

Browse files
refactor!: rename ScanData to ScanMetadata (#817)
## What changes are proposed in this pull request? Rename `ScanData` to `ScanMetadata` and `Scan::scan_data` to `Scan::scan_metadata` (and corresponding FFI). Additionally, renames `TableChangesScanData` to `TableChangesScanMetadata`. Additional docs/refactor coming in #768 ### This PR affects the following public APIs breaking changes: 1. rename `ScanData` to `ScanMetadata` 2. rename `Scan::scan_data()` to `Scan::scan_metadata()` 3. (ffi) rename `free_kernel_scan_data()` to `free_scan_metadata_iter()` 4. (ffi) rename `kernel_scan_data_next()` to `scan_metadata_next()` 5. (ffi) rename `visit_scan_data()` to `visit_scan_metadata()` 6. (ffi) rename `kernel_scan_data_init()` to `scan_metadata_iter_init()` 7. (ffi) rename `KernelScanDataIterator` to `ScanMetadataIterator` 8. (ffi) rename `SharedScanDataIterator` to `SharedScanMetadataIterator` ## How was this change tested? existing resolves #816
1 parent 8961e97 commit 10bdee7

File tree

18 files changed

+201
-187
lines changed

18 files changed

+201
-187
lines changed

acceptance/src/data.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,10 @@ fn assert_columns_match(actual: &[Arc<dyn Array>], expected: &[Arc<dyn Array>])
109109
}
110110
}
111111

112-
pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo) -> TestResult<()> {
112+
pub async fn assert_scan_metadata(
113+
engine: Arc<dyn Engine>,
114+
test_case: &TestCaseInfo,
115+
) -> TestResult<()> {
113116
let table_root = test_case.table_root()?;
114117
let table = Table::new(table_root);
115118
let snapshot = table.snapshot(engine.as_ref(), None)?;

acceptance/tests/dat_reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ fn reader_test(path: &Path) -> datatest_stable::Result<()> {
3737
);
3838

3939
case.assert_metadata(engine.clone()).await.unwrap();
40-
acceptance::data::assert_scan_data(engine.clone(), &case)
40+
acceptance::data::assert_scan_metadata(engine.clone(), &case)
4141
.await
4242
.unwrap();
4343
});

ffi/examples/read-table/read_table.c

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,10 @@ void scan_row_callback(
8383
context->partition_values = NULL;
8484
}
8585

86-
// For each chunk of scan data (which may contain multiple files to scan), kernel will call this
87-
// function (named do_visit_scan_data to avoid conflict with visit_scan_data exported by kernel)
88-
void do_visit_scan_data(
86+
// For each chunk of scan metadata (which may contain multiple files to scan), kernel will call this
87+
// function (named do_visit_scan_metadata to avoid conflict with visit_scan_metadata exported by
88+
// kernel)
89+
void do_visit_scan_metadata(
8990
void* engine_context,
9091
ExclusiveEngineData* engine_data,
9192
KernelBoolSlice selection_vec,
@@ -96,7 +97,7 @@ void do_visit_scan_data(
9697
print_selection_vector(" ", &selection_vec);
9798
// Ask kernel to iterate each individual file and call us back with extracted metadata
9899
print_diag("Asking kernel to call us back for each scan row (file to read)\n");
99-
visit_scan_data(engine_data, selection_vec, transforms, engine_context, scan_row_callback);
100+
visit_scan_metadata(engine_data, selection_vec, transforms, engine_context, scan_row_callback);
100101
free_bool_slice(selection_vec);
101102
free_engine_data(engine_data);
102103
}
@@ -291,26 +292,28 @@ int main(int argc, char* argv[])
291292
#endif
292293
};
293294

294-
ExternResultHandleSharedScanDataIterator data_iter_res = kernel_scan_data_init(engine, scan);
295-
if (data_iter_res.tag != OkHandleSharedScanDataIterator) {
296-
print_error("Failed to construct scan data iterator.", (Error*)data_iter_res.err);
295+
ExternResultHandleSharedScanMetadataIterator data_iter_res =
296+
scan_metadata_iter_init(engine, scan);
297+
if (data_iter_res.tag != OkHandleSharedScanMetadataIterator) {
298+
print_error("Failed to construct scan metadata iterator.", (Error*)data_iter_res.err);
297299
free_error((Error*)data_iter_res.err);
298300
return -1;
299301
}
300302

301-
SharedScanDataIterator* data_iter = data_iter_res.ok;
303+
SharedScanMetadataIterator* data_iter = data_iter_res.ok;
302304

303-
print_diag("\nIterating scan data\n");
305+
print_diag("\nIterating scan metadata\n");
304306

305307
// iterate scan files
306308
for (;;) {
307-
ExternResultbool ok_res = kernel_scan_data_next(data_iter, &context, do_visit_scan_data);
309+
ExternResultbool ok_res =
310+
scan_metadata_next(data_iter, &context, do_visit_scan_metadata);
308311
if (ok_res.tag != Okbool) {
309-
print_error("Failed to iterate scan data.", (Error*)ok_res.err);
312+
print_error("Failed to iterate scan metadata.", (Error*)ok_res.err);
310313
free_error((Error*)ok_res.err);
311314
return -1;
312315
} else if (!ok_res.ok) {
313-
print_diag("Scan data iterator done\n");
316+
print_diag("Scan metadata iterator done\n");
314317
break;
315318
}
316319
}
@@ -323,7 +326,7 @@ int main(int argc, char* argv[])
323326
context.arrow_context = NULL;
324327
#endif
325328

326-
free_kernel_scan_data(data_iter);
329+
free_scan_metadata_iter(data_iter);
327330
free_scan(scan);
328331
free_schema(logical_schema);
329332
free_schema(read_schema);

ffi/src/lib.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -685,11 +685,11 @@ pub struct StringSliceIterator;
685685

686686
/// # Safety
687687
///
688-
/// The iterator must be valid (returned by [`kernel_scan_data_init`]) and not yet freed by
689-
/// [`free_kernel_scan_data`]. The visitor function pointer must be non-null.
688+
/// The iterator must be valid (returned by [`scan_metadata_iter_init`]) and not yet freed by
689+
/// [`free_scan_metadata_iter`]. The visitor function pointer must be non-null.
690690
///
691-
/// [`kernel_scan_data_init`]: crate::scan::kernel_scan_data_init
692-
/// [`free_kernel_scan_data`]: crate::scan::free_kernel_scan_data
691+
/// [`scan_metadata_iter_init`]: crate::scan::scan_metadata_iter_init
692+
/// [`free_scan_metadata_iter`]: crate::scan::free_scan_metadata_iter
693693
#[no_mangle]
694694
pub unsafe extern "C" fn string_slice_next(
695695
data: Handle<StringSliceIterator>,

ffi/src/scan.rs

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::collections::HashMap;
44
use std::sync::{Arc, Mutex};
55

66
use delta_kernel::scan::state::{visit_scan_files, DvInfo, GlobalScanState};
7-
use delta_kernel::scan::{Scan, ScanData};
7+
use delta_kernel::scan::{Scan, ScanMetadata};
88
use delta_kernel::snapshot::Snapshot;
99
use delta_kernel::{DeltaResult, Error, Expression, ExpressionRef};
1010
use delta_kernel_ffi_macros::handle_descriptor;
@@ -24,7 +24,7 @@ use crate::{
2424
use super::handle::Handle;
2525

2626
// TODO: Why do we even need to expose a scan, when the only thing an engine can do with it is
27-
// handit back to the kernel by calling `kernel_scan_data_init`? There isn't even an FFI method to
27+
// handit back to the kernel by calling `scan_metadata_iter_init`? There isn't even an FFI method to
2828
// drop it!
2929
#[handle_descriptor(target=Scan, mutable=false, sized=true)]
3030
pub struct SharedScan;
@@ -125,70 +125,70 @@ pub unsafe extern "C" fn free_global_scan_state(state: Handle<SharedGlobalScanSt
125125
// means kernel made the decision of how to achieve thread safety. This may not be desirable if the
126126
// engine is single-threaded, or has its own mutual exclusion mechanisms. Deadlock is even a
127127
// conceivable risk, if this interacts poorly with engine's mutual exclusion mechanism.
128-
pub struct KernelScanDataIterator {
128+
pub struct ScanMetadataIterator {
129129
// Mutex -> Allow the iterator to be accessed safely by multiple threads.
130130
// Box -> Wrap its unsized content this struct is fixed-size with thin pointers.
131-
// Item = DeltaResult<ScanData>
132-
data: Mutex<Box<dyn Iterator<Item = DeltaResult<ScanData>> + Send>>,
131+
// Item = DeltaResult<ScanMetadata>
132+
data: Mutex<Box<dyn Iterator<Item = DeltaResult<ScanMetadata>> + Send>>,
133133

134134
// Also keep a reference to the external engine for its error allocator. The default Parquet and
135135
// Json handlers don't hold any reference to the tokio reactor they rely on, so the iterator
136136
// terminates early if the last engine goes out of scope.
137137
engine: Arc<dyn ExternEngine>,
138138
}
139139

140-
#[handle_descriptor(target=KernelScanDataIterator, mutable=false, sized=true)]
141-
pub struct SharedScanDataIterator;
140+
#[handle_descriptor(target=ScanMetadataIterator, mutable=false, sized=true)]
141+
pub struct SharedScanMetadataIterator;
142142

143-
impl Drop for KernelScanDataIterator {
143+
impl Drop for ScanMetadataIterator {
144144
fn drop(&mut self) {
145-
debug!("dropping KernelScanDataIterator");
145+
debug!("dropping ScanMetadataIterator");
146146
}
147147
}
148148

149149
/// Get an iterator over the data needed to perform a scan. This will return a
150-
/// [`KernelScanDataIterator`] which can be passed to [`kernel_scan_data_next`] to get the actual
151-
/// data in the iterator.
150+
/// [`ScanMetadataIterator`] which can be passed to [`scan_metadata_next`] to get the
151+
/// actual data in the iterator.
152152
///
153153
/// # Safety
154154
///
155155
/// Engine is responsible for passing a valid [`SharedExternEngine`] and [`SharedScan`]
156156
#[no_mangle]
157-
pub unsafe extern "C" fn kernel_scan_data_init(
157+
pub unsafe extern "C" fn scan_metadata_iter_init(
158158
engine: Handle<SharedExternEngine>,
159159
scan: Handle<SharedScan>,
160-
) -> ExternResult<Handle<SharedScanDataIterator>> {
160+
) -> ExternResult<Handle<SharedScanMetadataIterator>> {
161161
let engine = unsafe { engine.clone_as_arc() };
162162
let scan = unsafe { scan.as_ref() };
163-
kernel_scan_data_init_impl(&engine, scan).into_extern_result(&engine.as_ref())
163+
scan_metadata_iter_init_impl(&engine, scan).into_extern_result(&engine.as_ref())
164164
}
165165

166-
fn kernel_scan_data_init_impl(
166+
fn scan_metadata_iter_init_impl(
167167
engine: &Arc<dyn ExternEngine>,
168168
scan: &Scan,
169-
) -> DeltaResult<Handle<SharedScanDataIterator>> {
170-
let scan_data = scan.scan_data(engine.engine().as_ref())?;
171-
let data = KernelScanDataIterator {
172-
data: Mutex::new(Box::new(scan_data)),
169+
) -> DeltaResult<Handle<SharedScanMetadataIterator>> {
170+
let scan_metadata = scan.scan_metadata(engine.engine().as_ref())?;
171+
let data = ScanMetadataIterator {
172+
data: Mutex::new(Box::new(scan_metadata)),
173173
engine: engine.clone(),
174174
};
175175
Ok(Arc::new(data).into())
176176
}
177177

178-
/// Call the provided `engine_visitor` on the next scan data item. The visitor will be provided with
179-
/// a selection vector and engine data. It is the responsibility of the _engine_ to free these when
180-
/// it is finished by calling [`free_bool_slice`] and [`free_engine_data`] respectively.
178+
/// Call the provided `engine_visitor` on the next scan metadata item. The visitor will be provided
179+
/// with a selection vector and engine data. It is the responsibility of the _engine_ to free these
180+
/// when it is finished by calling [`free_bool_slice`] and [`free_engine_data`] respectively.
181181
///
182182
/// # Safety
183183
///
184-
/// The iterator must be valid (returned by [kernel_scan_data_init]) and not yet freed by
185-
/// [`free_kernel_scan_data`]. The visitor function pointer must be non-null.
184+
/// The iterator must be valid (returned by [scan_metadata_iter_init]) and not yet freed by
185+
/// [`free_scan_metadata_iter`]. The visitor function pointer must be non-null.
186186
///
187187
/// [`free_bool_slice`]: crate::free_bool_slice
188188
/// [`free_engine_data`]: crate::free_engine_data
189189
#[no_mangle]
190-
pub unsafe extern "C" fn kernel_scan_data_next(
191-
data: Handle<SharedScanDataIterator>,
190+
pub unsafe extern "C" fn scan_metadata_next(
191+
data: Handle<SharedScanMetadataIterator>,
192192
engine_context: NullableCvoid,
193193
engine_visitor: extern "C" fn(
194194
engine_context: NullableCvoid,
@@ -198,11 +198,11 @@ pub unsafe extern "C" fn kernel_scan_data_next(
198198
),
199199
) -> ExternResult<bool> {
200200
let data = unsafe { data.as_ref() };
201-
kernel_scan_data_next_impl(data, engine_context, engine_visitor)
201+
scan_metadata_next_impl(data, engine_context, engine_visitor)
202202
.into_extern_result(&data.engine.as_ref())
203203
}
204-
fn kernel_scan_data_next_impl(
205-
data: &KernelScanDataIterator,
204+
fn scan_metadata_next_impl(
205+
data: &ScanMetadataIterator,
206206
engine_context: NullableCvoid,
207207
engine_visitor: extern "C" fn(
208208
engine_context: NullableCvoid,
@@ -228,11 +228,11 @@ fn kernel_scan_data_next_impl(
228228
/// # Safety
229229
///
230230
/// Caller is responsible for (at most once) passing a valid pointer returned by a call to
231-
/// [`kernel_scan_data_init`].
231+
/// [`scan_metadata_iter_init`].
232232
// we should probably be consistent with drop vs. free on engine side (probably the latter is more
233233
// intuitive to non-rust code)
234234
#[no_mangle]
235-
pub unsafe extern "C" fn free_kernel_scan_data(data: Handle<SharedScanDataIterator>) {
235+
pub unsafe extern "C" fn free_scan_metadata_iter(data: Handle<SharedScanMetadataIterator>) {
236236
data.drop_handle();
237237
}
238238

@@ -297,14 +297,14 @@ pub unsafe extern "C" fn get_from_string_map(
297297
.and_then(|v| allocate_fn(kernel_string_slice!(v)))
298298
}
299299

300-
/// Transformation expressions that need to be applied to each row `i` in ScanData. You can use
300+
/// Transformation expressions that need to be applied to each row `i` in ScanMetadata. You can use
301301
/// [`get_transform_for_row`] to get the transform for a particular row. If that returns an
302302
/// associated expression, it _must_ be applied to the data read from the file specified by the
303303
/// row. The resultant schema for this expression is guaranteed to be `Scan.schema()`. If
304304
/// `get_transform_for_row` returns `NULL` no expression need be applied and the data read from disk
305305
/// is already in the correct logical state.
306306
///
307-
/// NB: If you are using `visit_scan_data` you don't need to worry about dealing with probing
307+
/// NB: If you are using `visit_scan_metadata` you don't need to worry about dealing with probing
308308
/// `CTransforms`. The callback will be invoked with the correct transform for you.
309309
pub struct CTransforms {
310310
transforms: Vec<Option<ExpressionRef>>,
@@ -420,13 +420,13 @@ struct ContextWrapper {
420420
callback: CScanCallback,
421421
}
422422

423-
/// Shim for ffi to call visit_scan_data. This will generally be called when iterating through scan
423+
/// Shim for ffi to call visit_scan_metadata. This will generally be called when iterating through scan
424424
/// data which provides the data handle and selection vector as each element in the iterator.
425425
///
426426
/// # Safety
427427
/// engine is responsible for passing a valid [`ExclusiveEngineData`] and selection vector.
428428
#[no_mangle]
429-
pub unsafe extern "C" fn visit_scan_data(
429+
pub unsafe extern "C" fn visit_scan_metadata(
430430
data: Handle<ExclusiveEngineData>,
431431
selection_vec: KernelBoolSlice,
432432
transforms: &CTransforms,

kernel/examples/inspect-table/src/main.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ enum Commands {
4141
/// Show the table's schema
4242
Schema,
4343
/// Show the meta-data that would be used to scan the table
44-
ScanData,
44+
ScanMetadata,
4545
/// Show each action from the log-segments
4646
Actions {
4747
/// Show the log in reverse order (default is log replay order -- newest first)
@@ -207,10 +207,10 @@ fn try_main() -> DeltaResult<()> {
207207
Commands::Schema => {
208208
println!("{:#?}", snapshot.schema());
209209
}
210-
Commands::ScanData => {
210+
Commands::ScanMetadata => {
211211
let scan = ScanBuilder::new(snapshot).build()?;
212-
let scan_data = scan.scan_data(&engine)?;
213-
for res in scan_data {
212+
let scan_metadata = scan.scan_metadata(&engine)?;
213+
for res in scan_metadata {
214214
let (data, vector, transforms) = res?;
215215
delta_kernel::scan::state::visit_scan_files(
216216
data.as_ref(),

kernel/examples/read-table-multi-threaded/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Read Table Multi-Threaded
33

44
# About
55
This example shows a program that reads a table using multiple threads. This shows the use of the
6-
`scan_data`, `global_scan_state`, and `visit_scan_files` methods, that can be used to partition work
6+
`scan_metadata`, `global_scan_state`, and `visit_scan_files` methods, that can be used to partition work
77
to either multiple threads, or workers (in the case of a distributed engine).
88

99
You can run this from the same directory as this `README.md` by running `cargo run -- [args]`.
@@ -49,4 +49,4 @@ To select specific columns you need a `--` after the column list specification.
4949

5050
- Read `letter` and `data` columns from the `multi_partitioned` dat table:
5151

52-
`cargo run -- --columns letter,data -- ../../../acceptance/tests/dat/out/reader_tests/generated/multi_partitioned/delta/`
52+
`cargo run -- --columns letter,data -- ../../../acceptance/tests/dat/out/reader_tests/generated/multi_partitioned/delta/`

kernel/examples/read-table-multi-threaded/src/main.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use clap::{Parser, ValueEnum};
2020
use url::Url;
2121

2222
/// An example program that reads a table using multiple threads. This shows the use of the
23-
/// scan_data and global_scan_state methods on a Scan, that can be used to partition work to either
23+
/// scan_metadata and global_scan_state methods on a Scan, that can be used to partition work to either
2424
/// multiple threads, or workers (in the case of a distributed engine).
2525
#[derive(Parser)]
2626
#[command(author, version, about, long_about = None)]
@@ -179,7 +179,7 @@ fn try_main() -> DeltaResult<()> {
179179
// [`delta_kernel::scan::scan_row_schema`]. Generally engines will not need to interact with
180180
// this data directly, and can just call [`visit_scan_files`] to get pre-parsed data back from
181181
// the kernel.
182-
let scan_data = scan.scan_data(engine.as_ref())?;
182+
let scan_metadata = scan.scan_metadata(engine.as_ref())?;
183183

184184
// get any global state associated with this scan
185185
let global_state = Arc::new(scan.global_scan_state());
@@ -209,7 +209,7 @@ fn try_main() -> DeltaResult<()> {
209209
// done sending
210210
drop(record_batch_tx);
211211

212-
for res in scan_data {
212+
for res in scan_metadata {
213213
let (data, vector, transforms) = res?;
214214
scan_file_tx = delta_kernel::scan::state::visit_scan_files(
215215
data.as_ref(),

kernel/src/log_replay.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
//! typically from newest to oldest.
77
//!
88
//! Log replay is currently implemented for table scans, which filter and apply transformations
9-
//! to produce file actions which builds the view of the table state at a specific point in time.
9+
//! to produce file actions which builds the view of the table state at a specific point in time.
1010
//! Future extensions will support additional log replay processors beyond the current use case.
1111
//! (e.g. checkpointing: filter actions to include only those needed to rebuild table state)
1212
//!
@@ -46,7 +46,7 @@ impl FileActionKey {
4646
/// significantly reducing memory usage for large Delta tables with extensive history.
4747
///
4848
/// TODO: Modify deduplication to track only file paths instead of (path, dv_unique_id).
49-
/// More info here: https://github.com/delta-io/delta-kernel-rs/issues/701
49+
/// More info here: https://github.com/delta-io/delta-kernel-rs/issues/701
5050
pub(crate) struct FileActionDeduplicator<'seen> {
5151
/// A set of (data file path, dv_unique_id) pairs that have been seen thus
5252
/// far in the log for deduplication. This is a mutable reference to the set
@@ -208,23 +208,23 @@ impl<'seen> FileActionDeduplicator<'seen> {
208208
/// processed by the processor, (if a filter is provided).
209209
///
210210
/// Implementations:
211-
/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated
211+
/// - `ScanLogReplayProcessor`: Used for table scans, this processor filters and selects deduplicated
212212
/// `Add` actions from log batches to reconstruct the view of the table at a specific point in time.
213-
/// Note that scans do not expose `Remove` actions. Data skipping may be applied when a predicate is
213+
/// Note that scans do not expose `Remove` actions. Data skipping may be applied when a predicate is
214214
/// provided.
215215
///
216-
/// - `CheckpointLogReplayProcessor` (WIP): Will be responsible for processing log batches to construct
217-
/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, such as
218-
/// `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state.
216+
/// - `CheckpointLogReplayProcessor` (WIP): Will be responsible for processing log batches to construct
217+
/// V1 spec checkpoint files. Unlike scans, checkpoint processing includes additional actions, such as
218+
/// `Remove`, `Metadata`, and `Protocol`, required to fully reconstruct table state.
219219
/// Data skipping is not applied during checkpoint processing.
220220
///
221221
/// The `Output` type represents the material result of log replay, and it must implement the
222222
/// `HasSelectionVector` trait to allow filtering of irrelevant rows:
223223
///
224-
/// - For **scans**, the output type is `ScanData`, which contains the file actions (`Add` actions) that
225-
/// need to be applied to build the table's view, accompanied by a **selection vector** that identifies
226-
/// which rows should be included. A transform vector may also be included to handle schema changes,
227-
/// such as renaming columns or modifying data types.
224+
/// - For **scans**, the output type is `ScanMetadata`, which contains the file actions (`Add`
225+
/// actions) that need to be applied to build the table's view, accompanied by a
226+
/// **selection vector** that identifies which rows should be included. A transform vector may
227+
/// also be included to handle schema changes, such as renaming columns or modifying data types.
228228
///
229229
/// - For **checkpoints**, the output includes the actions necessary to write to the checkpoint file (`Add`,
230230
/// `Remove`, `Metadata`, `Protocol` actions), filtered by the **selection vector** to determine which

0 commit comments

Comments
 (0)