Skip to content

Commit 3305d3a

Browse files
feat: Introduce TableConfiguration to jointly manage metadata, protocol, and table properties (#644)
## What changes are proposed in this pull request? This PR introduces the `TableConfiguration` struct, which is used to perform feature support and feature enablement checks against the table's protocol, metadata, table properties, and schema. #### Problem statement To check that a feature is enabled, you often must check that a certain reader/writer feature is supported and that a table property is set to true. For example, a writer must check both the `delta.enableDeletionVectors` table property, and check that the `deletionVectors` writer/reader features are present in the table's Protocol. Probing two disparate structs to do a single check is error-prone and may cause these metadata/protocol checks to become out of sync. Moreover, checks are performed in the CDF path, snapshot scan path, and in the read path. Thus there are many ways in which protocol and metadata checks can diverge from one another. Put simply, the problems are: 1. When checking feature support over multiple structs (like P&M), it is easy to forget one and violate correctness. 2. Duplicate checks for the same feature may diverge among different code paths. #### Solution `TableConfiguration` consolidates all protocol and metadata checks in one place. It also ensures that the logic for checking feature enablement is kept consistent throughout the codebase. This addresses the problems outlined above. Closes: #571 ## How was this change tested? We add a couple of tests to ensure that: 1) Creating `TableConfiguration` fails on tables for which reading is not supported 2) Deletion vector support and enablement checks work as expected. --------- Co-authored-by: Oussama Saoudi <[email protected]>
1 parent 6751838 commit 3305d3a

File tree

9 files changed

+334
-53
lines changed

9 files changed

+334
-53
lines changed

kernel/examples/inspect-table/src/main.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ fn try_main() -> DeltaResult<()> {
220220
}
221221
Commands::Actions { oldest_first } => {
222222
let log_schema = get_log_schema();
223-
let actions = snapshot._log_segment().replay(
223+
let actions = snapshot.log_segment().replay(
224224
&engine,
225225
log_schema.clone(),
226226
log_schema.clone(),

kernel/src/actions/set_transaction.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ impl SetTransactionScanner {
6060
))
6161
});
6262
self.snapshot
63-
.log_segment
63+
.log_segment()
6464
.replay(engine, schema.clone(), schema, META_PREDICATE.clone())
6565
}
6666

kernel/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ pub mod schema;
8282
pub mod snapshot;
8383
pub mod table;
8484
pub mod table_changes;
85+
pub mod table_configuration;
8586
pub mod table_features;
8687
pub mod table_properties;
8788
pub mod transaction;

kernel/src/log_segment/tests.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ fn test_replay_for_metadata() {
3131
let table = Table::new(url);
3232
let snapshot = table.snapshot(&engine, None).unwrap();
3333
let data: Vec<_> = snapshot
34-
.log_segment
34+
.log_segment()
3535
.replay_for_metadata(&engine)
3636
.unwrap()
3737
.try_collect()

kernel/src/scan/mod.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ impl Scan {
395395
// needed (currently just means no partition cols AND no column mapping but will be extended
396396
// for other transforms as we support them)
397397
let static_transform = (self.have_partition_cols
398-
|| self.snapshot.column_mapping_mode != ColumnMappingMode::None)
398+
|| self.snapshot.column_mapping_mode() != ColumnMappingMode::None)
399399
.then_some(Arc::new(Scan::get_static_transform(&self.all_fields)));
400400
let physical_predicate = match self.physical_predicate.clone() {
401401
PhysicalPredicate::StaticSkipAll => return Ok(None.into_iter().flatten()),
@@ -423,19 +423,19 @@ impl Scan {
423423
// NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping
424424
// when ~every checkpoint file will contain the adds and removes we are looking for.
425425
self.snapshot
426-
.log_segment
426+
.log_segment()
427427
.replay(engine, commit_read_schema, checkpoint_read_schema, None)
428428
}
429429

430430
/// Get global state that is valid for the entire scan. This is somewhat expensive so should
431431
/// only be called once per scan.
432432
pub fn global_scan_state(&self) -> GlobalScanState {
433433
GlobalScanState {
434-
table_root: self.snapshot.table_root.to_string(),
434+
table_root: self.snapshot.table_root().to_string(),
435435
partition_columns: self.snapshot.metadata().partition_columns.clone(),
436436
logical_schema: self.logical_schema.clone(),
437437
physical_schema: self.physical_schema.clone(),
438-
column_mapping_mode: self.snapshot.column_mapping_mode,
438+
column_mapping_mode: self.snapshot.column_mapping_mode(),
439439
}
440440
}
441441

@@ -479,7 +479,7 @@ impl Scan {
479479
);
480480

481481
let global_state = Arc::new(self.global_scan_state());
482-
let table_root = self.snapshot.table_root.clone();
482+
let table_root = self.snapshot.table_root().clone();
483483
let physical_predicate = self.physical_predicate();
484484
let all_fields = self.all_fields.clone();
485485
let have_partition_cols = self.have_partition_cols;

kernel/src/snapshot.rs

+21-36
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ use crate::actions::{Metadata, Protocol};
1010
use crate::log_segment::LogSegment;
1111
use crate::scan::ScanBuilder;
1212
use crate::schema::Schema;
13-
use crate::table_features::{
14-
column_mapping_mode, validate_schema_column_mapping, ColumnMappingMode,
15-
};
13+
use crate::table_configuration::TableConfiguration;
14+
use crate::table_features::ColumnMappingMode;
1615
use crate::table_properties::TableProperties;
1716
use crate::{DeltaResult, Engine, Error, FileSystemClient, Version};
1817

@@ -23,13 +22,8 @@ const LAST_CHECKPOINT_FILE_NAME: &str = "_last_checkpoint";
2322
/// have a defined schema (which may change over time for any given table), specific version, and
2423
/// frozen log segment.
2524
pub struct Snapshot {
26-
pub(crate) table_root: Url,
27-
pub(crate) log_segment: LogSegment,
28-
metadata: Metadata,
29-
protocol: Protocol,
30-
schema: Schema,
31-
table_properties: TableProperties,
32-
pub(crate) column_mapping_mode: ColumnMappingMode,
25+
log_segment: LogSegment,
26+
table_configuration: TableConfiguration,
3327
}
3428

3529
impl Drop for Snapshot {
@@ -43,7 +37,7 @@ impl std::fmt::Debug for Snapshot {
4337
f.debug_struct("Snapshot")
4438
.field("path", &self.log_segment.log_root.as_str())
4539
.field("version", &self.version())
46-
.field("metadata", &self.metadata)
40+
.field("metadata", &self.metadata())
4741
.finish()
4842
}
4943
}
@@ -80,67 +74,58 @@ impl Snapshot {
8074
engine: &dyn Engine,
8175
) -> DeltaResult<Self> {
8276
let (metadata, protocol) = log_segment.read_metadata(engine)?;
83-
84-
// important! before a read/write to the table we must check it is supported
85-
protocol.ensure_read_supported()?;
86-
87-
// validate column mapping mode -- all schema fields should be correctly (un)annotated
88-
let schema = metadata.parse_schema()?;
89-
let table_properties = metadata.parse_table_properties();
90-
let column_mapping_mode = column_mapping_mode(&protocol, &table_properties);
91-
validate_schema_column_mapping(&schema, column_mapping_mode)?;
92-
77+
let table_configuration =
78+
TableConfiguration::try_new(metadata, protocol, location, log_segment.end_version)?;
9379
Ok(Self {
94-
table_root: location,
9580
log_segment,
96-
metadata,
97-
protocol,
98-
schema,
99-
table_properties,
100-
column_mapping_mode,
81+
table_configuration,
10182
})
10283
}
10384

10485
/// Log segment this snapshot uses
10586
#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
106-
fn _log_segment(&self) -> &LogSegment {
87+
pub(crate) fn log_segment(&self) -> &LogSegment {
10788
&self.log_segment
10889
}
10990

11091
pub fn table_root(&self) -> &Url {
111-
&self.table_root
92+
self.table_configuration.table_root()
11293
}
11394

11495
/// Version of this `Snapshot` in the table.
11596
pub fn version(&self) -> Version {
116-
self.log_segment.end_version
97+
self.table_configuration().version()
11798
}
11899

119100
/// Table [`Schema`] at this `Snapshot`s version.
120101
pub fn schema(&self) -> &Schema {
121-
&self.schema
102+
self.table_configuration.schema()
122103
}
123104

124105
/// Table [`Metadata`] at this `Snapshot`s version.
125106
pub fn metadata(&self) -> &Metadata {
126-
&self.metadata
107+
self.table_configuration.metadata()
127108
}
128109

129110
/// Table [`Protocol`] at this `Snapshot`s version.
130111
pub fn protocol(&self) -> &Protocol {
131-
&self.protocol
112+
self.table_configuration.protocol()
132113
}
133114

134115
/// Get the [`TableProperties`] for this [`Snapshot`].
135116
pub fn table_properties(&self) -> &TableProperties {
136-
&self.table_properties
117+
self.table_configuration().table_properties()
118+
}
119+
/// Get the [`TableConfiguration`] for this [`Snapshot`].
120+
#[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
121+
pub(crate) fn table_configuration(&self) -> &TableConfiguration {
122+
&self.table_configuration
137123
}
138-
139124
/// Get the [column mapping
140125
/// mode](https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-mapping) at this
141126
/// `Snapshot`s version.
142127
pub fn column_mapping_mode(&self) -> ColumnMappingMode {
143-
self.column_mapping_mode
128+
self.table_configuration.column_mapping_mode()
144129
}
145130

146131
/// Create a [`ScanBuilder`] for an `Arc<Snapshot>`.

kernel/src/table_changes/mod.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -161,15 +161,15 @@ impl TableChanges {
161161
// we support CDF with those features enabled.
162162
//
163163
// Note: We must still check each metadata and protocol action in the CDF range.
164-
let check_snapshot = |snapshot: &Snapshot| -> DeltaResult<()> {
165-
ensure_cdf_read_supported(snapshot.protocol())?;
166-
check_cdf_table_properties(snapshot.table_properties())?;
167-
Ok(())
164+
let check_table_config = |snapshot: &Snapshot| {
165+
if snapshot.table_configuration().is_cdf_read_supported() {
166+
Ok(())
167+
} else {
168+
Err(Error::change_data_feed_unsupported(snapshot.version()))
169+
}
168170
};
169-
check_snapshot(&start_snapshot)
170-
.map_err(|_| Error::change_data_feed_unsupported(start_snapshot.version()))?;
171-
check_snapshot(&end_snapshot)
172-
.map_err(|_| Error::change_data_feed_unsupported(end_snapshot.version()))?;
171+
check_table_config(&start_snapshot)?;
172+
check_table_config(&end_snapshot)?;
173173

174174
// Verify that the start and end schemas are compatible. We must still check schema
175175
// compatibility for each schema update in the CDF range.

kernel/src/table_changes/scan.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ impl TableChangesScan {
211211
partition_columns: end_snapshot.metadata().partition_columns.clone(),
212212
logical_schema: self.logical_schema.clone(),
213213
physical_schema: self.physical_schema.clone(),
214-
column_mapping_mode: end_snapshot.column_mapping_mode,
214+
column_mapping_mode: end_snapshot.column_mapping_mode(),
215215
}
216216
}
217217

0 commit comments

Comments
 (0)