Skip to content

Commit 318087c

Browse files
committed
nullable transform
1 parent b3b511b commit 318087c

File tree

3 files changed

+22
-65
lines changed

3 files changed

+22
-65
lines changed

kernel/src/scan/data_skipping.rs

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ use crate::{
2121
};
2222

2323
pub(crate) mod stats_schema;
24+
25+
use stats_schema::NullableStatsTransform;
2426
#[cfg(test)]
2527
mod tests;
2628

@@ -77,28 +79,6 @@ impl DataSkippingFilter {
7779
let (predicate, referenced_schema) = physical_predicate?;
7880
debug!("Creating a data skipping filter for {:#?}", predicate);
7981

80-
// Convert all fields into nullable, as stats may not be available for all columns
81-
// (and usually aren't for partition columns).
82-
struct NullableStatsTransform;
83-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
84-
fn transform_struct_field(
85-
&mut self,
86-
field: &'a StructField,
87-
) -> Option<Cow<'a, StructField>> {
88-
use Cow::*;
89-
let field = match self.transform(&field.data_type)? {
90-
Borrowed(_) if field.is_nullable() => Borrowed(field),
91-
data_type => Owned(StructField {
92-
name: field.name.clone(),
93-
data_type: data_type.into_owned(),
94-
nullable: true,
95-
metadata: field.metadata.clone(),
96-
}),
97-
};
98-
Some(field)
99-
}
100-
}
101-
10282
// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
10383
struct NullCountStatsTransform;
10484
impl<'a> SchemaTransform<'a> for NullCountStatsTransform {

kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,25 @@ pub(crate) fn expected_stats_schema(
111111
StructType::try_new(fields)
112112
}
113113

114+
/// Transforms a schema to make all fields nullable.
115+
/// Used for stats schemas where stats may not be available for all columns.
116+
pub(crate) struct NullableStatsTransform;
117+
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
118+
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
119+
use Cow::*;
120+
let field = match self.transform(&field.data_type)? {
121+
Borrowed(_) if field.is_nullable() => Borrowed(field),
122+
data_type => Owned(StructField {
123+
name: field.name.clone(),
124+
data_type: data_type.into_owned(),
125+
nullable: true,
126+
metadata: field.metadata.clone(),
127+
}),
128+
};
129+
Some(field)
130+
}
131+
}
132+
114133
// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
115134
#[allow(unused)]
116135
pub(crate) struct NullCountStatsTransform;
@@ -317,26 +336,6 @@ mod tests {
317336

318337
use super::*;
319338

320-
pub(crate) struct NullableStatsTransform;
321-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
322-
fn transform_struct_field(
323-
&mut self,
324-
field: &'a StructField,
325-
) -> Option<Cow<'a, StructField>> {
326-
use Cow::*;
327-
let field = match self.transform(&field.data_type)? {
328-
Borrowed(_) if field.is_nullable() => Borrowed(field),
329-
data_type => Owned(StructField {
330-
name: field.name.clone(),
331-
data_type: data_type.into_owned(),
332-
nullable: true,
333-
metadata: field.metadata.clone(),
334-
}),
335-
};
336-
Some(field)
337-
}
338-
}
339-
340339
#[test]
341340
fn test_should_include_column() {
342341
let full_name = vec![ColumnName::new(["lvl1", "lvl2", "lvl3", "lvl4"])];

kernel/src/transaction/mod.rs

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use std::borrow::Cow;
21
use std::collections::{HashMap, HashSet};
32
use std::iter;
43
use std::ops::Deref;
@@ -23,6 +22,7 @@ use crate::expressions::{column_name, ColumnName};
2322
use crate::expressions::{ArrayData, Scalar, StructData, Transform, UnaryExpressionOp::ToJson};
2423
use crate::path::{LogRoot, ParsedLogPath};
2524
use crate::row_tracking::{RowTrackingDomainMetadata, RowTrackingVisitor};
25+
use crate::scan::data_skipping::stats_schema::NullableStatsTransform;
2626
use crate::scan::log_replay::{
2727
get_scan_metadata_transform_expr, BASE_ROW_ID_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME,
2828
FILE_CONSTANT_VALUES_NAME, TAGS_NAME,
@@ -41,28 +41,6 @@ use crate::{
4141
};
4242
use delta_kernel_derive::internal_api;
4343

44-
// This is a workaround due to the fact that expression evaluation happens
45-
// on the whole EngineData instead of accounting for filtered rows, which can lead to null values in
46-
// required fields.
47-
// TODO: Move this to a common place (dedupe from data_skipping.rs) or remove when evaluations work
48-
// on FilteredEngineData directly.
49-
struct NullableStatsTransform;
50-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
51-
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
52-
use Cow::*;
53-
let field = match self.transform(&field.data_type)? {
54-
Borrowed(_) if field.is_nullable() => Borrowed(field),
55-
data_type => Owned(StructField {
56-
name: field.name.clone(),
57-
data_type: data_type.into_owned(),
58-
nullable: true,
59-
metadata: field.metadata.clone(),
60-
}),
61-
};
62-
Some(field)
63-
}
64-
}
65-
6644
/// Type alias for an iterator of [`EngineData`] results.
6745
pub(crate) type EngineDataResultIterator<'a> =
6846
Box<dyn Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send + 'a>;

0 commit comments

Comments
 (0)