Skip to content

Commit c8c1f6b

Browse files
committed
nullable transform
1 parent b3b511b commit c8c1f6b

File tree

3 files changed

+23
-78
lines changed

3 files changed

+23
-78
lines changed

kernel/src/scan/data_skipping.rs

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use std::borrow::Cow;
21
use std::cmp::Ordering;
32
use std::sync::{Arc, LazyLock};
43

@@ -15,12 +14,14 @@ use crate::expressions::{
1514
use crate::kernel_predicates::{
1615
DataSkippingPredicateEvaluator, KernelPredicateEvaluator, KernelPredicateEvaluatorDefaults,
1716
};
18-
use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType};
17+
use crate::schema::{DataType, SchemaRef, SchemaTransform, StructField, StructType};
1918
use crate::{
2019
Engine, EngineData, ExpressionEvaluator, JsonHandler, PredicateEvaluator, RowVisitor as _,
2120
};
2221

2322
pub(crate) mod stats_schema;
23+
24+
use stats_schema::{NullableStatsTransform, NullCountStatsTransform};
2425
#[cfg(test)]
2526
mod tests;
2627

@@ -77,39 +78,6 @@ impl DataSkippingFilter {
7778
let (predicate, referenced_schema) = physical_predicate?;
7879
debug!("Creating a data skipping filter for {:#?}", predicate);
7980

80-
// Convert all fields into nullable, as stats may not be available for all columns
81-
// (and usually aren't for partition columns).
82-
struct NullableStatsTransform;
83-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
84-
fn transform_struct_field(
85-
&mut self,
86-
field: &'a StructField,
87-
) -> Option<Cow<'a, StructField>> {
88-
use Cow::*;
89-
let field = match self.transform(&field.data_type)? {
90-
Borrowed(_) if field.is_nullable() => Borrowed(field),
91-
data_type => Owned(StructField {
92-
name: field.name.clone(),
93-
data_type: data_type.into_owned(),
94-
nullable: true,
95-
metadata: field.metadata.clone(),
96-
}),
97-
};
98-
Some(field)
99-
}
100-
}
101-
102-
// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
103-
struct NullCountStatsTransform;
104-
impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
105-
fn transform_primitive(
106-
&mut self,
107-
_ptype: &'a PrimitiveType,
108-
) -> Option<Cow<'a, PrimitiveType>> {
109-
Some(Cow::Owned(PrimitiveType::Long))
110-
}
111-
}
112-
11381
let stats_schema = NullableStatsTransform
11482
.transform_struct(&referenced_schema)?
11583
.into_owned();

kernel/src/scan/data_skipping/stats_schema.rs

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,25 @@ pub(crate) fn expected_stats_schema(
111111
StructType::try_new(fields)
112112
}
113113

114+
/// Transforms a schema to make all fields nullable.
115+
/// Used for stats schemas where stats may not be available for all columns.
116+
pub(crate) struct NullableStatsTransform;
117+
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
118+
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
119+
use Cow::*;
120+
let field = match self.transform(&field.data_type)? {
121+
Borrowed(_) if field.is_nullable() => Borrowed(field),
122+
data_type => Owned(StructField {
123+
name: field.name.clone(),
124+
data_type: data_type.into_owned(),
125+
nullable: true,
126+
metadata: field.metadata.clone(),
127+
}),
128+
};
129+
Some(field)
130+
}
131+
}
132+
114133
// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG)
115134
#[allow(unused)]
116135
pub(crate) struct NullCountStatsTransform;
@@ -317,26 +336,6 @@ mod tests {
317336

318337
use super::*;
319338

320-
pub(crate) struct NullableStatsTransform;
321-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
322-
fn transform_struct_field(
323-
&mut self,
324-
field: &'a StructField,
325-
) -> Option<Cow<'a, StructField>> {
326-
use Cow::*;
327-
let field = match self.transform(&field.data_type)? {
328-
Borrowed(_) if field.is_nullable() => Borrowed(field),
329-
data_type => Owned(StructField {
330-
name: field.name.clone(),
331-
data_type: data_type.into_owned(),
332-
nullable: true,
333-
metadata: field.metadata.clone(),
334-
}),
335-
};
336-
Some(field)
337-
}
338-
}
339-
340339
#[test]
341340
fn test_should_include_column() {
342341
let full_name = vec![ColumnName::new(["lvl1", "lvl2", "lvl3", "lvl4"])];

kernel/src/transaction/mod.rs

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use std::borrow::Cow;
21
use std::collections::{HashMap, HashSet};
32
use std::iter;
43
use std::ops::Deref;
@@ -23,6 +22,7 @@ use crate::expressions::{column_name, ColumnName};
2322
use crate::expressions::{ArrayData, Scalar, StructData, Transform, UnaryExpressionOp::ToJson};
2423
use crate::path::{LogRoot, ParsedLogPath};
2524
use crate::row_tracking::{RowTrackingDomainMetadata, RowTrackingVisitor};
25+
use crate::scan::data_skipping::stats_schema::NullableStatsTransform;
2626
use crate::scan::log_replay::{
2727
get_scan_metadata_transform_expr, BASE_ROW_ID_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME,
2828
FILE_CONSTANT_VALUES_NAME, TAGS_NAME,
@@ -41,28 +41,6 @@ use crate::{
4141
};
4242
use delta_kernel_derive::internal_api;
4343

44-
// This is a workaround due to the fact that expression evaluation happens
45-
// on the whole EngineData instead of accounting for filtered rows, which can lead to null values in
46-
// required fields.
47-
// TODO: Move this to a common place (dedupe from data_skipping.rs) or remove when evaluations work
48-
// on FilteredEngineData directly.
49-
struct NullableStatsTransform;
50-
impl<'a> SchemaTransform<'a> for NullableStatsTransform {
51-
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
52-
use Cow::*;
53-
let field = match self.transform(&field.data_type)? {
54-
Borrowed(_) if field.is_nullable() => Borrowed(field),
55-
data_type => Owned(StructField {
56-
name: field.name.clone(),
57-
data_type: data_type.into_owned(),
58-
nullable: true,
59-
metadata: field.metadata.clone(),
60-
}),
61-
};
62-
Some(field)
63-
}
64-
}
65-
6644
/// Type alias for an iterator of [`EngineData`] results.
6745
pub(crate) type EngineDataResultIterator<'a> =
6846
Box<dyn Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send + 'a>;

0 commit comments

Comments
 (0)