
Commit b3b511b ("not propagated")
1 parent: bc9a5d8

2 files changed: 90 additions, 15 deletions


kernel/src/engine/arrow_expression/apply_schema.rs (82 additions, 11 deletions)
```diff
@@ -15,24 +15,23 @@ use crate::engine::ensure_data_types::ensure_data_types;
 use crate::error::{DeltaResult, Error};
 use crate::schema::{ArrayType, DataType, MapType, Schema, StructField};
 
-// Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch where the
-// names of fields, nullable, and metadata in the struct have been transformed to match those in
-// schema specified by `schema`
+// Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch` where
+// the names of fields, nullable, and metadata in the struct have been transformed to match those
+// in the schema specified by `schema`.
+//
+// Note: If the struct array has top-level nulls, the child columns are expected to already have
+// those nulls propagated. Arrow's JSON reader does this automatically, and parquet data goes
+// through `fix_nested_null_masks` which handles it. We decompose the struct and discard its null
+// buffer since RecordBatch cannot have top-level nulls.
 pub(crate) fn apply_schema(array: &dyn Array, schema: &DataType) -> DeltaResult<RecordBatch> {
     let DataType::Struct(struct_schema) = schema else {
         return Err(Error::generic(
             "apply_schema at top-level must be passed a struct schema",
         ));
     };
     let applied = apply_schema_to_struct(array, struct_schema)?;
-    let (fields, columns, nulls) = applied.into_parts();
-    if let Some(nulls) = nulls {
-        if nulls.null_count() != 0 {
-            return Err(Error::invalid_struct_data(
-                "Top-level nulls in struct are not supported",
-            ));
-        }
-    }
+    let (fields, columns, _nulls) = applied.into_parts();
+
     Ok(RecordBatch::try_new(
         Arc::new(ArrowSchema::new(fields)),
         columns,
```
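
To make the invariant in the new comment concrete, here is a minimal standalone sketch (using the plain `arrow` crate rather than the kernel's re-exports; the `main` wrapper and field names are illustrative, not kernel code). It shows that when the child columns already carry the parent struct's nulls, the struct-level null buffer can be discarded and the children moved into a `RecordBatch` without losing any null information:

```rust
use std::sync::Arc;

use arrow::array::{Array, Int32Array, StructArray};
use arrow::buffer::{BooleanBuffer, NullBuffer};
use arrow::datatypes::{DataType, Field, Fields, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    // Child column with the parent's null already propagated into row 1.
    let child = Int32Array::from(vec![Some(1), None]);
    let fields = Fields::from(vec![Field::new("a", DataType::Int32, true)]);

    // Struct-level null buffer: row 0 valid, row 1 null.
    let nulls = NullBuffer::new(BooleanBuffer::from(vec![true, false]));
    let s = StructArray::try_new(fields, vec![Arc::new(child)], Some(nulls)).unwrap();

    // Decompose the struct and discard its null buffer, as apply_schema now does;
    // RecordBatch has no notion of row-level nulls.
    let (fields, columns, _nulls) = s.into_parts();
    let batch = RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).unwrap();

    // The null survives because the child column already carried it.
    assert!(batch.column(0).is_null(1));
}
```

If the children had not had the nulls propagated (say, a child holding an arbitrary value at a parent-null row), dropping the struct's null buffer would silently surface that value, which is why the comment pins this precondition on the JSON reader and `fix_nested_null_masks`.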
```diff
@@ -191,6 +190,7 @@ mod apply_schema_validation_tests {
     use std::sync::Arc;
 
     use crate::arrow::array::{Int32Array, StructArray};
+    use crate::arrow::buffer::{BooleanBuffer, NullBuffer};
     use crate::arrow::datatypes::{
         DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
     };
```
```diff
@@ -238,4 +238,75 @@ mod apply_schema_validation_tests {
             StructField::new("b", DataType::INTEGER, false),
         ])
     }
+
+    /// Test that apply_schema handles structs with top-level nulls correctly.
+    ///
+    /// This simulates a Delta log scenario where each row is one action type (add, remove, etc.).
+    /// When extracting `add.stats_parsed`, rows where `add` is null (e.g., remove actions) should
+    /// have null child columns. The child columns are expected to already have nulls propagated
+    /// (Arrow's JSON reader does this, and parquet data goes through `fix_nested_null_masks`).
+    #[test]
+    fn test_apply_schema_handles_top_level_nulls() {
+        // Create a struct array with 4 rows where rows 1 and 3 have top-level nulls.
+        // This simulates: [add_action, remove_action, add_action, remove_action]
+        // where remove_action rows have null for the entire struct.
+        let field_a = ArrowField::new("a", ArrowDataType::Int32, true);
+        let field_b = ArrowField::new("b", ArrowDataType::Int32, true);
+        let schema = ArrowSchema::new(vec![field_a, field_b]);
+
+        // Child columns with nulls already propagated (simulating what Arrow readers do).
+        // Rows 1 and 3 are null because the parent struct is null at those positions.
+        let a_data = Int32Array::from(vec![Some(1), None, Some(3), None]);
+        let b_data = Int32Array::from(vec![Some(10), None, Some(30), None]);
+
+        // Top-level struct nulls: rows 0 and 2 are valid, rows 1 and 3 are null
+        let null_buffer = NullBuffer::new(BooleanBuffer::from(vec![true, false, true, false]));
+
+        let struct_array = StructArray::try_new(
+            schema.fields.clone(),
+            vec![Arc::new(a_data), Arc::new(b_data)],
+            Some(null_buffer),
+        )
+        .unwrap();
+
+        // Target schema with nullable fields
+        let target_schema = DataType::Struct(Box::new(StructType::new_unchecked([
+            StructField::new("a", DataType::INTEGER, true),
+            StructField::new("b", DataType::INTEGER, true),
+        ])));
+
+        // Apply schema - should successfully convert to RecordBatch
+        let result = apply_schema(&struct_array, &target_schema).unwrap();
+
+        assert_eq!(result.num_rows(), 4);
+        assert_eq!(result.num_columns(), 2);
+
+        // Verify columns preserve nulls from child arrays
+        let col_a = result.column(0);
+        assert!(col_a.is_valid(0), "Row 0 should be valid");
+        assert!(col_a.is_null(1), "Row 1 should be null");
+        assert!(col_a.is_valid(2), "Row 2 should be valid");
+        assert!(col_a.is_null(3), "Row 3 should be null");
+
+        let col_b = result.column(1);
+        assert!(col_b.is_valid(0), "Row 0 should be valid");
+        assert!(col_b.is_null(1), "Row 1 should be null");
+        assert!(col_b.is_valid(2), "Row 2 should be valid");
+        assert!(col_b.is_null(3), "Row 3 should be null");
+
+        // Verify the actual values for valid rows
+        let col_a = col_a
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("column a should be Int32Array");
+        let col_b = col_b
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("column b should be Int32Array");
+
+        assert_eq!(col_a.value(0), 1);
+        assert_eq!(col_a.value(2), 3);
+        assert_eq!(col_b.value(0), 10);
+        assert_eq!(col_b.value(2), 30);
+    }
 }
```

kernel/src/engine/arrow_expression/tests.rs (8 additions, 4 deletions)
```diff
@@ -827,6 +827,8 @@ fn test_create_one_mismatching_scalar_types() {
 
 #[test]
 fn test_create_one_not_null_struct() {
+    // Creating a NOT NULL struct field with null values should error.
+    // The error comes from Arrow's RecordBatch validation (non-nullable column has nulls).
     let values: &[Scalar] = &[
         Scalar::Null(KernelDataType::INTEGER),
         Scalar::Null(KernelDataType::INTEGER),
@@ -841,23 +843,25 @@ fn test_create_one_not_null_struct() {
     let handler = ArrowEvaluationHandler;
     assert_result_error_with_message(
         handler.create_one(schema, values),
-        "Invalid struct data: Top-level nulls in struct are not supported",
+        "Column 'a' is declared as non-nullable but contains null values",
     );
 }
 
 #[test]
 fn test_create_one_top_level_null() {
+    // Creating a NOT NULL field with null value should error.
+    // The error comes from Arrow's RecordBatch validation.
     let values = &[Scalar::Null(KernelDataType::INTEGER)];
     let handler = ArrowEvaluationHandler;
 
     let schema = Arc::new(StructType::new_unchecked([StructField::not_null(
         "col_1",
         KernelDataType::INTEGER,
     )]));
-    assert!(matches!(
+    assert_result_error_with_message(
         handler.create_one(schema, values),
-        Err(Error::InvalidStructData(_))
-    ));
+        "Column 'col_1' is declared as non-nullable but contains null values",
+    );
 }
 
 #[test]
```
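
The updated expected strings are Arrow's own: `RecordBatch::try_new` validates that a column declared non-nullable contains no nulls, so the kernel no longer needs its bespoke `InvalidStructData` check here. A minimal sketch of that validation (again plain `arrow`, outside the kernel; the `main` wrapper is illustrative):

```rust
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    // Field "a" is declared non-nullable, but the column contains a null.
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let col = Int32Array::from(vec![Some(1), None]);

    // RecordBatch::try_new rejects the batch during validation.
    let err = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap_err();
    assert!(err
        .to_string()
        .contains("Column 'a' is declared as non-nullable but contains null values"));
}
```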
