Skip to content

Commit e614153

Browse files
refactor!: add TryIntoKernel/Arrow traits (#946)
## What changes are proposed in this pull request? As discovered in crate split exploration we implement `TryFrom`/`TryInto` for converting between kernel types and arrow types. This is problematic for two reasons: (1) in the future whenever we do the split, the current implementation breaks the orphan rule and (2) it's difficult to reason about the types you are converting to/from. This PR solves both issues by introducing new traits: `TryFromKernel` and `TryFromArrow` which can be implemented to give `Self::try_from_kernel(...)` and `Self::try_from_arrow(...)` respectively. Additionally (similar to std lib) we automatically derive `TryIntoArrow` for types that implement `TryFromKernel` and `TryIntoKernel` for types that implement `TryFromArrow`. ### This PR affects the following public APIs Removes old From/Into implementations for kernel schema types, replaces with `TryFromKernel`/`TryIntoKernel`/`TryFromArrow`/`TryIntoArrow`. migration will usually be as simple as changing a `.try_into()` to a `.try_into_kernel()` or `.try_into_arrow()`. ## How was this change tested? Existing UT
1 parent ce24aa3 commit e614153

File tree

15 files changed

+168
-121
lines changed

15 files changed

+168
-121
lines changed

integration-tests/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use delta_kernel::arrow::datatypes::{DataType, Field, Schema};
2+
use delta_kernel::engine::arrow_conversion::TryFromArrow as _;
23

34
fn create_arrow_schema() -> Schema {
45
let field_a = Field::new("a", DataType::Int64, false);
@@ -17,7 +18,7 @@ fn main() {
1718
let arrow_schema = create_arrow_schema();
1819
let kernel_schema = create_kernel_schema();
1920
let converted: delta_kernel::schema::Schema =
20-
delta_kernel::schema::Schema::try_from(&arrow_schema).expect("couldn't convert");
21+
delta_kernel::schema::Schema::try_from_arrow(&arrow_schema).expect("couldn't convert");
2122
assert!(kernel_schema == converted);
2223
println!("Okay, made it");
2324
}

kernel/src/engine/arrow_conversion.rs

Lines changed: 110 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//! Conversions from kernel types to arrow types
1+
//! Conversions from kernel schema types to arrow schema types.
22
33
use std::sync::Arc;
44

@@ -19,19 +19,57 @@ pub(crate) const MAP_ROOT_DEFAULT: &str = "key_value";
1919
pub(crate) const MAP_KEY_DEFAULT: &str = "key";
2020
pub(crate) const MAP_VALUE_DEFAULT: &str = "value";
2121

22-
impl TryFrom<&StructType> for ArrowSchema {
23-
type Error = ArrowError;
22+
/// Convert a kernel type into an arrow type (automatically implemented for all types that
23+
/// implement [`TryFromKernel`])
24+
pub trait TryIntoArrow<ArrowType> {
25+
fn try_into_arrow(self) -> Result<ArrowType, ArrowError>;
26+
}
2427

25-
fn try_from(s: &StructType) -> Result<Self, ArrowError> {
26-
let fields: Vec<ArrowField> = s.fields().map(TryInto::try_into).try_collect()?;
27-
Ok(ArrowSchema::new(fields))
28+
/// Convert an arrow type into a kernel type (a similar [`TryIntoKernel`] trait is automatically
29+
/// implemented for all types that implement [`TryFromArrow`])
30+
pub trait TryFromArrow<ArrowType>: Sized {
31+
fn try_from_arrow(t: ArrowType) -> Result<Self, ArrowError>;
32+
}
33+
34+
/// Convert an arrow type into a kernel type (automatically implemented for all types that
35+
/// implement [`TryFromArrow`])
36+
pub trait TryIntoKernel<KernelType> {
37+
fn try_into_kernel(self) -> Result<KernelType, ArrowError>;
38+
}
39+
40+
/// Convert a kernel type into an arrow type (a similar [`TryIntoArrow`] trait is automatically
41+
/// implemented for all types that implement [`TryFromKernel`])
42+
pub trait TryFromKernel<KernelType>: Sized {
43+
fn try_from_kernel(t: KernelType) -> Result<Self, ArrowError>;
44+
}
45+
46+
impl<KernelType, ArrowType> TryIntoArrow<ArrowType> for KernelType
47+
where
48+
ArrowType: TryFromKernel<KernelType>,
49+
{
50+
fn try_into_arrow(self) -> Result<ArrowType, ArrowError> {
51+
ArrowType::try_from_kernel(self)
2852
}
2953
}
3054

31-
impl TryFrom<&StructField> for ArrowField {
32-
type Error = ArrowError;
55+
impl<KernelType, ArrowType> TryIntoKernel<KernelType> for ArrowType
56+
where
57+
KernelType: TryFromArrow<ArrowType>,
58+
{
59+
fn try_into_kernel(self) -> Result<KernelType, ArrowError> {
60+
KernelType::try_from_arrow(self)
61+
}
62+
}
63+
64+
impl TryFromKernel<&StructType> for ArrowSchema {
65+
fn try_from_kernel(s: &StructType) -> Result<Self, ArrowError> {
66+
let fields: Vec<ArrowField> = s.fields().map(|f| f.try_into_arrow()).try_collect()?;
67+
Ok(ArrowSchema::new(fields))
68+
}
69+
}
3370

34-
fn try_from(f: &StructField) -> Result<Self, ArrowError> {
71+
impl TryFromKernel<&StructField> for ArrowField {
72+
fn try_from_kernel(f: &StructField) -> Result<Self, ArrowError> {
3573
let metadata = f
3674
.metadata()
3775
.iter()
@@ -42,45 +80,33 @@ impl TryFrom<&StructField> for ArrowField {
4280
.collect::<Result<_, serde_json::Error>>()
4381
.map_err(|err| ArrowError::JsonError(err.to_string()))?;
4482

45-
let field = ArrowField::new(
46-
f.name(),
47-
ArrowDataType::try_from(f.data_type())?,
48-
f.is_nullable(),
49-
)
50-
.with_metadata(metadata);
83+
let field = ArrowField::new(f.name(), f.data_type().try_into_arrow()?, f.is_nullable())
84+
.with_metadata(metadata);
5185

5286
Ok(field)
5387
}
5488
}
5589

56-
impl TryFrom<&ArrayType> for ArrowField {
57-
type Error = ArrowError;
58-
59-
fn try_from(a: &ArrayType) -> Result<Self, ArrowError> {
90+
impl TryFromKernel<&ArrayType> for ArrowField {
91+
fn try_from_kernel(a: &ArrayType) -> Result<Self, ArrowError> {
6092
Ok(ArrowField::new(
6193
LIST_ARRAY_ROOT,
62-
ArrowDataType::try_from(a.element_type())?,
94+
a.element_type().try_into_arrow()?,
6395
a.contains_null(),
6496
))
6597
}
6698
}
6799

68-
impl TryFrom<&MapType> for ArrowField {
69-
type Error = ArrowError;
70-
71-
fn try_from(a: &MapType) -> Result<Self, ArrowError> {
100+
impl TryFromKernel<&MapType> for ArrowField {
101+
fn try_from_kernel(a: &MapType) -> Result<Self, ArrowError> {
72102
Ok(ArrowField::new(
73103
MAP_ROOT_DEFAULT,
74104
ArrowDataType::Struct(
75105
vec![
76-
ArrowField::new(
77-
MAP_KEY_DEFAULT,
78-
ArrowDataType::try_from(a.key_type())?,
79-
false,
80-
),
106+
ArrowField::new(MAP_KEY_DEFAULT, a.key_type().try_into_arrow()?, false),
81107
ArrowField::new(
82108
MAP_VALUE_DEFAULT,
83-
ArrowDataType::try_from(a.value_type())?,
109+
a.value_type().try_into_arrow()?,
84110
a.value_contains_null(),
85111
),
86112
]
@@ -91,10 +117,8 @@ impl TryFrom<&MapType> for ArrowField {
91117
}
92118
}
93119

94-
impl TryFrom<&DataType> for ArrowDataType {
95-
type Error = ArrowError;
96-
97-
fn try_from(t: &DataType) -> Result<Self, ArrowError> {
120+
impl TryFromKernel<&DataType> for ArrowDataType {
121+
fn try_from_kernel(t: &DataType) -> Result<Self, ArrowError> {
98122
match t {
99123
DataType::Primitive(p) => {
100124
match p {
@@ -128,54 +152,49 @@ impl TryFrom<&DataType> for ArrowDataType {
128152
}
129153
DataType::Struct(s) => Ok(ArrowDataType::Struct(
130154
s.fields()
131-
.map(TryInto::try_into)
155+
.map(TryIntoArrow::try_into_arrow)
132156
.collect::<Result<Vec<ArrowField>, ArrowError>>()?
133157
.into(),
134158
)),
135-
DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(a.as_ref().try_into()?))),
136-
DataType::Map(m) => Ok(ArrowDataType::Map(Arc::new(m.as_ref().try_into()?), false)),
159+
DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(a.as_ref().try_into_arrow()?))),
160+
DataType::Map(m) => Ok(ArrowDataType::Map(
161+
Arc::new(m.as_ref().try_into_arrow()?),
162+
false,
163+
)),
137164
}
138165
}
139166
}
140167

141-
impl TryFrom<&ArrowSchema> for StructType {
142-
type Error = ArrowError;
143-
144-
fn try_from(arrow_schema: &ArrowSchema) -> Result<Self, ArrowError> {
168+
impl TryFromArrow<&ArrowSchema> for StructType {
169+
fn try_from_arrow(arrow_schema: &ArrowSchema) -> Result<Self, ArrowError> {
145170
StructType::try_new(
146171
arrow_schema
147172
.fields()
148173
.iter()
149-
.map(|field| field.as_ref().try_into()),
174+
.map(|field| field.as_ref().try_into_kernel()),
150175
)
151176
}
152177
}
153178

154-
impl TryFrom<ArrowSchemaRef> for StructType {
155-
type Error = ArrowError;
156-
157-
fn try_from(arrow_schema: ArrowSchemaRef) -> Result<Self, ArrowError> {
158-
arrow_schema.as_ref().try_into()
179+
impl TryFromArrow<ArrowSchemaRef> for StructType {
180+
fn try_from_arrow(arrow_schema: ArrowSchemaRef) -> Result<Self, ArrowError> {
181+
arrow_schema.as_ref().try_into_kernel()
159182
}
160183
}
161184

162-
impl TryFrom<&ArrowField> for StructField {
163-
type Error = ArrowError;
164-
165-
fn try_from(arrow_field: &ArrowField) -> Result<Self, ArrowError> {
185+
impl TryFromArrow<&ArrowField> for StructField {
186+
fn try_from_arrow(arrow_field: &ArrowField) -> Result<Self, ArrowError> {
166187
Ok(StructField::new(
167188
arrow_field.name().clone(),
168-
DataType::try_from(arrow_field.data_type())?,
189+
DataType::try_from_arrow(arrow_field.data_type())?,
169190
arrow_field.is_nullable(),
170191
)
171192
.with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v))))
172193
}
173194
}
174195

175-
impl TryFrom<&ArrowDataType> for DataType {
176-
type Error = ArrowError;
177-
178-
fn try_from(arrow_datatype: &ArrowDataType) -> Result<Self, ArrowError> {
196+
impl TryFromArrow<&ArrowDataType> for DataType {
197+
fn try_from_arrow(arrow_datatype: &ArrowDataType) -> Result<Self, ArrowError> {
179198
match arrow_datatype {
180199
ArrowDataType::Utf8 => Ok(DataType::STRING),
181200
ArrowDataType::LargeUtf8 => Ok(DataType::STRING),
@@ -212,28 +231,38 @@ impl TryFrom<&ArrowDataType> for DataType {
212231
{
213232
Ok(DataType::TIMESTAMP)
214233
}
215-
ArrowDataType::Struct(fields) => {
216-
DataType::try_struct_type(fields.iter().map(|field| field.as_ref().try_into()))
217-
}
218-
ArrowDataType::List(field) => {
219-
Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into())
220-
}
221-
ArrowDataType::ListView(field) => {
222-
Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into())
223-
}
224-
ArrowDataType::LargeList(field) => {
225-
Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into())
226-
}
227-
ArrowDataType::LargeListView(field) => {
228-
Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into())
229-
}
230-
ArrowDataType::FixedSizeList(field, _) => {
231-
Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into())
232-
}
234+
ArrowDataType::Struct(fields) => DataType::try_struct_type(
235+
fields.iter().map(|field| field.as_ref().try_into_kernel()),
236+
),
237+
ArrowDataType::List(field) => Ok(ArrayType::new(
238+
(*field).data_type().try_into_kernel()?,
239+
(*field).is_nullable(),
240+
)
241+
.into()),
242+
ArrowDataType::ListView(field) => Ok(ArrayType::new(
243+
(*field).data_type().try_into_kernel()?,
244+
(*field).is_nullable(),
245+
)
246+
.into()),
247+
ArrowDataType::LargeList(field) => Ok(ArrayType::new(
248+
(*field).data_type().try_into_kernel()?,
249+
(*field).is_nullable(),
250+
)
251+
.into()),
252+
ArrowDataType::LargeListView(field) => Ok(ArrayType::new(
253+
(*field).data_type().try_into_kernel()?,
254+
(*field).is_nullable(),
255+
)
256+
.into()),
257+
ArrowDataType::FixedSizeList(field, _) => Ok(ArrayType::new(
258+
(*field).data_type().try_into_kernel()?,
259+
(*field).is_nullable(),
260+
)
261+
.into()),
233262
ArrowDataType::Map(field, _) => {
234263
if let ArrowDataType::Struct(struct_fields) = field.data_type() {
235-
let key_type = DataType::try_from(struct_fields[0].data_type())?;
236-
let value_type = DataType::try_from(struct_fields[1].data_type())?;
264+
let key_type = DataType::try_from_arrow(struct_fields[0].data_type())?;
265+
let value_type = DataType::try_from_arrow(struct_fields[1].data_type())?;
237266
let value_type_nullable = struct_fields[1].is_nullable();
238267
Ok(MapType::new(key_type, value_type, value_type_nullable).into())
239268
} else {
@@ -242,7 +271,9 @@ impl TryFrom<&ArrowDataType> for DataType {
242271
}
243272
// Dictionary types are just an optimized in-memory representation of an array.
244273
// Schema-wise, they are the same as the value type.
245-
ArrowDataType::Dictionary(_, value_type) => Ok(value_type.as_ref().try_into()?),
274+
ArrowDataType::Dictionary(_, value_type) => {
275+
Ok(value_type.as_ref().try_into_kernel()?)
276+
}
246277
s => Err(ArrowError::SchemaError(format!(
247278
"Invalid data type for Delta Lake: {s}"
248279
))),
@@ -252,6 +283,7 @@ impl TryFrom<&ArrowDataType> for DataType {
252283

253284
#[cfg(test)]
254285
mod tests {
286+
use super::*;
255287
use crate::engine::arrow_conversion::ArrowField;
256288
use crate::{
257289
schema::{DataType, StructField},
@@ -265,7 +297,7 @@ mod tests {
265297
metadata.insert("description", "hello world".to_owned());
266298
let struct_field = StructField::not_null("name", DataType::STRING).with_metadata(metadata);
267299

268-
let arrow_field = ArrowField::try_from(&struct_field)?;
300+
let arrow_field = ArrowField::try_from_kernel(&struct_field)?;
269301
let new_metadata = arrow_field.metadata();
270302

271303
assert_eq!(

kernel/src/engine/arrow_expression/mod.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crate::arrow::datatypes::{
66
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
77
};
88

9+
use super::arrow_conversion::{TryFromKernel as _, TryIntoArrow as _};
910
use crate::engine::arrow_data::ArrowEngineData;
1011
use crate::error::{DeltaResult, Error};
1112
use crate::expressions::{Expression, Predicate, Scalar};
@@ -64,7 +65,7 @@ impl ArrayBuilderAs for StructFieldBuilder<'_> {
6465
impl Scalar {
6566
/// Convert scalar to arrow array.
6667
pub fn to_array(&self, num_rows: usize) -> DeltaResult<ArrayRef> {
67-
let data_type = ArrowDataType::try_from(&self.data_type())?;
68+
let data_type = ArrowDataType::try_from_kernel(&self.data_type())?;
6869
let mut builder = array::make_builder(&data_type, num_rows);
6970
self.append_to(&mut builder, num_rows)?;
7071
Ok(builder.finish())
@@ -301,7 +302,7 @@ impl EvaluationHandler for ArrowEvaluationHandler {
301302
.map(|field| Scalar::Null(field.data_type().clone()).to_array(1))
302303
.try_collect()?;
303304
let record_batch =
304-
RecordBatch::try_new(Arc::new(output_schema.as_ref().try_into()?), arrays)?;
305+
RecordBatch::try_new(Arc::new(output_schema.as_ref().try_into_arrow()?), arrays)?;
305306
Ok(Box::new(ArrowEngineData::new(record_batch)))
306307
}
307308
}
@@ -321,7 +322,7 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator {
321322
.downcast_ref::<ArrowEngineData>()
322323
.ok_or_else(|| Error::engine_data_type("ArrowEngineData"))?
323324
.record_batch();
324-
let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into()?;
325+
let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into_arrow()?;
325326
// TODO: make sure we have matching schemas for validation
326327
// if batch.schema().as_ref() != &input_schema {
327328
// return Err(Error::Generic(format!(
@@ -335,7 +336,7 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator {
335336
apply_schema(&array_ref, &self.output_type)?
336337
} else {
337338
let array_ref = apply_schema_to(&array_ref, &self.output_type)?;
338-
let arrow_type: ArrowDataType = ArrowDataType::try_from(&self.output_type)?;
339+
let arrow_type = ArrowDataType::try_from_kernel(&self.output_type)?;
339340
let schema = ArrowSchema::new(vec![ArrowField::new("output", arrow_type, true)]);
340341
RecordBatch::try_new(Arc::new(schema), vec![array_ref])?
341342
};
@@ -357,7 +358,7 @@ impl PredicateEvaluator for DefaultPredicateEvaluator {
357358
.downcast_ref::<ArrowEngineData>()
358359
.ok_or_else(|| Error::engine_data_type("ArrowEngineData"))?
359360
.record_batch();
360-
let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into()?;
361+
let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into_arrow()?;
361362
// TODO: make sure we have matching schemas for validation
362363
// if batch.schema().as_ref() != &input_schema {
363364
// return Err(Error::Generic(format!(

kernel/src/engine/arrow_expression/tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,7 @@ fn test_null_row() {
523523
let handler = ArrowEvaluationHandler;
524524
let result = handler.null_row(schema.clone()).unwrap();
525525
let expected = RecordBatch::try_new(
526-
Arc::new(schema.as_ref().try_into().unwrap()),
526+
Arc::new(schema.as_ref().try_into_arrow().unwrap()),
527527
vec![
528528
Arc::new(StructArray::new_null(
529529
[

0 commit comments

Comments
 (0)