Skip to content

Commit 5f48dea

Browse files
authored
Merge pull request #83 from roeap/partition-values
Leverage more engine capabilities in data skipping 2/n
2 parents d8d3eb3 + 6cb4fe4 commit 5f48dea

24 files changed

+672
-251
lines changed

acceptance/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ tar = "0.4"
2626

2727
[dev-dependencies]
2828
arrow = { version = "^49.0", features = ["json", "prettyprint"] }
29-
datatest-stable = "0.1.3"
29+
datatest-stable = "0.2"
3030
test-log = { version = "0.2", default-features = false, features = ["trace"] }
3131
tempfile = "3"
3232
test-case = { version = "3.1.0" }

kernel/Cargo.toml

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,14 @@ version.workspace = true
1010

1111
[dependencies]
1212
arrow-array = { version = "^49.0" }
13-
arrow-arith = { version = "^49.0" }
14-
arrow-json = { version = "^49.0" }
15-
arrow-ord = { version = "^49.0" }
16-
arrow-schema = { version = "^49.0" }
1713
arrow-select = { version = "^49.0" }
1814
bytes = "1.4"
19-
chrono = { version = "0.4", optional = true }
15+
chrono = { version = "0.4" }
2016
either = "1.8"
2117
fix-hidden-lifetime-bug = "0.2"
18+
indexmap = "2.2.1"
2219
itertools = "0.12"
2320
lazy_static = "1.4"
24-
# need to generalize over arrow, arrow2 and diff parquet etc. (BYOP)
2521
regex = "1.8"
2622
roaring = "0.10.1"
2723
serde = { version = "1", features = ["derive"] }
@@ -37,6 +33,10 @@ z85 = "3.0.5"
3733
visibility = "0.1.0"
3834

3935
# Used in default client
36+
arrow-arith = { version = "^49.0", optional = true }
37+
arrow-json = { version = "^49.0", optional = true }
38+
arrow-ord = { version = "^49.0", optional = true }
39+
arrow-schema = { version = "^49.0", optional = true }
4040
futures = { version = "0.3", optional = true }
4141
object_store = { version = "^0.8.0", optional = true }
4242
parquet = { version = "^49.0", optional = true, features = [
@@ -49,7 +49,15 @@ tokio = { version = "1", optional = true, features = ["rt-multi-thread"] }
4949

5050
[features]
5151
default = ["default-client"]
52-
default-client = ["chrono", "futures", "object_store", "parquet"]
52+
default-client = [
53+
"arrow-arith",
54+
"arrow-json",
55+
"arrow-ord",
56+
"arrow-schema",
57+
"futures",
58+
"object_store",
59+
"parquet",
60+
]
5361
developer-visibility = []
5462

5563
[dev-dependencies]

kernel/examples/dump-table/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ fn main() {
112112
let scan = ScanBuilder::new(snapshot).build();
113113

114114
let schema = scan.schema();
115-
let header_names = schema.fields.iter().map(|field| {
115+
let header_names = schema.fields().map(|field| {
116116
let cell = Cell::new(field.name());
117117
if cli.ascii {
118118
cell

kernel/src/actions/mod.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,7 @@ fn struct_array_to_map(arr: &StructArray) -> DeltaResult<HashMap<String, Option<
521521
mod tests {
522522
use std::sync::Arc;
523523

524+
use arrow_array::ArrayRef;
524525
use object_store::local::LocalFileSystem;
525526

526527
use super::*;
@@ -534,13 +535,12 @@ mod tests {
534535
let store = Arc::new(LocalFileSystem::new());
535536
let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new()));
536537

537-
let json_strings: StringArray = vec![
538+
let json_strings: ArrayRef = Arc::new(StringArray::from(vec![
538539
r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#,
539540
r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/<unknown>","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#,
540541
r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#,
541542
r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#,
542-
]
543-
.into();
543+
]));
544544
let output_schema = Arc::new(log_schema().clone());
545545
handler.parse_json(json_strings, output_schema).unwrap()
546546
}
@@ -597,15 +597,14 @@ mod tests {
597597
let store = Arc::new(LocalFileSystem::new());
598598
let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new()));
599599

600-
let json_strings: StringArray = vec![
600+
let json_strings: ArrayRef = Arc::new(StringArray::from(vec![
601601
r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#,
602602
r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#,
603603
r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#,
604604
r#"{"add":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#,
605605
r#"{"add":{"path":"c1=5/c2=b/part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet","partitionValues":{"c1":"5","c2":"b"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":6},\"maxValues\":{\"c3\":6},\"nullCount\":{\"c3\":0}}"}}"#,
606606
r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#,
607-
]
608-
.into();
607+
]));
609608
let output_schema = Arc::new(log_schema().clone());
610609
let batch = handler.parse_json(json_strings, output_schema).unwrap();
611610

kernel/src/client/conversion.rs

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use arrow_schema::{
44
ArrowError, DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
55
SchemaRef as ArrowSchemaRef, TimeUnit,
66
};
7+
use itertools::Itertools;
78

89
use crate::actions::ActionType;
910
use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType};
@@ -20,12 +21,7 @@ impl TryFrom<&StructType> for ArrowSchema {
2021
type Error = ArrowError;
2122

2223
fn try_from(s: &StructType) -> Result<Self, ArrowError> {
23-
let fields = s
24-
.fields()
25-
.iter()
26-
.map(|f| <ArrowField as TryFrom<&StructField>>::try_from(*f))
27-
.collect::<Result<Vec<ArrowField>, ArrowError>>()?;
28-
24+
let fields: Vec<ArrowField> = s.fields().map(TryInto::try_into).try_collect()?;
2925
Ok(ArrowSchema::new(fields))
3026
}
3127
}
@@ -103,23 +99,12 @@ impl TryFrom<&DataType> for ArrowDataType {
10399
PrimitiveType::Boolean => Ok(ArrowDataType::Boolean),
104100
PrimitiveType::Binary => Ok(ArrowDataType::Binary),
105101
PrimitiveType::Decimal(precision, scale) => {
106-
let precision = u8::try_from(*precision).map_err(|_| {
107-
ArrowError::SchemaError(format!(
108-
"Invalid precision for decimal: {}",
109-
precision
110-
))
111-
})?;
112-
let scale = i8::try_from(*scale).map_err(|_| {
113-
ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale))
114-
})?;
115-
116-
if precision <= 38 {
117-
Ok(ArrowDataType::Decimal128(precision, scale))
118-
} else if precision <= 76 {
119-
Ok(ArrowDataType::Decimal256(precision, scale))
102+
if precision <= &38 {
103+
Ok(ArrowDataType::Decimal128(*precision, *scale))
120104
} else {
105+
// NOTE: since we are converting from delta, we should never get here.
121106
Err(ArrowError::SchemaError(format!(
122-
"Precision too large to be represented in Arrow: {}",
107+
"Precision too large to be represented as Delta type: {} > 38",
123108
precision
124109
)))
125110
}
@@ -137,8 +122,7 @@ impl TryFrom<&DataType> for ArrowDataType {
137122
}
138123
DataType::Struct(s) => Ok(ArrowDataType::Struct(
139124
s.fields()
140-
.iter()
141-
.map(|f| <ArrowField as TryFrom<&StructField>>::try_from(*f))
125+
.map(TryInto::try_into)
142126
.collect::<Result<Vec<ArrowField>, ArrowError>>()?
143127
.into(),
144128
)),
@@ -226,12 +210,7 @@ impl TryFrom<&ArrowDataType> for DataType {
226210
ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)),
227211
ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)),
228212
ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)),
229-
ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal(
230-
*p as i32, *s as i32,
231-
))),
232-
ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal(
233-
*p as i32, *s as i32,
234-
))),
213+
ArrowDataType::Decimal128(p, s) => Ok(DataType::decimal(*p, *s)),
235214
ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)),
236215
ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)),
237216
ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => {

0 commit comments

Comments
 (0)