From 3fcae86f088274eba61d65210e2e2211a43a9226 Mon Sep 17 00:00:00 2001
From: Oussama Saoudi
Date: Mon, 30 Sep 2024 16:37:08 -0700
Subject: [PATCH] Address feedback

---
 acceptance/Cargo.toml                        |  1 +
 acceptance/src/data.rs                       |  3 +-
 .../read-table-single-threaded/src/main.rs   | 42 ++++++++++---------
 kernel/src/scan/mod.rs                       | 11 ++---
 kernel/tests/golden_tables.rs                |  7 ++--
 kernel/tests/read.rs                         |  7 ++--
 6 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/acceptance/Cargo.toml b/acceptance/Cargo.toml
index bfd64170d..38aca42b1 100644
--- a/acceptance/Cargo.toml
+++ b/acceptance/Cargo.toml
@@ -20,6 +20,7 @@ delta_kernel = { path = "../kernel", features = [
     "developer-visibility",
 ] }
 futures = "0.3"
+itertools = "0.13"
 object_store = { workspace = true }
 parquet = { workspace = true }
 serde = { version = "1", features = ["derive"] }
diff --git a/acceptance/src/data.rs b/acceptance/src/data.rs
index 933735e71..648f5ff41 100644
--- a/acceptance/src/data.rs
+++ b/acceptance/src/data.rs
@@ -7,6 +7,7 @@ use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::ta
 
 use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
 use futures::{stream::TryStreamExt, StreamExt};
+use itertools::Itertools;
 use object_store::{local::LocalFileSystem, ObjectStore};
 use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};
@@ -144,7 +145,7 @@ pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo)
                 Ok(record_batch)
             }
         })
-        .collect::<DeltaResult<Vec<_>>>()?;
+        .try_collect()?;
     let all_data = concat_batches(&schema.unwrap(), batches.iter()).map_err(Error::from)?;
     let all_data = sort_record_batch(all_data)?;
     let golden = read_golden(test_case.root_dir(), None).await?;
diff --git a/kernel/examples/read-table-single-threaded/src/main.rs b/kernel/examples/read-table-single-threaded/src/main.rs
index 6011d6d64..c159b9b6a 100644
--- a/kernel/examples/read-table-single-threaded/src/main.rs
+++ b/kernel/examples/read-table-single-threaded/src/main.rs
@@ -13,6 +13,7 @@ use delta_kernel::schema::Schema;
 use delta_kernel::{DeltaResult, Engine, Table};
 
 use clap::{Parser, ValueEnum};
+use itertools::Itertools;
 
 /// An example program that dumps out the data of a delta table. Struct and Map types are not
 /// supported.
@@ -113,27 +114,28 @@ fn try_main() -> DeltaResult<()> {
         .with_schema_opt(read_schema_opt)
         .build()?;
 
-    let mut batches = vec![];
-    for res in scan.execute(engine.as_ref())? {
-        let res = res?;
-        let data = res.raw_data?;
-        let record_batch: RecordBatch = data
-            .into_any()
-            .downcast::<ArrowEngineData>()
-            .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
-            .into();
-        let batch = if let Some(mut mask) = res.mask {
-            let extra_rows = record_batch.num_rows() - mask.len();
-            if extra_rows > 0 {
-                // we need to extend the mask here in case it's too short
-                mask.extend(std::iter::repeat(true).take(extra_rows));
+    let batches: Vec<RecordBatch> = scan
+        .execute(engine.as_ref())?
+        .map_ok(|res| -> DeltaResult<RecordBatch> {
+            let data = res.raw_data?;
+            let record_batch: RecordBatch = data
+                .into_any()
+                .downcast::<ArrowEngineData>()
+                .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
+                .into();
+            if let Some(mut mask) = res.mask {
+                let extra_rows = record_batch.num_rows() - mask.len();
+                if extra_rows > 0 {
+                    // we need to extend the mask here in case it's too short
+                    mask.extend(std::iter::repeat(true).take(extra_rows));
+                }
+                Ok(filter_record_batch(&record_batch, &mask.into())?)
+            } else {
+                Ok(record_batch)
             }
-            filter_record_batch(&record_batch, &mask.into())?
-        } else {
-            record_batch
-        };
-        batches.push(batch);
-    }
+        })
+        .flatten_ok()
+        .try_collect()?;
     print_batches(&batches)?;
     Ok(())
 }
diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs
index b9451ff67..f62ebf54b 100644
--- a/kernel/src/scan/mod.rs
+++ b/kernel/src/scan/mod.rs
@@ -276,7 +276,7 @@ impl Scan {
             .flatten_ok()
             .flatten_ok();
 
-        Ok(scan_files_iter
+        let result = scan_files_iter
             .map_ok(move |scan_file| -> DeltaResult<_> {
                 let file_path = self.snapshot.table_root.join(&scan_file.path)?;
                 let mut selection_vector = scan_file
@@ -295,8 +295,7 @@
                 let gs = global_state.clone(); // Arc clone
                 Ok(read_result_iter
                     .into_iter()
-                    .map(move |read_result| -> DeltaResult<_> {
-                        let read_result = read_result?;
+                    .map_ok(move |read_result| -> DeltaResult<_> {
                         // to transform the physical data into the correct logical form
                         let logical = transform_to_logical_internal(
                             engine,
@@ -319,11 +318,13 @@
                         };
                         selection_vector = rest;
                         Ok(result)
-                    }))
+                    })
+                    .flatten_ok())
             })
             .flatten_ok()
             .flatten_ok()
-            .flatten_ok())
+            .flatten_ok();
+        Ok(result)
     }
 }
 
diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs
index 3952f2329..b56885f35 100644
--- a/kernel/tests/golden_tables.rs
+++ b/kernel/tests/golden_tables.rs
@@ -7,6 +7,7 @@ use arrow::{compute::filter_record_batch, record_batch::RecordBatch};
 use arrow_ord::sort::{lexsort_to_indices, SortColumn};
 use arrow_schema::Schema;
 use arrow_select::{concat::concat_batches, take::take};
+use itertools::Itertools;
 use paste::paste;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
@@ -158,8 +159,7 @@ async fn latest_snapshot_test(
     let scan = snapshot.into_scan_builder().build()?;
     let scan_res = scan.execute(&engine)?;
     let batches: Vec<RecordBatch> = scan_res
-        .map(|sr| -> DeltaResult<_> {
-            let sr = sr?;
+        .map_ok(|sr| -> DeltaResult<_> {
             let data = sr.raw_data?;
             let record_batch = to_arrow(data)?;
             if let Some(mut mask) = sr.mask {
@@ -173,7 +173,8 @@
                 Ok(record_batch)
             }
         })
-        .collect::<DeltaResult<Vec<_>>>()?;
+        .flatten_ok()
+        .try_collect()?;
 
     let expected = read_expected(&expected_path.expect("expect an expected dir")).await?;
 
diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs
index 8382a2d28..7b4d9cfd6 100644
--- a/kernel/tests/read.rs
+++ b/kernel/tests/read.rs
@@ -18,6 +18,7 @@ use delta_kernel::scan::state::{visit_scan_files, DvInfo, Stats};
 use delta_kernel::scan::{transform_to_logical, Scan};
 use delta_kernel::schema::Schema;
 use delta_kernel::{DeltaResult, Engine, EngineData, FileMeta, Table};
+use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
 use parquet::arrow::arrow_writer::ArrowWriter;
 use parquet::file::properties::WriterProperties;
@@ -398,8 +399,7 @@ fn read_with_execute(
     let result_schema: ArrowSchemaRef = Arc::new(scan.schema().as_ref().try_into()?);
     let scan_results = scan.execute(engine)?;
     let batches: Vec<RecordBatch> = scan_results
-        .map(|sr| -> DeltaResult<_> {
-            let sr = sr?;
+        .map_ok(|sr| -> DeltaResult<_> {
             let data = sr.raw_data?;
             let record_batch = to_arrow(data)?;
             if let Some(mut mask) = sr.mask {
@@ -413,7 +413,8 @@
                 Ok(record_batch)
             }
         })
-        .collect::<DeltaResult<Vec<_>>>()?;
+        .flatten_ok()
+        .try_collect()?;
 
     if expected.is_empty() {
         assert_eq!(batches.len(), 0);
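Note on the pattern this patch converges on: each call site above swaps a manual loop, or a turbofished collect over nested Results, for the same itertools pipeline of map_ok, flatten_ok, and try_collect. The sketch below is a minimal, self-contained illustration of that shape, not kernel code: the i32/String types are stand-ins for ScanResult/RecordBatch/DeltaResult, and the only assumption is the itertools = "0.13" dependency added above.

    // Standalone sketch of the map_ok / flatten_ok / try_collect shape used in this patch.
    use itertools::Itertools;

    fn main() -> Result<(), String> {
        // Outer iterator: each item may already be an Err, like the items yielded
        // by Scan::execute in the hunks above.
        let scan_results: Vec<Result<i32, String>> = vec![Ok(1), Ok(2), Ok(3)];

        let batches: Vec<i32> = scan_results
            .into_iter()
            // map_ok runs the closure only on Ok items; because the closure is
            // itself fallible, each item becomes Result<Result<i32, String>, String>.
            .map_ok(|v| -> Result<i32, String> { Ok(v * 10) })
            // flatten_ok peels one level: Ok(Ok(x)) -> Ok(x), Err(e) -> Err(e).
            // Note that an inner Ok(Err(e)) yields no items here, since Result's
            // IntoIterator produces nothing for an Err value.
            .flatten_ok()
            // try_collect short-circuits on the first remaining Err.
            .try_collect()?;

        assert_eq!(batches, vec![10, 20, 30]);
        Ok(())
    }

The nested map_ok closures with a flatten_ok at each level in kernel/src/scan/mod.rs are this same shape applied once per level of nesting, with the callers finishing the chain via try_collect.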