Commit 3fcae86: Address feedback
OussamaSaoudi-db committed Sep 30, 2024
1 parent 608df9e

Showing 6 changed files with 39 additions and 32 deletions.
1 change: 1 addition & 0 deletions acceptance/Cargo.toml
@@ -20,6 +20,7 @@ delta_kernel = { path = "../kernel", features = [
"developer-visibility",
] }
futures = "0.3"
itertools = "0.13"
object_store = { workspace = true }
parquet = { workspace = true }
serde = { version = "1", features = ["derive"] }
3 changes: 2 additions & 1 deletion acceptance/src/data.rs
@@ -7,6 +7,7 @@ use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::ta

use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
use futures::{stream::TryStreamExt, StreamExt};
use itertools::Itertools;
use object_store::{local::LocalFileSystem, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

@@ -144,7 +145,7 @@ pub async fn assert_scan_data(engine: Arc<dyn Engine>, test_case: &TestCaseInfo)
Ok(record_batch)
}
})
.collect::<DeltaResult<Vec<RecordBatch>>>()?;
.try_collect()?;
let all_data = concat_batches(&schema.unwrap(), batches.iter()).map_err(Error::from)?;
let all_data = sort_record_batch(all_data)?;
let golden = read_golden(test_case.root_dir(), None).await?;
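
For reference, a minimal standalone sketch of the try_collect idiom this file switches to. It is not part of the commit; i32 and String are illustrative stand-ins for RecordBatch and delta_kernel::Error.

    use itertools::Itertools;

    fn collect_all(results: impl Iterator<Item = Result<i32, String>>) -> Result<Vec<i32>, String> {
        // Behaves like `.collect::<Result<Vec<_>, _>>()` without spelling out the
        // nested turbofish: gathers Ok values and short-circuits on the first Err.
        results.try_collect()
    }

    fn main() {
        let ok: Vec<Result<i32, String>> = vec![Ok(1), Ok(2)];
        assert_eq!(collect_all(ok.into_iter()), Ok(vec![1, 2]));

        let bad: Vec<Result<i32, String>> = vec![Ok(1), Err("boom".into())];
        assert!(collect_all(bad.into_iter()).is_err());
    }
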
42 changes: 22 additions & 20 deletions kernel/examples/read-table-single-threaded/src/main.rs
@@ -13,6 +13,7 @@ use delta_kernel::schema::Schema;
use delta_kernel::{DeltaResult, Engine, Table};

use clap::{Parser, ValueEnum};
use itertools::Itertools;

/// An example program that dumps out the data of a delta table. Struct and Map types are not
/// supported.
@@ -113,27 +114,28 @@ fn try_main() -> DeltaResult<()> {
.with_schema_opt(read_schema_opt)
.build()?;

let mut batches = vec![];
for res in scan.execute(engine.as_ref())? {
let res = res?;
let data = res.raw_data?;
let record_batch: RecordBatch = data
.into_any()
.downcast::<ArrowEngineData>()
.map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
.into();
let batch = if let Some(mut mask) = res.mask {
let extra_rows = record_batch.num_rows() - mask.len();
if extra_rows > 0 {
// we need to extend the mask here in case it's too short
mask.extend(std::iter::repeat(true).take(extra_rows));
let batches: Vec<RecordBatch> = scan
.execute(engine.as_ref())?
.map_ok(|res| -> DeltaResult<RecordBatch> {
let data = res.raw_data?;
let record_batch: RecordBatch = data
.into_any()
.downcast::<ArrowEngineData>()
.map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
.into();
if let Some(mut mask) = res.mask {
let extra_rows = record_batch.num_rows() - mask.len();
if extra_rows > 0 {
// we need to extend the mask here in case it's too short
mask.extend(std::iter::repeat(true).take(extra_rows));
}
Ok(filter_record_batch(&record_batch, &mask.into())?)
} else {
Ok(record_batch)
}
filter_record_batch(&record_batch, &mask.into())?
} else {
record_batch
};
batches.push(batch);
}
})
.flatten_ok()
.try_collect()?;
print_batches(&batches)?;
Ok(())
}
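
For context, a self-contained sketch of the map_ok / flatten_ok / try_collect combinators chained in the rewritten example above. It is not from the commit, and the types are simplified: groups of i32 and String stand in for the kernel's nested Result and RecordBatch shapes.

    use itertools::Itertools;

    fn main() -> Result<(), String> {
        // Each outer item is a fallible "scan result" whose Ok value holds a
        // group of rows, mirroring the nested shape the scan produces.
        let scan_results: Vec<Result<Vec<i32>, String>> = vec![Ok(vec![1, 2]), Ok(vec![3])];

        let rows: Vec<i32> = scan_results
            .into_iter()
            // map_ok transforms only the Ok values; Err values pass through untouched.
            .map_ok(|group| group.into_iter().map(|x| x * 10).collect::<Vec<_>>())
            // flatten_ok turns each Ok(Vec<_>) into individual Ok items,
            // still forwarding any Err unchanged.
            .flatten_ok()
            // try_collect gathers the Result items into Result<Vec<_>, _>,
            // stopping at the first Err.
            .try_collect()?;

        assert_eq!(rows, vec![10, 20, 30]);
        Ok(())
    }
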
11 changes: 6 additions & 5 deletions kernel/src/scan/mod.rs
@@ -276,7 +276,7 @@ impl Scan {
.flatten_ok()
.flatten_ok();

Ok(scan_files_iter
let result = scan_files_iter
.map_ok(move |scan_file| -> DeltaResult<_> {
let file_path = self.snapshot.table_root.join(&scan_file.path)?;
let mut selection_vector = scan_file
@@ -295,8 +295,7 @@
let gs = global_state.clone(); // Arc clone
Ok(read_result_iter
.into_iter()
.map(move |read_result| -> DeltaResult<_> {
let read_result = read_result?;
.map_ok(move |read_result| -> DeltaResult<_> {
// to transform the physical data into the correct logical form
let logical = transform_to_logical_internal(
engine,
@@ -319,11 +318,13 @@
};
selection_vector = rest;
Ok(result)
}))
})
.flatten_ok())
})
.flatten_ok()
.flatten_ok()
.flatten_ok())
.flatten_ok();
Ok(result)
}
}

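
One recurring change in this file swaps a plain .map() that unwraps with ? for .map_ok(), which applies the closure only to Ok values; because the closure in the scan code still returns a Result, the diff follows the map_ok with an extra flatten_ok to flatten the added nesting. A tiny standalone sketch of the map_ok half, with illustrative u32/String types rather than the kernel's:

    use itertools::Itertools;

    // Before: results.map(|r| -> Result<u32, String> { let v = r?; Ok(v * 2) })
    // After: the closure no longer has to unwrap and re-wrap the Result.
    fn doubled(
        results: impl Iterator<Item = Result<u32, String>>,
    ) -> impl Iterator<Item = Result<u32, String>> {
        results.map_ok(|v| v * 2)
    }

    fn main() {
        let input: Vec<Result<u32, String>> = vec![Ok(3), Err("bad".to_string())];
        let out: Vec<_> = doubled(input.into_iter()).collect();
        assert_eq!(out, vec![Ok(6), Err("bad".to_string())]);
    }
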
7 changes: 4 additions & 3 deletions kernel/tests/golden_tables.rs
@@ -7,6 +7,7 @@ use arrow::{compute::filter_record_batch, record_batch::RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::Schema;
use arrow_select::{concat::concat_batches, take::take};
use itertools::Itertools;
use paste::paste;
use std::path::{Path, PathBuf};
use std::sync::Arc;
@@ -158,8 +159,7 @@ async fn latest_snapshot_test(
let scan = snapshot.into_scan_builder().build()?;
let scan_res = scan.execute(&engine)?;
let batches: Vec<RecordBatch> = scan_res
.map(|sr| -> DeltaResult<_> {
let sr = sr?;
.map_ok(|sr| -> DeltaResult<_> {
let data = sr.raw_data?;
let record_batch = to_arrow(data)?;
if let Some(mut mask) = sr.mask {
@@ -173,7 +173,8 @@
Ok(record_batch)
}
})
.collect::<DeltaResult<Vec<RecordBatch>>>()?;
.flatten_ok()
.try_collect()?;

let expected = read_expected(&expected_path.expect("expect an expected dir")).await?;

7 changes: 4 additions & 3 deletions kernel/tests/read.rs
@@ -18,6 +18,7 @@ use delta_kernel::scan::state::{visit_scan_files, DvInfo, Stats};
use delta_kernel::scan::{transform_to_logical, Scan};
use delta_kernel::schema::Schema;
use delta_kernel::{DeltaResult, Engine, EngineData, FileMeta, Table};
use itertools::Itertools;
use object_store::{memory::InMemory, path::Path, ObjectStore};
use parquet::arrow::arrow_writer::ArrowWriter;
use parquet::file::properties::WriterProperties;
@@ -398,8 +399,7 @@ fn read_with_execute(
let result_schema: ArrowSchemaRef = Arc::new(scan.schema().as_ref().try_into()?);
let scan_results = scan.execute(engine)?;
let batches: Vec<RecordBatch> = scan_results
.map(|sr| -> DeltaResult<_> {
let sr = sr?;
.map_ok(|sr| -> DeltaResult<_> {
let data = sr.raw_data?;
let record_batch = to_arrow(data)?;
if let Some(mut mask) = sr.mask {
@@ -413,7 +413,8 @@
Ok(record_batch)
}
})
.collect::<DeltaResult<Vec<RecordBatch>>>()?;
.flatten_ok()
.try_collect()?;

if expected.is_empty() {
assert_eq!(batches.len(), 0);
