|
| 1 | +use std::sync::Arc; |
| 2 | + |
| 3 | +use crate::arrow::array::BooleanArray; |
| 4 | +use crate::arrow::compute::filter_record_batch; |
| 5 | +use crate::arrow::record_batch::RecordBatch; |
| 6 | +use itertools::Itertools; |
| 7 | + |
| 8 | +use crate::scan::{Scan, ScanMetadata, ScanResult}; |
| 9 | +use crate::{DeltaResult, Engine, Error, ExpressionRef}; |
| 10 | + |
| 11 | +use super::super::arrow_data::ArrowEngineData; |
| 12 | + |
| 13 | +/// [`ScanMetadata`] contains (1) a [`RecordBatch`] specifying data files to be scanned |
| 14 | +/// and (2) a vector of transforms (one transform per scan file) that must be applied to the data read |
| 15 | +/// from those files. |
| 16 | +pub struct ScanMetadataArrow { |
| 17 | + /// Record batch with one row per file to scan |
| 18 | + pub scan_files: RecordBatch, |
| 19 | + |
| 20 | + /// Row-level transformations to apply to data read from files. |
| 21 | + /// |
| 22 | + /// Each entry in this vector corresponds to a row in the `scan_files` data. The entry is an |
| 23 | + /// expression that must be applied to convert the file's data into the logical schema |
| 24 | + /// expected by the scan: |
| 25 | + /// |
| 26 | + /// - `Some(expr)`: Apply this expression to transform the data to match [`Scan::schema()`]. |
| 27 | + /// - `None`: No transformation is needed; the data is already in the correct logical form. |
| 28 | + /// |
| 29 | + /// Note: This vector can be indexed by row number. |
| 30 | + pub scan_file_transforms: Vec<Option<ExpressionRef>>, |
| 31 | +} |
| 32 | + |
| 33 | +impl TryFrom<ScanMetadata> for ScanMetadataArrow { |
| 34 | + type Error = Error; |
| 35 | + |
| 36 | + fn try_from(metadata: ScanMetadata) -> Result<Self, Self::Error> { |
| 37 | + let scan_file_transforms = metadata |
| 38 | + .scan_file_transforms |
| 39 | + .into_iter() |
| 40 | + .enumerate() |
| 41 | + .filter_map(|(i, v)| metadata.scan_files.selection_vector[i].then_some(v)) |
| 42 | + .collect(); |
| 43 | + let batch = ArrowEngineData::try_from_engine_data(metadata.scan_files.data)?.into(); |
| 44 | + let scan_files = filter_record_batch( |
| 45 | + &batch, |
| 46 | + &BooleanArray::from(metadata.scan_files.selection_vector), |
| 47 | + )?; |
| 48 | + Ok(ScanMetadataArrow { |
| 49 | + scan_files, |
| 50 | + scan_file_transforms, |
| 51 | + }) |
| 52 | + } |
| 53 | +} |
| 54 | + |
| 55 | +impl TryFrom<ScanResult> for RecordBatch { |
| 56 | + type Error = Error; |
| 57 | + |
| 58 | + fn try_from(result: ScanResult) -> Result<Self, Self::Error> { |
| 59 | + let (mask, data) = (result.full_mask(), result.raw_data?); |
| 60 | + let record_batch = ArrowEngineData::try_from_engine_data(data)?.into(); |
| 61 | + mask.map(|m| Ok(filter_record_batch(&record_batch, &m.into())?)) |
| 62 | + .unwrap_or(Ok(record_batch)) |
| 63 | + } |
| 64 | +} |
| 65 | + |
| 66 | +pub trait ScanExt { |
| 67 | + fn scan_metadata_arrow( |
| 68 | + &self, |
| 69 | + engine: &dyn Engine, |
| 70 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>>; |
| 71 | + |
| 72 | + fn execute_arrow( |
| 73 | + &self, |
| 74 | + engine: Arc<dyn Engine>, |
| 75 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>>; |
| 76 | +} |
| 77 | + |
| 78 | +impl ScanExt for Scan { |
| 79 | + fn scan_metadata_arrow( |
| 80 | + &self, |
| 81 | + engine: &dyn Engine, |
| 82 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>> { |
| 83 | + Ok(self |
| 84 | + .scan_metadata(engine)? |
| 85 | + .map_ok(TryFrom::try_from) |
| 86 | + .flatten()) |
| 87 | + } |
| 88 | + |
| 89 | + fn execute_arrow( |
| 90 | + &self, |
| 91 | + engine: Arc<dyn Engine>, |
| 92 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>> { |
| 93 | + Ok(self.execute(engine)?.map_ok(TryFrom::try_from).flatten()) |
| 94 | + } |
| 95 | +} |
0 commit comments