delta-io · scovich · Oct 9, 2024 · Sep 25, 2024 · Sep 25, 2024 · Sep 25, 2024
diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs
@@ -122,6 +122,7 @@ fn read_parquet_file_impl(
         last_modified: file.last_modified,
         size: file.size,
     };
+    // TODO: Plumb the predicate through the FFI?
     let data = parquet_handler.read_parquet_files(&[delta_fm], physical_schema, None)?;
     let res = Box::new(FileReadResultIterator {
         data,

diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs
@@ -1,21 +1,24 @@
 //! Some utilities for working with arrow data types
 
-use std::{collections::HashSet, sync::Arc};
+use std::{collections::HashSet, io::BufReader, sync::Arc};
 
 use crate::{
+    engine::arrow_data::ArrowEngineData,
     schema::{DataType, PrimitiveType, Schema, SchemaRef, StructField, StructType},
     utils::require,
-    DeltaResult, Error,
+    DeltaResult, EngineData, Error,
 };
 
 use arrow_array::{
     cast::AsArray, new_null_array, Array as ArrowArray, GenericListArray, OffsetSizeTrait,
-    StructArray,
+    RecordBatch, StringArray, StructArray,
 };
+use arrow_json::ReaderBuilder;
 use arrow_schema::{
     DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields,
     SchemaRef as ArrowSchemaRef,
 };
+use arrow_select::concat::concat_batches;
 use itertools::Itertools;
 use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor};
 use tracing::debug;
@@ -757,6 +760,59 @@ fn reorder_list<O: OffsetSizeTrait>(
     }
 }
 
+/// Parses one JSON string into a `RecordBatch`; a null input yields a single all-null row.
+fn hack_parse(
+    stats_schema: &ArrowSchemaRef,
+    json_string: Option<&str>,
+) -> DeltaResult<RecordBatch> {
+    match json_string {
+        // Only the first batch from the reader is kept; the error is built only when needed.
+        Some(s) => ReaderBuilder::new(stats_schema.clone())
+            .build(BufReader::new(s.as_bytes()))?
+            .next()
+            .transpose()?
+            .ok_or_else(|| Error::missing_data("Expected data")),
+        None => {
+            // A null string becomes one row in which every column is null.
+            let fields = stats_schema.fields.iter();
+            let nulls = fields.map(|field| new_null_array(field.data_type(), 1));
+            Ok(RecordBatch::try_new(stats_schema.clone(), nulls.collect())?)
+        }
+    }
+}
+
+/// Arrow lacks the functionality to json-parse a string column into a struct column -- even
+/// though the JSON file reader does exactly the same thing. This function is a hack to work
+/// around that gap: each string in the first column of `json_strings` is parsed against
+/// `output_schema` and the per-string batches are concatenated.
+///
+/// Errors (rather than panics) if there is no first column or it is not a `StringArray`.
+pub(crate) fn parse_json(
+    json_strings: Box<dyn EngineData>,
+    output_schema: SchemaRef,
+) -> DeltaResult<Box<dyn EngineData>> {
+    let batch: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
+    // `columns().first()` instead of `column(0)`: the latter panics on a batch with no columns.
+    let json_strings = batch
+        .columns()
+        .first()
+        .and_then(|column| column.as_any().downcast_ref::<StringArray>())
+        .ok_or_else(|| {
+            Error::generic("Expected json_strings to be a StringArray, found something else")
+        })?;
+    let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?);
+    if json_strings.is_empty() {
+        let empty = RecordBatch::new_empty(output_schema);
+        return Ok(Box::new(ArrowEngineData::new(empty)));
+    }
+    let output: Vec<_> = json_strings
+        .iter()
+        .map(|json_string| hack_parse(&output_schema, json_string))
+        .try_collect()?;
+    let combined = concat_batches(&output_schema, output.iter())?;
+    Ok(Box::new(ArrowEngineData::new(combined)))
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;

diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs
@@ -5,19 +5,16 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::task::{ready, Poll};
 
-use arrow_array::{new_null_array, Array, RecordBatch, StringArray, StructArray};
 use arrow_json::ReaderBuilder;
 use arrow_schema::SchemaRef as ArrowSchemaRef;
-use arrow_select::concat::concat_batches;
 use bytes::{Buf, Bytes};
 use futures::{StreamExt, TryStreamExt};
-use itertools::Itertools;
 use object_store::path::Path;
 use object_store::{DynObjectStore, GetResultPayload};
 
 use super::executor::TaskExecutor;
 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
-use crate::engine::arrow_data::ArrowEngineData;
+use crate::engine::arrow_utils::parse_json as arrow_parse_json;
 use crate::schema::SchemaRef;
 use crate::{
     DeltaResult, EngineData, Error, Expression, FileDataReadResultIterator, FileMeta, JsonHandler,
@@ -62,57 +59,13 @@ impl<E: TaskExecutor> DefaultJsonHandler<E> {
     }
 }
 
-fn hack_parse(
-    stats_schema: &ArrowSchemaRef,
-    json_string: Option<&str>,
-) -> DeltaResult<RecordBatch> {
-    match json_string {
-        Some(s) => Ok(ReaderBuilder::new(stats_schema.clone())
-            .build(BufReader::new(s.as_bytes()))?
-            .next()
-            .transpose()?
-            .ok_or(Error::missing_data("Expected data"))?),
-        None => Ok(RecordBatch::try_new(
-            stats_schema.clone(),
-            stats_schema
-                .fields
-                .iter()
-                .map(|field| new_null_array(field.data_type(), 1))
-                .collect(),
-        )?),
-    }
-}
-
 impl<E: TaskExecutor> JsonHandler for DefaultJsonHandler<E> {
     fn parse_json(
         &self,
         json_strings: Box<dyn EngineData>,
         output_schema: SchemaRef,
     ) -> DeltaResult<Box<dyn EngineData>> {
-        let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
-        // TODO(nick): this is pretty terrible
-        let struct_array: StructArray = json_strings.into();
-        let json_strings = struct_array
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                Error::generic("Expected json_strings to be a StringArray, found something else")
-            })?;
-        let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?);
-        if json_strings.is_empty() {
-            return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty(
-                output_schema,
-            ))));
-        }
-        let output: Vec<_> = json_strings
-            .iter()
-            .map(|json_string| hack_parse(&output_schema, json_string))
-            .try_collect()?;
-        Ok(Box::new(ArrowEngineData::new(concat_batches(
-            &output_schema,
-            output.iter(),
-        )?)))
+        arrow_parse_json(json_strings, output_schema)
     }
 
     fn read_json_files(
@@ -220,14 +173,15 @@ impl FileOpener for JsonOpener {
 mod tests {
     use std::path::PathBuf;
 
-    use arrow::array::AsArray;
+    use arrow::array::{AsArray, RecordBatch, StringArray};
     use arrow_schema::{DataType, Field, Schema as ArrowSchema};
     use itertools::Itertools;
     use object_store::{local::LocalFileSystem, ObjectStore};
 
     use super::*;
     use crate::{
-        actions::get_log_schema, engine::default::executor::tokio::TokioBackgroundExecutor,
+        actions::get_log_schema, engine::arrow_data::ArrowEngineData,
+        engine::default::executor::tokio::TokioBackgroundExecutor,
     };
 
     fn string_array_to_engine_data(string_array: StringArray) -> Box<dyn EngineData> {

diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs
@@ -14,6 +14,7 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream
 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
 use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array};
 use crate::engine::default::executor::TaskExecutor;
+use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
 use crate::schema::SchemaRef;
 use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};
 
@@ -47,7 +48,7 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
         &self,
         files: &[FileMeta],
         physical_schema: SchemaRef,
-        _predicate: Option<Expression>,
+        predicate: Option<Expression>,
     ) -> DeltaResult<FileDataReadResultIterator> {
         if files.is_empty() {
             return Ok(Box::new(std::iter::empty()));
@@ -62,10 +63,15 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
         //   -> parse to parquet
         // SAFETY: we did is_empty check above, this is ok.
         let file_opener: Box<dyn FileOpener> = match files[0].location.scheme() {
-            "http" | "https" => Box::new(PresignedUrlOpener::new(1024, physical_schema.clone())),
+            "http" | "https" => Box::new(PresignedUrlOpener::new(
+                1024,
+                physical_schema.clone(),
+                predicate,
+            )),
             _ => Box::new(ParquetOpener::new(
                 1024,
                 physical_schema.clone(),
+                predicate,
                 self.store.clone(),
             )),
         };
@@ -83,20 +89,23 @@ impl<E: TaskExecutor> ParquetHandler for DefaultParquetHandler<E> {
 struct ParquetOpener {
     // projection: Arc<[usize]>,
     batch_size: usize,
-    limit: Option<usize>,
     table_schema: SchemaRef,
+    predicate: Option<Expression>,
+    limit: Option<usize>,
     store: Arc<DynObjectStore>,
 }
 
 impl ParquetOpener {
     pub(crate) fn new(
         batch_size: usize,
         table_schema: SchemaRef,
+        predicate: Option<Expression>,
         store: Arc<DynObjectStore>,
     ) -> Self {
         Self {
             batch_size,
             table_schema,
+            predicate,
             limit: None,
             store,
         }
@@ -111,6 +120,7 @@ impl FileOpener for ParquetOpener {
         let batch_size = self.batch_size;
         // let projection = self.projection.clone();
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
         let limit = self.limit;
 
         Ok(Box::pin(async move {
@@ -133,6 +143,9 @@ impl FileOpener for ParquetOpener {
                 builder = builder.with_projection(mask)
             }
 
+            if let Some(ref predicate) = predicate {
+                builder = builder.with_row_group_filter(predicate);
+            }
             if let Some(limit) = limit {
                 builder = builder.with_limit(limit)
             }
@@ -153,16 +166,18 @@ impl FileOpener for ParquetOpener {
 /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL
 struct PresignedUrlOpener {
     batch_size: usize,
+    predicate: Option<Expression>,
     limit: Option<usize>,
     table_schema: SchemaRef,
     client: reqwest::Client,
 }
 
 impl PresignedUrlOpener {
-    pub(crate) fn new(batch_size: usize, schema: SchemaRef) -> Self {
+    pub(crate) fn new(batch_size: usize, schema: SchemaRef, predicate: Option<Expression>) -> Self {
         Self {
             batch_size,
             table_schema: schema,
+            predicate,
             limit: None,
             client: reqwest::Client::new(),
         }
@@ -173,6 +188,7 @@ impl FileOpener for PresignedUrlOpener {
     fn open(&self, file_meta: FileMeta, _range: Option<Range<i64>>) -> DeltaResult<FileOpenFuture> {
         let batch_size = self.batch_size;
         let table_schema = self.table_schema.clone();
+        let predicate = self.predicate.clone();
         let limit = self.limit;
         let client = self.client.clone(); // uses Arc internally according to reqwest docs
 
@@ -196,6 +212,9 @@ impl FileOpener for PresignedUrlOpener {
                 builder = builder.with_projection(mask)
             }
 
+            if let Some(ref predicate) = predicate {
+                builder = builder.with_row_group_filter(predicate);
+            }
             if let Some(limit) = limit {
                 builder = builder.with_limit(limit)
             }
@@ -261,6 +280,7 @@ mod tests {
             size: meta.size,
         }];
 
+        // TODO: add a test that uses predicate skipping?
         let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new()));
         let data: Vec<RecordBatch> = handler
             .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None)

diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs
@@ -11,6 +11,9 @@ pub mod arrow_expression;
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod arrow_data;
 
+#[cfg(any(feature = "default-engine", feature = "sync-engine"))]
+pub mod parquet_row_group_skipping;
+
 #[cfg(any(feature = "default-engine", feature = "sync-engine"))]
 pub mod parquet_stats_skipping;