Skip to content

Commit 7adc770

Browse files
committed
Address PR feedback.
1 parent d0ec778 commit 7adc770

File tree

2 files changed

+42
-27
lines changed

2 files changed

+42
-27
lines changed

crates/iceberg/src/arrow/caching_delete_file_loader.rs

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ mod tests {
534534
use std::fs::File;
535535
use std::sync::Arc;
536536

537+
use arrow_array::cast::AsArray;
537538
use arrow_array::{ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray, StructArray};
538539
use arrow_schema::{DataType, Field, Fields};
539540
use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY};
@@ -685,12 +686,13 @@ mod tests {
685686
assert!(result.is_none()); // no pos dels for file 3
686687
}
687688

688-
/// Verifies that evolve_schema on partial-schema equality deletes fails with Arrow
689-
/// validation errors when missing REQUIRED columns are filled with NULLs.
689+
/// Verifies that evolve_schema on partial-schema equality deletes works correctly
690+
/// when only equality_ids columns are evolved, not all table columns.
690691
///
691-
/// Reproduces the issue that caused 14 TestSparkReaderDeletes failures in Iceberg Java.
692+
/// Per the [Iceberg spec](https://iceberg.apache.org/spec/#equality-delete-files),
693+
/// equality delete files can contain only a subset of columns.
692694
#[tokio::test]
693-
async fn test_partial_schema_equality_deletes_evolve_fails() {
695+
async fn test_partial_schema_equality_deletes_evolve_succeeds() {
694696
let tmp_dir = TempDir::new().unwrap();
695697
let table_location = tmp_dir.path().as_os_str().to_str().unwrap();
696698

@@ -750,23 +752,32 @@ mod tests {
750752
.await
751753
.unwrap();
752754

753-
let mut evolved_stream = BasicDeleteFileLoader::evolve_schema(batch_stream, table_schema)
754-
.await
755-
.unwrap();
755+
// Only evolve the equality_ids columns (field 2), not all table columns
756+
let equality_ids = vec![2];
757+
let evolved_stream =
758+
BasicDeleteFileLoader::evolve_schema(batch_stream, table_schema, &equality_ids)
759+
.await
760+
.unwrap();
756761

757-
let result = evolved_stream.next().await.unwrap();
762+
let result = evolved_stream.try_collect::<Vec<_>>().await;
758763

759764
assert!(
760-
result.is_err(),
761-
"Expected error from evolve_schema adding NULL to non-nullable column"
765+
result.is_ok(),
766+
"Expected success when evolving only equality_ids columns, got error: {:?}",
767+
result.err()
762768
);
763769

764-
let err = result.unwrap_err();
765-
let err_msg = err.to_string();
766-
assert!(
767-
err_msg.contains("non-nullable") || err_msg.contains("null values"),
768-
"Expected null value error, got: {}",
769-
err_msg
770-
);
770+
let batches = result.unwrap();
771+
assert_eq!(batches.len(), 1);
772+
773+
let batch = &batches[0];
774+
assert_eq!(batch.num_rows(), 3);
775+
assert_eq!(batch.num_columns(), 1); // Only 'data' column
776+
777+
// Verify the actual values are preserved after schema evolution
778+
let data_col = batch.column(0).as_string::<i32>();
779+
assert_eq!(data_col.value(0), "a");
780+
assert_eq!(data_col.value(1), "d");
781+
assert_eq!(data_col.value(2), "g");
771782
}
772783
}

crates/iceberg/src/arrow/delete_file_loader.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,17 @@ impl BasicDeleteFileLoader {
7171
Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)
7272
}
7373

74-
/// Evolves the schema of the RecordBatches from an equality delete file
74+
/// Evolves the schema of the RecordBatches from an equality delete file.
75+
///
76+
/// Per the [Iceberg spec](https://iceberg.apache.org/spec/#equality-delete-files),
77+
/// only evolves the specified `equality_ids` columns, not all table columns.
7578
pub(crate) async fn evolve_schema(
7679
record_batch_stream: ArrowRecordBatchStream,
7780
target_schema: Arc<Schema>,
81+
equality_ids: &[i32],
7882
) -> Result<ArrowRecordBatchStream> {
79-
let eq_ids = target_schema
80-
.as_ref()
81-
.field_id_to_name_map()
82-
.keys()
83-
.cloned()
84-
.collect::<Vec<_>>();
85-
8683
let mut record_batch_transformer =
87-
RecordBatchTransformer::build(target_schema.clone(), &eq_ids);
84+
RecordBatchTransformer::build(target_schema.clone(), equality_ids);
8885

8986
let record_batch_stream = record_batch_stream.map(move |record_batch| {
9087
record_batch.and_then(|record_batch| {
@@ -105,7 +102,14 @@ impl DeleteFileLoader for BasicDeleteFileLoader {
105102
) -> Result<ArrowRecordBatchStream> {
106103
let raw_batch_stream = self.parquet_to_batch_stream(&task.file_path).await?;
107104

108-
Self::evolve_schema(raw_batch_stream, schema).await
105+
// For equality deletes, only evolve the equality_ids columns.
106+
// For positional deletes (equality_ids is None), use all field IDs.
107+
let field_ids = match &task.equality_ids {
108+
Some(ids) => ids.clone(),
109+
None => schema.field_id_to_name_map().keys().cloned().collect(),
110+
};
111+
112+
Self::evolve_schema(raw_batch_stream, schema, &field_ids).await
109113
}
110114
}
111115

0 commit comments

Comments (0)