Commit bb66898

[VARIANT] Fix shredding-related test failures and respect variant shredding table property (#5838)
#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

This PR introduces three key fixes to enable Delta-Spark to work properly and legally with the variant shredding feature in Spark 4.1.

1. Fixes Delta column mapping when variant pushdown is enabled (i.e. `spark.sql.variant.pushVariantIntoScan = true`, which is the default in Spark 4.1). This fix is in SchemaMergingUtils.scala.
2. Fixes data skipping based on null counts over variant data when variant pushdown is enabled. This fix is in DataSkippingReader.scala.
3. Makes Delta-Spark respect the `delta.enableVariantShredding` table property when writing shredded files into Delta tables. Before this fix, Spark 4.1 would always write shredded files regardless of the table property, because the `spark.sql.variant.inferShreddingSchema` config is set to true. This fix is essential since it is illegal to write shredded files into a Delta table when the table property is disabled.

## How was this patch tested?

1. Re-enabled previously skipped tests related to column mapping and data skipping that failed because `spark.sql.variant.pushVariantIntoScan = true` by default in Spark 4.1.
2. Added tests in `DeltaVariantShreddingSuite.scala` verifying that shredded writes are performed based on the `delta.enableVariantShredding` table property.

## Does this PR introduce _any_ user-facing changes?

Yes, Delta-Spark now works properly with shredded reads and writes.
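For context, a minimal usage sketch of the behavior this PR enforces. The table name and data are illustrative (not taken from the change), and an active SparkSession named `spark` is assumed:

```scala
// Illustrative sketch only: table name and data are hypothetical.
// Spark 4.1 enables spark.sql.variant.pushVariantIntoScan and sets
// spark.sql.variant.inferShreddingSchema to true, so without this PR the write
// below could produce shredded Parquet files even though the table never opted in.
spark.sql("CREATE TABLE t (id LONG, v VARIANT) USING delta")
spark.sql("INSERT INTO t SELECT id, parse_json('{\"a\": ' || cast(id AS STRING) || '}') FROM range(10)")

// With this PR, shredded files are only written after the table property opts in:
spark.sql("ALTER TABLE t SET TBLPROPERTIES ('delta.enableVariantShredding' = 'true')")
```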
1 parent c2e10a5 commit bb66898

14 files changed (+390 −72 lines)
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.shims

/**
 * Shim for variant shredding configs to handle API changes between Spark versions.
 * In Spark 4.0, VARIANT_INFER_SHREDDING_SCHEMA config does not exist.
 *
 * This shim provides a way to conditionally add the config to the options map
 * when writing files.
 */
object VariantShreddingShims {
  /**
   * Returns a Map containing variant shredding related configs for file writing.
   * In Spark 4.0, this returns an empty map since the config doesn't exist.
   */
  def getVariantInferShreddingSchemaOptions(enableVariantShredding: Boolean)
    : Map[String, String] = {
    // In Spark 4.0, VARIANT_INFER_SHREDDING_SCHEMA does not exist, so return empty map
    Map.empty[String, String]
  }
}
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.shims

import org.apache.spark.sql.internal.SQLConf

/**
 * Shim for variant shredding configs to handle API changes between Spark versions.
 * In Spark 4.1, VARIANT_INFER_SHREDDING_SCHEMA config exists.
 *
 * This shim provides a way to conditionally add the config to the options map
 * when writing files.
 */
object VariantShreddingShims {
  /**
   * Returns a Map containing variant shredding related configs for file writing.
   * In Spark 4.1, this returns the VARIANT_INFER_SHREDDING_SCHEMA config.
   */
  def getVariantInferShreddingSchemaOptions(enableVariantShredding: Boolean)
    : Map[String, String] = {
    Map(SQLConf.VARIANT_INFER_SHREDDING_SCHEMA.key -> enableVariantShredding.toString)
  }
}
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.shims

import org.apache.spark.sql.internal.SQLConf

/**
 * Shim for variant shredding configs to handle API changes between Spark versions.
 * In Spark 4.2, VARIANT_INFER_SHREDDING_SCHEMA config exists.
 *
 * This shim provides a way to conditionally add the config to the options map
 * when writing files.
 */
object VariantShreddingShims {
  /**
   * Returns a Map containing variant shredding related configs for file writing.
   * In Spark 4.2, this returns the VARIANT_INFER_SHREDDING_SCHEMA config.
   */
  def getVariantInferShreddingSchemaOptions(enableVariantShredding: Boolean): Map[String, String] = {
    Map(SQLConf.VARIANT_INFER_SHREDDING_SCHEMA.key -> enableVariantShredding.toString)
  }
}
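The three shims above differ only in whether `SQLConf.VARIANT_INFER_SHREDDING_SCHEMA` exists in that Spark version. A hedged sketch of the shim's contract, with the expected results shown in comments (the key string is the one quoted in the PR description):

```scala
// Illustration only: what a caller gets back on each source branch.
val opts = VariantShreddingShims.getVariantInferShreddingSchemaOptions(
  enableVariantShredding = false)
// Spark 4.0 shim:      Map()  -- the config does not exist, so nothing is added
// Spark 4.1/4.2 shims: Map("spark.sql.variant.inferShreddingSchema" -> "false")
```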

spark/src/main/scala/org/apache/spark/sql/delta/files/TransactionalWrite.scala

Lines changed: 4 additions & 1 deletion
@@ -26,6 +26,7 @@ import org.apache.spark.sql.delta.hooks.AutoCompact
 import org.apache.spark.sql.delta.metering.DeltaLogging
 import org.apache.spark.sql.delta.perf.DeltaOptimizedWriterExec
 import org.apache.spark.sql.delta.schema._
+import org.apache.spark.sql.delta.shims.VariantShreddingShims
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.sources.DeltaSQLConf.DELTA_COLLECT_STATS_USING_TABLE_SCHEMA
 import org.apache.spark.sql.delta.stats.{
@@ -478,7 +479,9 @@ trait TransactionalWrite extends DeltaLogging { self: OptimisticTransactionImpl
         key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) ||
           key.equalsIgnoreCase(DeltaOptions.COMPRESSION)
       }.toMap
-    }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString)
+    }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString) ++
+      VariantShreddingShims.getVariantInferShreddingSchemaOptions(
+        DeltaConfigs.ENABLE_VARIANT_SHREDDING.fromMetaData(metadata))

     try {
       DeltaFileFormatWriter.write(
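A small sketch of how the appended shim options combine with the existing writer options above; with `Map`'s `++`, entries appended last win, so the table-property-derived value always decides whether a shredding schema is inferred. The concrete key strings below are illustrative:

```scala
// Sketch of the merge performed in TransactionalWrite above (keys are illustrative).
val writerOptions = Map(
  "compression" -> "snappy",            // e.g. DeltaOptions.COMPRESSION
  "writePartitionColumns" -> "false")   // DeltaOptions.WRITE_PARTITION_COLUMNS
val shimOptions = Map(
  "spark.sql.variant.inferShreddingSchema" -> "false")  // derived from the table property
val options = writerOptions ++ shimOptions
// `options` is what reaches DeltaFileFormatWriter.write, so the table property
// governs shredding regardless of the session default.
```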

spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaMergingUtils.scala

Lines changed: 5 additions & 0 deletions
@@ -339,6 +339,11 @@ object SchemaMergingUtils {
       tf: (Seq[String], StructField, Resolver) => StructField): T = {
     def transform[E <: DataType](path: Seq[String], dt: E): E = {
       val newDt = dt match {
+        case s: StructType
+            if org.apache.spark.sql.execution.datasources.VariantMetadata.isVariantStruct(s) =>
+          // A variant struct is logically still a variant, so we should not recurse into its
+          // fields like a normal struct.
+          s
         case StructType(fields) =>
           StructType(fields.map { field =>
             val newField = tf(path, field, DELTA_COL_RESOLVER)
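To make the intent of the new case concrete, here is a minimal, self-contained sketch (not Delta's actual implementation) of a schema transform that stops at variant structs. `isVariantStructStandIn` is a stand-in for Spark's `VariantMetadata.isVariantStruct`, and the metadata key it checks is made up for illustration:

```scala
import org.apache.spark.sql.types._

// Stand-in for VariantMetadata.isVariantStruct: the real check inspects the variant
// metadata Spark attaches when pushVariantIntoScan rewrites a VARIANT column into a
// struct. The marker key below is hypothetical.
def isVariantStructStandIn(dt: DataType): Boolean = dt match {
  case s: StructType => s.fields.exists(_.metadata.contains("hypothetical.variant.marker"))
  case _ => false
}

// Recursive field transform in the spirit of SchemaMergingUtils: a variant struct is
// treated as a leaf, so column-mapping logic never descends into its internal fields.
def transformFields(dt: DataType)(tf: StructField => StructField): DataType = dt match {
  case s: StructType if isVariantStructStandIn(s) =>
    s // a variant struct is logically one variant value: do not recurse
  case StructType(fields) =>
    StructType(fields.map { f =>
      val nf = tf(f)
      nf.copy(dataType = transformFields(nf.dataType)(tf))
    })
  case other => other
}
```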

spark/src/main/scala/org/apache/spark/sql/delta/stats/DataSkippingReader.scala

Lines changed: 7 additions & 1 deletion
@@ -43,6 +43,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.execution.InSubqueryExec
+import org.apache.spark.sql.execution.datasources.VariantMetadata
 import org.apache.spark.sql.expressions.SparkUserDefinedFunction
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{AtomicType, BooleanType, CalendarIntervalType, DataType, DateType, LongType, NumericType, StringType, StructField, StructType, TimestampNTZType, TimestampType}
@@ -129,7 +130,12 @@ private [sql] object DataSkippingPredicate {
 object SkippingEligibleColumn {
   def unapply(arg: Expression): Option[(Seq[String], DataType)] = {
     // Only atomic types are eligible for skipping, and args should always be resolved by now.
-    val eligible = arg.resolved && arg.dataType.isInstanceOf[AtomicType]
+    // When `pushVariantIntoScan` is true, Variants in the read schema are transformed into Structs
+    // to facilitate shredded reads. Therefore, filters like `v is not null` where `v` is a variant
+    // column look like the filters on struct data. `VariantMetadata.isVariantStruct` helps in
+    // distinguishing between "true structs" and "variant structs".
+    val eligible = arg.resolved && (arg.dataType.isInstanceOf[AtomicType] ||
+      VariantMetadata.isVariantStruct(arg.dataType))
     if (eligible) searchChain(arg).map(_ -> arg.dataType) else None
   }
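A hedged illustration (hypothetical table and data, assuming an active SparkSession named `spark`) of the query shape this change keeps eligible for null-count-based data skipping when `pushVariantIntoScan` rewrites a variant column into a struct:

```scala
// Hypothetical example only.
spark.sql("CREATE TABLE events (id LONG, v VARIANT) USING delta")
spark.sql("INSERT INTO events VALUES (1, parse_json('{\"k\": 1}')), (2, null)")

// Under Spark 4.1 defaults, `v` resolves to a "variant struct" rather than an AtomicType,
// so before this fix SkippingEligibleColumn rejected it and files whose stats show `v`
// is always null could not be pruned for the filter below.
spark.table("events").where("v IS NOT NULL").show()
```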

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.test.shims

/**
 * Test shim for variant shredding to handle differences between Spark versions.
 * In Spark 4.0, VARIANT_INFER_SHREDDING_SCHEMA does not exist.
 */
object VariantShreddingTestShims {
  /**
   * Returns true if VARIANT_INFER_SHREDDING_SCHEMA config is supported in this Spark version.
   * In Spark 4.0, this returns false.
   */
  val variantInferShreddingSchemaSupported: Boolean = false

  /**
   * Returns a dummy config key for VARIANT_INFER_SHREDDING_SCHEMA.
   * In Spark 4.0, since this config doesn't exist, we return a dummy key that won't affect tests.
   * This allows tests to compile but the config will have no effect.
   */
  val variantInferShreddingSchemaKey: String = "spark.sql.dummy.variantInferShreddingSchema"
}
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.test.shims

import org.apache.spark.sql.internal.SQLConf

/**
 * Test shim for variant shredding to handle differences between Spark versions.
 * In Spark 4.1, VARIANT_INFER_SHREDDING_SCHEMA exists.
 */
object VariantShreddingTestShims {
  /**
   * Returns true if VARIANT_INFER_SHREDDING_SCHEMA config is supported in this Spark version.
   * In Spark 4.1, this returns true.
   */
  val variantInferShreddingSchemaSupported: Boolean = true

  /**
   * Returns the config key for VARIANT_INFER_SHREDDING_SCHEMA.
   * In Spark 4.1, this returns the actual SQLConf key.
   */
  val variantInferShreddingSchemaKey: String = SQLConf.VARIANT_INFER_SHREDDING_SCHEMA.key
}
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
/*
 * Copyright (2025) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.test.shims

import org.apache.spark.sql.internal.SQLConf

/**
 * Test shim for variant shredding to handle differences between Spark versions.
 * In Spark 4.2, VARIANT_INFER_SHREDDING_SCHEMA exists.
 */
object VariantShreddingTestShims {
  /**
   * Returns true if VARIANT_INFER_SHREDDING_SCHEMA config is supported in this Spark version.
   * In Spark 4.2, this returns true.
   */
  val variantInferShreddingSchemaSupported: Boolean = true

  /**
   * Returns the config key for VARIANT_INFER_SHREDDING_SCHEMA.
   * In Spark 4.2, this returns the actual SQLConf key.
   */
  val variantInferShreddingSchemaKey: String = SQLConf.VARIANT_INFER_SHREDDING_SCHEMA.key
}
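A sketch of how a suite can use the test shim to stay source-compatible across Spark versions; it assumes a test class that mixes in Spark's `SQLTestUtils` (for `withSQLConf`), which is not shown here:

```scala
import org.apache.spark.sql.delta.test.shims.VariantShreddingTestShims

// On Spark 4.0 the key is a harmless dummy, so this compiles and runs everywhere;
// on Spark 4.1+ it actually toggles shredding-schema inference.
withSQLConf(VariantShreddingTestShims.variantInferShreddingSchemaKey -> "true") {
  // ... write variant data to a Delta table with delta.enableVariantShredding set ...
  if (VariantShreddingTestShims.variantInferShreddingSchemaSupported) {
    // only assert on shredded file layout when the underlying config exists
  }
}
```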

spark/src/test/scala/org/apache/spark/sql/delta/AutoCompactSuite.scala

Lines changed: 2 additions & 14 deletions
@@ -239,8 +239,7 @@ class AutoCompactExecutionSuite extends
     checkAutoCompactionWorks(dir, spark.range(10).toDF("id"))
   }

-  // TODO: Re-enable this test after fixing Variant data skipping in Spark 4.1.0+
-  ignore("variant auto compact kicks in when enabled - table config") {
+  test("variant auto compact kicks in when enabled - table config") {
     withTempDir { dir =>
       withSQLConf(
         "spark.databricks.delta.properties.defaults.autoOptimize.autoCompact" -> "true",
@@ -252,8 +251,7 @@ class AutoCompactExecutionSuite extends
     }
   }

-  // TODO: Re-enable this test after fixing Variant data skipping in Spark 4.1.0+
-  ignore("variant auto compact kicks in when enabled - session config") {
+  test("variant auto compact kicks in when enabled - session config") {
     withTempDir { dir =>
       withSQLConf(
         DeltaSQLConf.DELTA_AUTO_COMPACT_ENABLED.key -> "true",
@@ -377,11 +375,6 @@ class AutoCompactConfigurationIdColumnMappingSuite extends AutoCompactConfigurat
 class AutoCompactExecutionIdColumnMappingSuite extends AutoCompactExecutionSuite
   with DeltaColumnMappingEnableIdMode {
   override def runAllTests: Boolean = true
-  // TODO: these tests need to be fixed for Spark master
-  override def skipTests: Seq[String] = Seq(
-    "variant auto compact kicks in when enabled - table config",
-    "variant auto compact kicks in when enabled - session config"
-  )
 }

 class AutoCompactConfigurationNameColumnMappingSuite extends AutoCompactConfigurationSuite
@@ -392,10 +385,5 @@ class AutoCompactConfigurationNameColumnMappingSuite extends AutoCompactConfigur
 class AutoCompactExecutionNameColumnMappingSuite extends AutoCompactExecutionSuite
   with DeltaColumnMappingEnableNameMode {
   override def runAllTests: Boolean = true
-  // TODO: these tests need to be fixed for Spark master
-  override def skipTests: Seq[String] = Seq(
-    "variant auto compact kicks in when enabled - table config",
-    "variant auto compact kicks in when enabled - session config"
-  )
 }
