
Commit 2080c21

Z1Wu authored and pan3793 committed
[KYUUBI #6990] Add rebalance before InsertIntoHiveDirCommand and InsertIntoDataSourceDirCommand to align with behaviors of hive
### Why are the changes needed?

When users switch from Hive to Spark, for SQL like INSERT OVERWRITE DIRECTORY AS SELECT, it would be great if small files could be automatically merged through simple configuration, just like in Hive.

### How was this patch tested?

Unit test.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #6991 from Z1Wu/feat/add_insert_dir_rebalance_support.

Closes #6990

2820bb2 [wuziyi] [fix] nit
a69c041 [wuziyi] [fix] nit
951a773 [wuziyi] [fix] nit
f75dfcb [wuziyi] [Feat] add rebalance before InsertIntoHiveDirCommand and InsertIntoDataSourceDirCommand to align with behaviors of hive

Authored-by: wuziyi <[email protected]>
Signed-off-by: Cheng Pan <[email protected]>
1 parent 338206e commit 2080c21
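For context, the user-facing effect of this commit is that, with the Kyuubi Spark SQL extension enabled, an INSERT OVERWRITE DIRECTORY ... SELECT gets a rebalance shuffle inserted above its query, so adaptive execution can coalesce small output files. A minimal sketch of the setup, assuming the extension jar is on the classpath; the string conf key is inferred from KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE referenced in the tests below and may differ across versions:

import org.apache.spark.sql.SparkSession

// Sketch only: enable the Kyuubi extension and the rebalance-before-write rule.
// The literal conf key string is an assumption; the tests below reference it
// via KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key.
val spark = SparkSession.builder()
  .config("spark.sql.extensions", "org.apache.kyuubi.sql.KyuubiSparkSQLExtension")
  .config("spark.sql.optimizer.insertRepartitionBeforeWriteIfNoShuffle", "true")
  .enableHiveSupport()
  .getOrCreate()

spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
// Both the Hive dir write (STORED AS ...) and the datasource dir write
// (USING ...) are covered by this change.
spark.sql("INSERT OVERWRITE DIRECTORY '/tmp/out' STORED AS PARQUET SELECT * FROM tmp_table")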

File tree

6 files changed (+150 −9 lines)


extensions/spark/kyuubi-extension-spark-3-3/src/main/scala/org/apache/kyuubi/sql/RepartitionBeforeWritingBase.scala

Lines changed: 10 additions & 2 deletions

@@ -20,9 +20,9 @@ package org.apache.kyuubi.sql
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
+import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, InsertIntoDataSourceDirCommand}
 import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
-import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveTable, OptimizedCreateHiveTableAsSelectCommand}
+import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveDirCommand, InsertIntoHiveTable, OptimizedCreateHiveTableAsSelectCommand}
 import org.apache.spark.sql.internal.StaticSQLConf

 trait RepartitionBuilder extends Rule[LogicalPlan] with RepartitionBeforeWriteHelper {
@@ -59,6 +59,10 @@ abstract class RepartitionBeforeWritingDatasourceBase extends RepartitionBuilder
         query.output.filter(attr => table.partitionColumnNames.contains(attr.name))
       c.copy(query = buildRepartition(dynamicPartitionColumns, query))

+    case i @ InsertIntoDataSourceDirCommand(_, _, query, _)
+        if query.resolved && canInsertRepartitionByExpression(query) =>
+      i.copy(query = buildRepartition(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRepartition))
@@ -101,6 +105,10 @@ abstract class RepartitionBeforeWritingHiveBase extends RepartitionBuilder
         query.output.filter(attr => table.partitionColumnNames.contains(attr.name))
       c.copy(query = buildRepartition(dynamicPartitionColumns, query))

+    case c @ InsertIntoHiveDirCommand(_, _, query, _, _)
+        if query.resolved && canInsertRepartitionByExpression(query) =>
+      c.copy(query = buildRepartition(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRepartition))
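Both new cases follow the pattern already used for table writes: if the child query is resolved and the guard allows it, wrap the query in a rebalance node. A simplified sketch of what buildRepartition plausibly reduces to for the Seq.empty case (the real helper in Kyuubi also handles partition expressions and config-driven variants, omitted here):

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions}

// Simplified sketch. With no dynamic partition columns (the Seq.empty case in
// the diff above) this is a keyless rebalance whose only job is to introduce a
// shuffle that AQE can use to coalesce small partitions before the write.
def buildRepartitionSketch(
    dynamicPartitionColumns: Seq[Attribute],
    query: LogicalPlan): LogicalPlan =
  RebalancePartitions(dynamicPartitionColumns, query)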

extensions/spark/kyuubi-extension-spark-3-3/src/test/scala/org/apache/spark/sql/RebalanceBeforeWritingSuite.scala

Lines changed: 40 additions & 2 deletions

@@ -19,9 +19,9 @@ package org.apache.spark.sql

 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions, Sort}
-import org.apache.spark.sql.execution.command.DataWritingCommand
+import org.apache.spark.sql.execution.command.{DataWritingCommand, InsertIntoDataSourceDirCommand}
 import org.apache.spark.sql.hive.HiveUtils
-import org.apache.spark.sql.hive.execution.OptimizedCreateHiveTableAsSelectCommand
+import org.apache.spark.sql.hive.execution.{InsertIntoHiveDirCommand, OptimizedCreateHiveTableAsSelectCommand}

 import org.apache.kyuubi.sql.KyuubiSQLConf

@@ -272,4 +272,42 @@ class RebalanceBeforeWritingSuite extends KyuubiSparkSQLExtensionTest
       }
     }
   }
+
+  test("Test rebalance in InsertIntoHiveDirCommand") {
+    withSQLConf(
+      HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false",
+      HiveUtils.CONVERT_METASTORE_CTAS.key -> "false",
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"STORED AS PARQUET SELECT * FROM tmp_table")
+        val insertHiveDirCommand = df.queryExecution.analyzed.collect {
+          case _: InsertIntoHiveDirCommand => true
+        }
+        assert(insertHiveDirCommand.size == 1)
+        val repartition = df.queryExecution.analyzed.collect {
+          case _: RebalancePartitions => true
+        }
+        assert(repartition.size == 1)
+      })
+    }
+  }
+
+  test("Test rebalance in InsertIntoDataSourceDirCommand") {
+    withSQLConf(
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"USING PARQUET SELECT * FROM tmp_table")
+        assert(df.queryExecution.analyzed.isInstanceOf[InsertIntoDataSourceDirCommand])
+        val repartition =
+          df.queryExecution.analyzed.asInstanceOf[InsertIntoDataSourceDirCommand].query.collect {
+            case _: RebalancePartitions => true
+          }
+        assert(repartition.size == 1)
+      })
+    }
+  }
 }
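A note on the test setup, as far as the diff lets one infer: CONVERT_METASTORE_PARQUET and CONVERT_METASTORE_CTAS are forced to false so the analyzed plan keeps an InsertIntoHiveDirCommand; with conversion enabled, Spark would presumably rewrite the write onto the datasource path, which is the case the second test exercises directly via USING PARQUET.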

extensions/spark/kyuubi-extension-spark-3-4/src/main/scala/org/apache/kyuubi/sql/RepartitionBeforeWritingBase.scala

Lines changed: 10 additions & 1 deletion

@@ -20,8 +20,9 @@ package org.apache.kyuubi.sql
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.command.InsertIntoDataSourceDirCommand
 import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
+import org.apache.spark.sql.hive.execution.{InsertIntoHiveDirCommand, InsertIntoHiveTable}
 import org.apache.spark.sql.internal.StaticSQLConf

 trait RepartitionBuilder extends Rule[LogicalPlan] with RepartitionBeforeWriteHelper {
@@ -52,6 +53,10 @@ abstract class RepartitionBeforeWritingDatasourceBase extends RepartitionBuilder
       val dynamicPartitionColumns = pc.filterNot(attr => sp.contains(attr.name))
       i.copy(query = buildRepartition(dynamicPartitionColumns, query))

+    case i @ InsertIntoDataSourceDirCommand(_, _, query, _)
+        if query.resolved && canInsertRepartitionByExpression(query) =>
+      i.copy(query = buildRepartition(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRepartition))
@@ -82,6 +87,10 @@ abstract class RepartitionBeforeWritingHiveBase extends RepartitionBuilder
         .flatMap(name => query.output.find(_.name == name)).toSeq
       i.copy(query = buildRepartition(dynamicPartitionColumns, query))

+    case i @ InsertIntoHiveDirCommand(_, _, query, _, _)
+        if query.resolved && canInsertRepartitionByExpression(query) =>
+      i.copy(query = buildRepartition(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRepartition))

extensions/spark/kyuubi-extension-spark-3-4/src/test/scala/org/apache/spark/sql/RebalanceBeforeWritingSuite.scala

Lines changed: 40 additions & 2 deletions

@@ -19,10 +19,10 @@ package org.apache.spark.sql

 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions, Sort}
-import org.apache.spark.sql.execution.command.DataWritingCommand
+import org.apache.spark.sql.execution.command.{DataWritingCommand, InsertIntoDataSourceDirCommand}
 import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
 import org.apache.spark.sql.hive.HiveUtils
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
+import org.apache.spark.sql.hive.execution.{InsertIntoHiveDirCommand, InsertIntoHiveTable}

 import org.apache.kyuubi.sql.KyuubiSQLConf

@@ -267,4 +267,42 @@ class RebalanceBeforeWritingSuite extends KyuubiSparkSQLExtensionTest
       }
     }
   }
+
+  test("Test rebalance in InsertIntoHiveDirCommand") {
+    withSQLConf(
+      HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false",
+      HiveUtils.CONVERT_METASTORE_CTAS.key -> "false",
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"STORED AS PARQUET SELECT * FROM tmp_table")
+        val insertHiveDirCommand = df.queryExecution.analyzed.collect {
+          case _: InsertIntoHiveDirCommand => true
+        }
+        assert(insertHiveDirCommand.size == 1)
+        val repartition = df.queryExecution.analyzed.collect {
+          case _: RebalancePartitions => true
+        }
+        assert(repartition.size == 1)
+      })
+    }
+  }
+
+  test("Test rebalance in InsertIntoDataSourceDirCommand") {
+    withSQLConf(
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"USING PARQUET SELECT * FROM tmp_table")
+        assert(df.queryExecution.analyzed.isInstanceOf[InsertIntoDataSourceDirCommand])
+        val repartition =
+          df.queryExecution.analyzed.asInstanceOf[InsertIntoDataSourceDirCommand].query.collect {
+            case _: RebalancePartitions => true
+          }
+        assert(repartition.size == 1)
+      })
+    }
+  }
 }

extensions/spark/kyuubi-extension-spark-3-5/src/main/scala/org/apache/kyuubi/sql/RebalanceBeforeWriting.scala

Lines changed: 10 additions & 1 deletion

@@ -23,8 +23,9 @@ import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.command.InsertIntoDataSourceDirCommand
 import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
+import org.apache.spark.sql.hive.execution.{InsertIntoHiveDirCommand, InsertIntoHiveTable}
 import org.apache.spark.sql.internal.StaticSQLConf

 trait RebalanceBeforeWritingBase extends Rule[LogicalPlan] {
@@ -112,6 +113,10 @@ case class RebalanceBeforeWritingDatasource(session: SparkSession)
       val dynamicPartitionColumns = pc.filterNot(attr => sp.contains(attr.name))
       i.copy(query = buildRebalance(dynamicPartitionColumns, query))

+    case i @ InsertIntoDataSourceDirCommand(_, _, query, _)
+        if query.resolved && canInsertRebalance(query) =>
+      i.copy(query = buildRebalance(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRebalance))
@@ -144,6 +149,10 @@ case class RebalanceBeforeWritingHive(session: SparkSession)
         .flatMap(name => query.output.find(_.name == name)).toSeq
       i.copy(query = buildRebalance(dynamicPartitionColumns, query))

+    case i @ InsertIntoHiveDirCommand(_, _, query, _, _)
+        if query.resolved && canInsertRebalance(query) =>
+      i.copy(query = buildRebalance(Seq.empty, query))
+
     case u @ Union(children, _, _) =>
       u.copy(children = children.map(addRebalance))
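Unlike the 3.3/3.4 abstract bases, the 3.5 rules are case classes taking the SparkSession directly, which matches the shape SparkSessionExtensions expects from a rule builder. A hypothetical wiring sketch; injectPostHocResolutionRule is a real SparkSessionExtensions API, but whether Kyuubi's KyuubiSparkSQLExtension uses exactly this hook for these rules is an assumption:

import org.apache.spark.sql.SparkSessionExtensions
import org.apache.kyuubi.sql.{RebalanceBeforeWritingDatasource, RebalanceBeforeWritingHive}

// Hypothetical sketch of how the rules above could be registered. The case
// class companions conform to SparkSession => Rule[LogicalPlan], so they can
// be passed to the injector as-is.
class ExampleRebalanceExtension extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    extensions.injectPostHocResolutionRule(RebalanceBeforeWritingDatasource)
    extensions.injectPostHocResolutionRule(RebalanceBeforeWritingHive)
  }
}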

extensions/spark/kyuubi-extension-spark-3-5/src/test/scala/org/apache/spark/sql/RebalanceBeforeWritingSuite.scala

Lines changed: 40 additions & 1 deletion

@@ -20,9 +20,10 @@ package org.apache.spark.sql
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions, Sort}
 import org.apache.spark.sql.execution.command.DataWritingCommand
+import org.apache.spark.sql.execution.command.InsertIntoDataSourceDirCommand
 import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
 import org.apache.spark.sql.hive.HiveUtils
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
+import org.apache.spark.sql.hive.execution.{InsertIntoHiveDirCommand, InsertIntoHiveTable}

 import org.apache.kyuubi.sql.KyuubiSQLConf

@@ -292,4 +293,42 @@ class RebalanceBeforeWritingSuite extends KyuubiSparkSQLExtensionTest
       }
     }
   }
+
+  test("Test rebalance in InsertIntoHiveDirCommand") {
+    withSQLConf(
+      HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false",
+      HiveUtils.CONVERT_METASTORE_CTAS.key -> "false",
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"STORED AS PARQUET SELECT * FROM tmp_table")
+        val insertHiveDirCommand = df.queryExecution.analyzed.collect {
+          case _: InsertIntoHiveDirCommand => true
+        }
+        assert(insertHiveDirCommand.size == 1)
+        val repartition = df.queryExecution.analyzed.collect {
+          case _: RebalancePartitions => true
+        }
+        assert(repartition.size == 1)
+      })
+    }
+  }
+
+  test("Test rebalance in InsertIntoDataSourceDirCommand") {
+    withSQLConf(
+      KyuubiSQLConf.INSERT_REPARTITION_BEFORE_WRITE_IF_NO_SHUFFLE.key -> "true") {
+      withTempDir(tmpDir => {
+        spark.range(0, 1000, 1, 10).createOrReplaceTempView("tmp_table")
+        val df = sql(s"INSERT OVERWRITE DIRECTORY '${tmpDir.getPath}' " +
+          s"USING PARQUET SELECT * FROM tmp_table")
+        assert(df.queryExecution.analyzed.isInstanceOf[InsertIntoDataSourceDirCommand])
+        val repartition =
+          df.queryExecution.analyzed.asInstanceOf[InsertIntoDataSourceDirCommand].query.collect {
+            case _: RebalancePartitions => true
+          }
+        assert(repartition.size == 1)
+      })
+    }
+  }
 }
