create PruneShuffleAndSort physical rule

bmarcott · bmarcott · commit 1916ef9b679a · 2020-01-14T02:14:40.000-08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.dynamicpruning.PlanDynamicPruningFilters
 import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
-import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
+import org.apache.spark.sql.execution.exchange.{EnsureRequirements, PruneShuffleAndSort, ReuseExchange}
 import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.streaming.OutputMode
@@ -279,6 +279,7 @@ object QueryExecution {
       PlanDynamicPruningFilters(sparkSession),
       PlanSubqueries(sparkSession),
       EnsureRequirements(sparkSession.sessionState.conf),
+      PruneShuffleAndSort(),
       ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf,
         sparkSession.sessionState.columnarRules),
       CollapseCodegenStages(sparkSession.sessionState.conf),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -217,12 +217,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
   }
 
   def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
-    // TODO: remove this after we create a physical operator for `RepartitionByExpression`.
-    case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) =>
-      child.outputPartitioning match {
-        case lower: HashPartitioning if upper.semanticEquals(lower) => child
-        case _ => operator
-      }
     case operator: SparkPlan =>
       ensureDistributionAndOrdering(reorderJoinPredicates(operator))
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/PruneShuffleAndSort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/PruneShuffleAndSort.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.exchange
+
+import org.apache.spark.sql.catalyst.expressions.SortOrder
+import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, PartitioningCollection}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{SortExec, SparkPlan}
+
+/**
+ * Removes shuffles and per-partition sorts that are redundant given what their child already
+ * provides.
+ *
+ *  - A [[ShuffleExchangeExec]] that requests a [[HashPartitioning]] is replaced by its child
+ *    when the child's output partitioning is, or contains, a semantically equal
+ *    [[HashPartitioning]].
+ *  - A non-global [[SortExec]] is replaced by its child when the child's output ordering
+ *    already satisfies the requested ordering. Global sorts are never matched.
+ */
+case class PruneShuffleAndSort() extends Rule[SparkPlan] {
+
+  /**
+   * Rewrites `plan` bottom-up, dropping redundant shuffle and sort nodes.
+   *
+   * @param plan physical plan to prune
+   * @return the plan with every redundant shuffle/sort replaced by its child
+   */
+  override def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
+    case ShuffleExchangeExec(required: HashPartitioning, child, _)
+        if providesPartitioning(child.outputPartitioning, required) =>
+      child
+    // The `false` literal restricts this case to non-global (per-partition) sorts.
+    case SortExec(requiredOrdering, false, child, _)
+        if SortOrder.orderingSatisfies(child.outputOrdering, requiredOrdering) =>
+      child
+    // No catch-all case: nodes matching neither pattern are left untouched by transformUp.
+  }
+
+  /**
+   * Tells whether `actual` already provides the `required` hash partitioning.
+   *
+   * A [[PartitioningCollection]] is searched recursively, so nested collections are covered.
+   * Any partitioning that is neither a [[HashPartitioning]] nor a collection yields `false`
+   * rather than a `MatchError`.
+   *
+   * @param actual   output partitioning reported by the shuffle's child
+   * @param required hash partitioning requested by the shuffle
+   * @return `true` when the shuffle would not change how the data is partitioned
+   */
+  private def providesPartitioning(actual: Partitioning, required: HashPartitioning): Boolean =
+    actual match {
+      case hash: HashPartitioning => required.semanticEquals(hash)
+      case PartitioningCollection(partitionings) =>
+        partitionings.exists(providesPartitioning(_, required))
+      case _ => false
+    }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan,
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec}
-import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ReuseExchange, ShuffleExchangeExec}
+import org.apache.spark.sql.execution.exchange.{EnsureRequirements, PruneShuffleAndSort, ReusedExchangeExec, ReuseExchange, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -482,7 +482,7 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper {
     val inputPlan = ShuffleExchangeExec(
       partitioning,
       DummySparkPlan(outputPartitioning = partitioning))
-    val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan)
+    val outputPlan = PruneShuffleAndSort().apply(inputPlan)
     assertDistributionRequirementsAreSatisfied(outputPlan)
     if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) {
       fail(s"Topmost Exchange should not have been eliminated:\n$outputPlan")
@@ -775,6 +775,42 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("SPARK-28148: repartition after join is not optimized away") {
+    val smallRange = spark.range(0, 5000000, 1, 5)
+    val largeRange = spark.range(0, 10000000, 1, 5)
+
+    // After a left join, the partitioning and per-partition ordering on the join key should
+    // be reused: expect exactly two sorts and two shuffles in the executed plan.
+    val leftJoinPlan = smallRange.join(largeRange, Seq("id"), "left")
+      .repartition(smallRange("id"))
+      .sortWithinPartitions(smallRange("id"))
+      .queryExecution.executedPlan
+    val leftJoinSorts = leftJoinPlan.collect { case sort: SortExec => sort }
+    val leftJoinShuffles = leftJoinPlan.collect { case shuffle: ShuffleExchangeExec => shuffle }
+    assert(leftJoinSorts.length == 2)
+    assert(leftJoinShuffles.length == 2)
+
+    // Same expectation after an inner join.
+    val innerJoinPlan = smallRange.join(largeRange, Seq("id"))
+      .repartition(smallRange("id"))
+      .sortWithinPartitions(smallRange("id"))
+      .queryExecution.executedPlan
+    val innerJoinSorts = innerJoinPlan.collect { case sort: SortExec => sort }
+    val innerJoinShuffles = innerJoinPlan.collect { case shuffle: ShuffleExchangeExec => shuffle }
+    assert(innerJoinSorts.length == 2)
+    assert(innerJoinShuffles.length == 2)
+
+    // A global sort must be kept: expect three sorts and three shuffles.
+    val globalSortPlan = smallRange.join(largeRange, Seq("id"))
+      .orderBy(smallRange("id"))
+      .queryExecution.executedPlan
+    val globalSortSorts = globalSortPlan.collect { case sort: SortExec => sort }
+    val globalSortShuffles = globalSortPlan.collect { case shuffle: ShuffleExchangeExec => shuffle }
+    assert(globalSortSorts.length == 3)
+    assert(globalSortShuffles.length == 3)
+  }
+
   test("SPARK-24500: create union with stream of children") {
     val df = Union(Stream(
       Range(1, 1, 1, 1),

Original file line number	Diff line number	Diff line change
`@@ -217,12 +217,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {`
`217`	`217`	`}`
`218`	`218`
`219`	`219`	`def apply(plan: SparkPlan): SparkPlan = plan.transformUp {`
`220`		- // TODO: remove this after we create a physical operator for `RepartitionByExpression`.
`221`		`- case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) =>`
`222`		`- child.outputPartitioning match {`
`223`		`- case lower: HashPartitioning if upper.semanticEquals(lower) => child`
`224`		`- case _ => operator`
`225`		`- }`
`226`	`220`	`case operator: SparkPlan =>`
`227`	`221`	`ensureDistributionAndOrdering(reorderJoinPredicates(operator))`
`228`	`222`	`}`