Merged
20 changes: 1 addition & 19 deletions .circleci/config.yml
@@ -28,9 +28,6 @@ all-branches-and-tags: &all-branches-and-tags
# Step templates

step_templates:
restore-build-binaries-cache: &restore-build-binaries-cache
Author: This had to be removed because it picks up an older build/sbt from the cache and doesn't get the updated versions.

restore_cache:
key: build-binaries-{{ checksum "build/mvn" }}-{{ checksum "build/sbt" }}
restore-ivy-cache: &restore-ivy-cache
restore_cache:
keys:
@@ -136,20 +133,11 @@ jobs:
- maven-dependency-cache-{{ checksum "pom.xml" }}
# Fallback - see https://circleci.com/docs/2.0/configuration-reference/#example-2
- maven-dependency-cache-
# Given the build-maven cache, this is superfluous, but leave it in in case we will want to remove the former
- restore_cache:
keys:
- build-binaries-{{ checksum "build/mvn" }}-{{ checksum "build/sbt" }}
- build-binaries-
- run:
command: ./build/mvn -DskipTests -Psparkr -Phadoop-palantir install
no_output_timeout: 20m
# Get sbt to run trivially, ensures its launcher is downloaded under build/
- run: ./build/sbt -h || true
- save_cache:
key: build-binaries-{{ checksum "build/mvn" }}-{{ checksum "build/sbt" }}
paths:
- ./build
- save_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
paths:
@@ -165,7 +153,6 @@ jobs:
# Failed to execute goal on project spark-assembly_2.11: Could not resolve dependencies for project org.apache.spark:spark-assembly_2.11:pom:2.4.0-SNAPSHOT
- restore_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
- *restore-build-binaries-cache
- run:
name: Run style tests
command: dev/run-style-tests.py
@@ -181,7 +168,6 @@ jobs:
# key: build-maven-{{ .Branch }}-{{ .BuildNum }}
- restore_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
- *restore-build-binaries-cache
- run: |
dev/run-build-tests.py | tee /tmp/run-build-tests.log
- store_artifacts:
@@ -206,7 +192,6 @@ jobs:
fi
- *restore-ivy-cache
- *restore-home-sbt-cache
- *restore-build-binaries-cache
- run:
name: Download all external dependencies for the test configuration (which extends compile) and ensure we update first
command: dev/sbt test:externalDependencyClasspath oldDeps/test:externalDependencyClasspath
@@ -251,7 +236,6 @@ jobs:
- attach_workspace:
at: .
- *restore-ivy-cache
- *restore-build-binaries-cache
- *restore-home-sbt-cache
- run: |
dev/run-backcompat-tests.py | tee /tmp/run-backcompat-tests.log
@@ -305,7 +289,7 @@ jobs:
run-scala-tests:
<<: *test-defaults
# project/CirclePlugin.scala does its own test splitting in SBT based on CIRCLE_NODE_INDEX, CIRCLE_NODE_TOTAL
parallelism: 12
parallelism: 8
# Spark runs a lot of tests in parallel, we need 16 GB of RAM for this
resource_class: xlarge
steps:
@@ -320,7 +304,6 @@ jobs:
- *link-in-build-sbt-cache
# ---
- *restore-ivy-cache
- *restore-build-binaries-cache
- *restore-home-sbt-cache
- restore_cache:
keys:
@@ -407,7 +390,6 @@ jobs:
- *checkout-code
- restore_cache:
key: maven-dependency-cache-{{ checksum "pom.xml" }}
- *restore-build-binaries-cache
- run:
command: dev/set_version_and_package.sh
no_output_timeout: 15m
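As the YAML comment in one of the hunks above notes, project/CirclePlugin.scala shards the Scala test suites across CircleCI containers using CIRCLE_NODE_INDEX and CIRCLE_NODE_TOTAL, so dropping parallelism from 12 to 8 simply hands each container a larger shard. A rough sketch of that style of splitting (an assumption about the general shape, not the plugin's actual code, which also accounts for timing data and test discovery):

```scala
// Illustrative CircleCI-style test splitting: each of the N parallel containers keeps
// every test class whose position modulo CIRCLE_NODE_TOTAL matches its CIRCLE_NODE_INDEX.
object TestSplitSketch {
  def shardFor(testClasses: Seq[String], nodeIndex: Int, nodeTotal: Int): Seq[String] =
    testClasses.zipWithIndex.collect { case (name, i) if i % nodeTotal == nodeIndex => name }

  def main(args: Array[String]): Unit = {
    val classes = (1 to 10).map(i => s"org.apache.spark.ExampleSuite$i") // hypothetical suite names
    val index = sys.env.getOrElse("CIRCLE_NODE_INDEX", "0").toInt
    val total = sys.env.getOrElse("CIRCLE_NODE_TOTAL", "1").toInt
    shardFor(classes, index, total).foreach(println)
  }
}
```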
17 changes: 17 additions & 0 deletions .sbtopts
@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

-J-Xmx4G
-J-Xss4m
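The two -J options in .sbtopts are passed through to the JVM that runs sbt: -J-Xmx4G raises the maximum heap to 4 GB and -J-Xss4m raises the thread stack size to 4 MB, which the Scala compiler's deeply recursive phases tend to need on a codebase this size. A quick sanity-check sketch (illustrative, not part of the PR) of how to confirm the heap option is in effect on a JVM launched with these flags:

```scala
// Sanity-check sketch: when run on a JVM started with the options above,
// the reported max heap should be roughly 4 GB.
object SbtOptsCheck {
  def main(args: Array[String]): Unit = {
    val maxHeapGb = Runtime.getRuntime.maxMemory.toDouble / (1024L * 1024 * 1024)
    println(f"Max heap: $maxHeapGb%.2f GB") // expect ~4.00 when -Xmx4G is in effect
  }
}
```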
2 changes: 1 addition & 1 deletion build/sbt-launch-lib.bash
@@ -39,7 +39,7 @@ dlog () {

acquire_sbt_jar () {
SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties`
URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
URL1=https://repo1.maven.org/maven2/org/scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch-${SBT_VERSION}.jar
JAR=build/sbt-launch-${SBT_VERSION}.jar

sbt_jar=$JAR
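The launcher download moves off Bintray (which has since been shut down) to Maven Central, where the jar name carries the version suffix in addition to the versioned directory. A small sketch of how the new URL is assembled from sbt.version (illustrative only; the real logic stays in the bash function above):

```scala
// Illustrative only; build/sbt-launch-lib.bash does this in bash using sbt.version from
// project/build.properties. Note the version appears both in the path and in the jar
// file name, unlike the old Bintray layout (plain sbt-launch.jar).
object SbtLaunchUrl {
  def mavenCentralUrl(sbtVersion: String): String =
    s"https://repo1.maven.org/maven2/org/scala-sbt/sbt-launch/$sbtVersion/sbt-launch-$sbtVersion.jar"

  def main(args: Array[String]): Unit =
    println(mavenCentralUrl("1.4.9")) // e.g.; the script reads the real value from build.properties
}
```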
3 changes: 2 additions & 1 deletion dev/run-tests.py
@@ -390,7 +390,8 @@ def build_spark_assembly_sbt(extra_profiles, checkstyle=False):
if checkstyle:
run_java_style_checks(build_profiles)

build_spark_unidoc_sbt(extra_profiles)
# TODO(lmartini): removed because broken, checks generated classes
# build_spark_unidoc_sbt(extra_profiles)
Author: We don't need unidoc, and this broke while attempting to generate docs for autogenerated files. I didn't want to invest too much in this, but can try if needed.



def build_apache_spark(build_tool, extra_profiles):
6 changes: 3 additions & 3 deletions project/CirclePlugin.scala
@@ -288,8 +288,8 @@ object CirclePlugin extends AutoPlugin {
}
},

test := (test, copyTestReportsToCircle) { (test, copy) =>
test.doFinally(copy.map(_ => ()))
}.value
test := (test andFinally Def.taskDyn {
copyTestReportsToCircle
}).value
Author: Again, sbt got stricter and the fancy tuple syntax was removed; this is the equivalent of the code above. (A minimal sketch of the sbt 1.x pattern follows this diff.)

))
}
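For context on the author comment above: sbt 1.x dropped the 0.13-era tuple syntax for combining tasks, so the plugin now attaches the report-copying step with andFinally, which behaves like try/finally for tasks. A minimal build.sbt-style sketch of the pattern (the side effect is a placeholder, not the plugin's real copy logic):

```scala
// Minimal sketch of the sbt 1.x finalizer pattern. The block passed to andFinally runs
// whether the test task succeeds or fails, mirroring the doFinally behaviour removed above.
Test / test := (Test / test)
  .andFinally {
    // in CirclePlugin.scala this is where the JUnit XML reports get copied for CircleCI
    println("tests finished; collecting reports")
  }
  .value
```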
17 changes: 9 additions & 8 deletions project/MimaBuild.scala
@@ -22,9 +22,7 @@ import com.typesafe.tools.mima.core._
import com.typesafe.tools.mima.core.MissingClassProblem
import com.typesafe.tools.mima.core.MissingTypesProblem
import com.typesafe.tools.mima.core.ProblemFilters._
import com.typesafe.tools.mima.plugin.MimaKeys.{mimaBinaryIssueFilters, mimaPreviousArtifacts}
import com.typesafe.tools.mima.plugin.MimaPlugin.mimaDefaultSettings

import com.typesafe.tools.mima.plugin.MimaKeys.{mimaBinaryIssueFilters, mimaPreviousArtifacts, mimaFailOnNoPrevious}

object MimaBuild {

@@ -86,14 +84,17 @@ object MimaBuild {
ignoredMembers.flatMap(excludeMember) ++ MimaExcludes.excludes(currentSparkVersion)
}

def mimaSettings(sparkHome: File, projectRef: ProjectRef) = {
def mimaSettings(sparkHome: File, projectRef: ProjectRef): Seq[Setting[_]] = {
val organization = "org.apache.spark"
val previousSparkVersion = "2.4.0"
val previousSparkVersion = "3.0.0"
Author: This file is cleanly picked from upstream. It seems they forgot to do this bump earlier and added it with this PR out of convenience (context: apache#29286 (comment) and apache#22977 (comment)).

Author: If we keep the previous version at 2.4.0, the number of binary breaks is 200+, so this is the best option anyway. (A minimal MiMa settings sketch follows this diff.)

val project = projectRef.project
val fullId = "spark-" + project + "_2.12"
mimaDefaultSettings ++
Seq(mimaPreviousArtifacts := Set(organization % fullId % previousSparkVersion),
mimaBinaryIssueFilters ++= ignoredABIProblems(sparkHome, version.value))

Seq(
mimaFailOnNoPrevious := true,
mimaPreviousArtifacts := Set(organization % fullId % previousSparkVersion),
mimaBinaryIssueFilters ++= ignoredABIProblems(sparkHome, version.value)
)
}

}
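The net effect of this change: the explicit mimaDefaultSettings mix-in is dropped, the settings are listed directly, the comparison baseline moves to 3.0.0, and mimaFailOnNoPrevious makes MiMa fail loudly when a module has no previous artifact configured instead of silently skipping the check. A minimal build.sbt-style sketch of the same wiring (the artifact coordinates and the filter target are illustrative, not the fork's actual configuration):

```scala
// Illustrative MiMa wiring: compare against a previous release and suppress one known,
// intentional break. Run with: sbt mimaReportBinaryIssues
import com.typesafe.tools.mima.core._
import com.typesafe.tools.mima.plugin.MimaKeys.{mimaBinaryIssueFilters, mimaFailOnNoPrevious, mimaPreviousArtifacts}

mimaFailOnNoPrevious := true
mimaPreviousArtifacts := Set("org.apache.spark" % "spark-core_2.12" % "3.0.0")
mimaBinaryIssueFilters ++= Seq(
  // hypothetical filter target, shown only to illustrate the exclude syntax
  ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.internal.SomeRemovedClass")
)
```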
38 changes: 38 additions & 0 deletions project/MimaExcludes.scala
@@ -36,6 +36,44 @@ object MimaExcludes {

// Exclude rules for 3.0.x
lazy val v30excludes = v24excludes ++ Seq(
//[SPARK-21708][BUILD] Migrate build to sbt 1.x
// mima plugin update caused new incompatibilities to be detected
// core module
// TODO(lmartini): this group was originally on top of 3.1 but applied on 3.0 because we picked the above commit
// on top of 3.0
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.shuffle.sort.io.LocalDiskShuffleMapOutputWriter.commitAllPartitions"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.shuffle.api.ShuffleMapOutputWriter.commitAllPartitions"),
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.shuffle.api.ShuffleMapOutputWriter.commitAllPartitions"),
// mllib module
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.totalIterations"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.$init$"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.labels"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.truePositiveRateByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.falsePositiveRateByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.precisionByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.recallByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.fMeasureByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.fMeasureByLabel"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.accuracy"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedTruePositiveRate"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFalsePositiveRate"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedRecall"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedPrecision"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFMeasure"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFMeasure"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.roc"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.areaUnderROC"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.pr"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.fMeasureByThreshold"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.precisionByThreshold"),
ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.recallByThreshold"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.FMClassifier.trainImpl"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.FMRegressor.trainImpl"),
// TODO(lmartini): Additional excludes not in upstream but unique to palantir fork
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SparkContext.initializeForcefully"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SparkContext.initializeForcefully"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.broadcast.Broadcast.initializeForcefully"),
Author: These last 3 excludes had to be added in our fork only; it should be OK.


// [SPARK-23429][CORE] Add executor memory metrics to heartbeat and expose in executors REST API
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate.apply"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate.copy"),