
Commit 0413351

Sort dataframe and disable localcheckpoint of final dataframe
1 parent 56146cb commit 0413351

3 files changed: +26 −26 lines changed

modules/core/src/main/resources/reference.conf

+1

@@ -128,6 +128,7 @@
       "spark.memory.storageFraction": "0"
       "spark.databricks.delta.autoCompact.enabled": "false"
       "spark.scheduler.mode": "FAIR"
+      "spark.sql.adaptive.enabled": "false"
     }
     "gcpUserAgent": ${gcpUserAgent}
   }
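
The new default above disables Spark's adaptive query execution, presumably so that AQE cannot re-plan the shuffle partitioning chosen for the final sorted DataFrame (see SparkUtils.scala below). A minimal sketch of checking the flag on a running session; the local-mode builder settings and object name here are illustrative, not the loader's real session setup:

import org.apache.spark.sql.SparkSession

object CheckAdaptiveDisabled {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("check-aqe")
      .config("spark.sql.adaptive.enabled", "false") // same default the commit adds to reference.conf
      .getOrCreate()

    // Prints "false": adaptive execution will not coalesce or re-split the final sort's partitions
    println(spark.conf.get("spark.sql.adaptive.enabled"))

    spark.stop()
  }
}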

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/LakeWriter.scala

+2 −14

@@ -81,10 +81,9 @@ object LakeWriter {
     }
     for {
       session <- SparkUtils.session[F](config, w, target.location)
-      writerParallelism = chooseWriterParallelism()
       mutex1 <- Resource.eval(Mutex[F])
       mutex2 <- Resource.eval(Mutex[F])
-    } yield impl(session, w, writerParallelism, mutex1, mutex2)
+    } yield impl(session, w, mutex1, mutex2)
   }

   def withHandledErrors[F[_]: Async](
@@ -142,7 +141,6 @@
   private def impl[F[_]: Sync](
     spark: SparkSession,
     w: Writer,
-    writerParallelism: Int,
     mutexForWriting: Mutex[F],
     mutexForUnioning: Mutex[F]
   ): LakeWriter[F] = new LakeWriter[F] {
@@ -165,22 +163,12 @@
     def commit(viewName: String): F[Unit] =
       for {
         df <- mutexForUnioning.lock.surround {
-                SparkUtils.prepareFinalDataFrame(spark, viewName, writerParallelism)
+                SparkUtils.prepareFinalDataFrame(spark, viewName)
              }
         _ <- mutexForWriting.lock
                .surround {
                  w.write(df)
                }
       } yield ()
   }
-
-  /**
-   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
-   * because this speeds up how quickly we can sink a batch.
-   *
-   * But leave 1 processor always available, so that we are never blocked when trying to save one of
-   * the intermediate dataframes.
-   */
-  private def chooseWriterParallelism(): Int =
-    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
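
For context, the commit keeps the two-mutex structure of commit while dropping the writerParallelism argument from impl. Below is a minimal cats-effect sketch of that pattern; the object name and the prepare/write placeholders are hypothetical stand-ins for preparing the final DataFrame and writing it to the lake, not the project's code:

import cats.effect.{IO, IOApp}
import cats.effect.std.Mutex

object TwoPhaseCommitSketch extends IOApp.Simple {

  // One mutex serializes the "prepare the final DataFrame" phase, the other serializes the write,
  // so those phases of different batches never interfere with each other.
  def commit(
    prepare: IO[String],
    write: String => IO[Unit],
    mutexForUnioning: Mutex[IO],
    mutexForWriting: Mutex[IO]
  ): IO[Unit] =
    for {
      df <- mutexForUnioning.lock.surround(prepare)  // only one fiber prepares at a time
      _ <- mutexForWriting.lock.surround(write(df))  // only one fiber writes at a time
    } yield ()

  def run: IO[Unit] =
    for {
      mutexForUnioning <- Mutex[IO]
      mutexForWriting <- Mutex[IO]
      _ <- commit(IO.pure("batch-1"), s => IO.println(s"writing $s"), mutexForUnioning, mutexForWriting)
    } yield ()
}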

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/SparkUtils.scala

+23 −12

@@ -18,7 +18,7 @@ import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger

 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.sql.functions.{col, current_timestamp}
+import org.apache.spark.sql.functions.current_timestamp
 import org.apache.spark.sql.types.StructType

 import com.snowplowanalytics.snowplow.lakes.Config
@@ -63,7 +63,8 @@ private[processing] object SparkUtils {
   private def sparkConfigOptions(config: Config.Spark, writer: Writer): Map[String, String] = {
     val gcpUserAgentKey = "fs.gs.storage.http.headers.user-agent"
     val gcpUserAgentValue = s"${config.gcpUserAgent.productName}/lake-loader (GPN:Snowplow;)"
-    writer.sparkConfig ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
+    val shuffleKey = "spark.sql.shuffle.partitions"
+    writer.sparkConfig + (shuffleKey -> chooseWriterParallelism().show) ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
   }

   def initializeLocalDataFrame[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
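
The merged option map above is built left to right: + and ++ share precedence in Scala, so the computed spark.sql.shuffle.partitions default can still be overridden by anything supplied in config.conf, and the GCP user-agent header is applied last. A small sketch with plain Maps (all names and values hypothetical) showing that override order:

object SparkConfMergeSketch {
  def main(args: Array[String]): Unit = {
    // All values below are made up; only the override order matters.
    val writerSparkConfig = Map("spark.sql.catalog.example" -> "delta")  // stand-in for writer.sparkConfig
    val userConf = Map("spark.sql.shuffle.partitions" -> "16")           // stand-in for config.conf
    val shuffleDefault = "spark.sql.shuffle.partitions" -> "3"           // stand-in for chooseWriterParallelism().show
    val gcpUserAgent = "fs.gs.storage.http.headers.user-agent" -> "example/lake-loader (GPN:Snowplow;)"

    // Same left-to-right shape as sparkConfigOptions above; later operands win on duplicate keys
    val merged = writerSparkConfig + shuffleDefault ++ userConf + gcpUserAgent

    println(merged("spark.sql.shuffle.partitions")) // prints 16: a user-supplied value still overrides the default
  }
}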

@@ -94,21 +95,31 @@

   def prepareFinalDataFrame[F[_]: Sync](
     spark: SparkSession,
-    viewName: String,
-    writerParallelism: Int
+    viewName: String
   ): F[DataFrame] =
-    Sync[F].blocking {
-      spark
-        .table(viewName)
-        .withColumn("load_tstamp", current_timestamp())
-        .repartition(col("event_name"))
-        .coalesce(writerParallelism)
-        .localCheckpoint()
-    }
+    Logger[F].debug(s"Analyzing final DataFrame $viewName") >>
+      Sync[F].blocking {
+        val ret = spark
+          .table(viewName)
+          .withColumn("load_tstamp", current_timestamp())
+          .sort("event_name")
+        ret.queryExecution.assertAnalyzed()
+        ret
+      } <* Logger[F].debug(s"Finished analyzing final DataFrame $viewName")

   def dropView[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
     Logger[F].info(s"Removing Spark data frame $viewName from local disk...") >>
       Sync[F].blocking {
         spark.catalog.dropTempView(viewName)
       }.void
+
+  /**
+   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
+   * because this speeds up how quickly we can sink a batch.
+   *
+   * But leave 1 processor always available, so that we are never blocked when trying to save one of
+   * the intermediate dataframes.
+   */
+  private def chooseWriterParallelism(): Int =
+    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
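
The final DataFrame is now globally sorted by event_name instead of being repartitioned, coalesced and locally checkpointed. With adaptive execution disabled, the sort's shuffle is planned with spark.sql.shuffle.partitions partitions, which appears to be why that setting is now pinned to chooseWriterParallelism() in sparkConfigOptions. A standalone sketch of the same transformation on toy data; the local session, object name and values are illustrative, not the loader's code:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.current_timestamp

object SortedFinalFrameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("sorted-final-frame")
      .config("spark.sql.adaptive.enabled", "false")
      .config("spark.sql.shuffle.partitions", "3") // stands in for chooseWriterParallelism()
      .getOrCreate()
    import spark.implicits._

    val events = Seq("page_view", "page_ping", "transaction", "page_view").toDF("event_name")

    // Same shape as the new prepareFinalDataFrame: a load_tstamp column plus a global sort,
    // with no repartition/coalesce/localCheckpoint step.
    val finalDf = events
      .withColumn("load_tstamp", current_timestamp())
      .sort("event_name")

    // The sort shuffles into at most spark.sql.shuffle.partitions partitions
    // (range partitioning may produce fewer for tiny inputs like this one).
    println(finalDf.rdd.getNumPartitions)
    finalDf.show(false)

    spark.stop()
  }
}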
