
Commit 2f2482f

Sort dataframe and disable localcheckpoint of final dataframe
1 parent 56146cb commit 2f2482f

4 files changed, +30 -39 lines

modules/core/src/main/resources/reference.conf

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@
     "spark.memory.storageFraction": "0"
     "spark.databricks.delta.autoCompact.enabled": "false"
     "spark.scheduler.mode": "FAIR"
+    "spark.sql.adaptive.enabled": "false"
   }
   "gcpUserAgent": ${gcpUserAgent}
 }
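For context, entries in this block become ordinary Spark configuration defaults; disabling adaptive query execution here presumably keeps the shuffle partitioning chosen elsewhere in this commit from being coalesced at runtime. A minimal sketch of how such defaults could be applied when building a session (the SessionSketch object, defaults map and builder wiring below are illustrative, not the loader's actual code):

import org.apache.spark.sql.SparkSession

object SessionSketch {
  // Hypothetical subset of the reference.conf defaults shown above
  private val defaults: Map[String, String] = Map(
    "spark.scheduler.mode"       -> "FAIR",
    "spark.sql.adaptive.enabled" -> "false" // the new default added in this commit
  )

  def build(): SparkSession =
    defaults
      .foldLeft(SparkSession.builder().appName("sketch").master("local[*]")) {
        case (builder, (key, value)) => builder.config(key, value) // each entry is passed straight to Spark
      }
      .getOrCreate()
}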

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/LakeWriter.scala

Lines changed: 5 additions & 24 deletions
@@ -81,10 +81,8 @@ object LakeWriter {
     }
     for {
       session <- SparkUtils.session[F](config, w, target.location)
-      writerParallelism = chooseWriterParallelism()
-      mutex1 <- Resource.eval(Mutex[F])
-      mutex2 <- Resource.eval(Mutex[F])
-    } yield impl(session, w, writerParallelism, mutex1, mutex2)
+      mutex <- Resource.eval(Mutex[F])
+    } yield impl(session, w, mutex)
   }

   def withHandledErrors[F[_]: Async](
@@ -129,22 +127,17 @@
   /**
    * Implementation of the LakeWriter
    *
-   * The mutexes are needed because we allow overlapping windows. They prevent two different windows
+   * The mutex is needed because we allow overlapping windows. It prevents two different windows
    * from trying to run the same expensive operation at the same time.
    *
    * @param mutexForWriting
    *   Makes sure there is only ever one spark job trying to write events to the lake. This is an
    *   IO-intensive task.
-   * @param mutexForUnioning
-   *   Makes sure there is only ever one spark job trying to union smaller DataFrames into a larger
-   *   DataFrame, immediately before writing to the lake. This is a cpu-intensive task.
    */
   private def impl[F[_]: Sync](
     spark: SparkSession,
     w: Writer,
-    writerParallelism: Int,
-    mutexForWriting: Mutex[F],
-    mutexForUnioning: Mutex[F]
+    mutexForWriting: Mutex[F]
   ): LakeWriter[F] = new LakeWriter[F] {
     def createTable: F[Unit] =
       w.prepareTable(spark)
@@ -164,23 +157,11 @@

     def commit(viewName: String): F[Unit] =
       for {
-        df <- mutexForUnioning.lock.surround {
-          SparkUtils.prepareFinalDataFrame(spark, viewName, writerParallelism)
-        }
+        df <- SparkUtils.prepareFinalDataFrame(spark, viewName)
         _ <- mutexForWriting.lock
           .surround {
             w.write(df)
           }
       } yield ()
   }
-
-  /**
-   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
-   * because this speeds up how quickly we can sink a batch.
-   *
-   * But leave 1 processor always available, so that we are never blocked when trying to save one of
-   * the intermediate dataframes.
-   */
-  private def chooseWriterParallelism(): Int =
-    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
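After this change only the write step is guarded by a mutex. A self-contained sketch of the Mutex#lock.surround pattern that commit() relies on, assuming cats-effect 3 (the MutexSketch object, doWrite helper and timings are made up for illustration):

import cats.effect.{IO, IOApp}
import cats.effect.std.Mutex
import cats.syntax.all._
import scala.concurrent.duration._

object MutexSketch extends IOApp.Simple {
  // Stand-in for w.write(df): an IO-intensive task we never want to run twice at once
  private def doWrite(window: Int): IO[Unit] =
    IO.println(s"writing window $window") >> IO.sleep(100.millis)

  def run: IO[Unit] =
    Mutex[IO].flatMap { mutex =>
      // Two overlapping windows try to write concurrently; surround makes them take turns
      List(1, 2).parTraverse_(w => mutex.lock.surround(doWrite(w)))
    }
}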

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/SparkUtils.scala

Lines changed: 21 additions & 12 deletions
@@ -18,7 +18,7 @@ import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger

 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.sql.functions.{col, current_timestamp}
+import org.apache.spark.sql.functions.current_timestamp
 import org.apache.spark.sql.types.StructType

 import com.snowplowanalytics.snowplow.lakes.Config
@@ -63,7 +63,8 @@ private[processing] object SparkUtils {
   private def sparkConfigOptions(config: Config.Spark, writer: Writer): Map[String, String] = {
     val gcpUserAgentKey = "fs.gs.storage.http.headers.user-agent"
     val gcpUserAgentValue = s"${config.gcpUserAgent.productName}/lake-loader (GPN:Snowplow;)"
-    writer.sparkConfig ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
+    val shuffleKey = "spark.sql.shuffle.partitions"
+    writer.sparkConfig + (shuffleKey -> chooseWriterParallelism().show) ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
   }

   def initializeLocalDataFrame[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
@@ -94,21 +95,29 @@

   def prepareFinalDataFrame[F[_]: Sync](
     spark: SparkSession,
-    viewName: String,
-    writerParallelism: Int
+    viewName: String
   ): F[DataFrame] =
-    Sync[F].blocking {
-      spark
-        .table(viewName)
-        .withColumn("load_tstamp", current_timestamp())
-        .repartition(col("event_name"))
-        .coalesce(writerParallelism)
-        .localCheckpoint()
-    }
+    Logger[F].debug(s"Analyzing final DataFrame $viewName") >>
+      Sync[F].delay {
+        spark
+          .table(viewName)
+          .withColumn("load_tstamp", current_timestamp())
+          .sort("event_name")
+      } <* Logger[F].debug(s"Finished analyzing final DataFrame $viewName")

   def dropView[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
     Logger[F].info(s"Removing Spark data frame $viewName from local disk...") >>
       Sync[F].blocking {
         spark.catalog.dropTempView(viewName)
       }.void
+
+  /**
+   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
+   * because this speeds up how quickly we can sink a batch.
+   *
+   * But leave 1 processor always available, so that we are never blocked when trying to save one of
+   * the intermediate dataframes.
+   */
+  private def chooseWriterParallelism(): Int =
+    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
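The sort above replaces the old repartition/coalesce/localCheckpoint step: a global sort already introduces a shuffle, and with adaptive execution disabled the number of output partitions follows spark.sql.shuffle.partitions, which sparkConfigOptions now pins to chooseWriterParallelism(). A small standalone sketch of that behaviour (the SortPartitionsSketch object, local session, toy data and hard-coded value 3 are illustrative only):

import org.apache.spark.sql.SparkSession

object SortPartitionsSketch extends App {
  val spark = SparkSession.builder()
    .appName("sort-partitions-sketch")
    .master("local[*]")
    .config("spark.sql.adaptive.enabled", "false") // mirrors the new reference.conf default
    .config("spark.sql.shuffle.partitions", "3")   // stand-in for chooseWriterParallelism()
    .getOrCreate()
  import spark.implicits._

  val events = (1 to 1000).map(i => (i, s"event_$i")).toDF("id", "event_name")

  // The sort shuffles the data; with AQE off the shuffle keeps the configured partition count
  val sorted = events.sort("event_name")
  println(sorted.rdd.getNumPartitions) // typically 3 here

  spark.stop()
}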

project/Dependencies.scala

Lines changed: 3 additions & 3 deletions
@@ -15,12 +15,12 @@ object Dependencies {
   object Spark {

     // A version of Spark which is compatible with the current version of Iceberg and Delta
-    val forIcebergDelta = "3.5.3"
+    val forIcebergDelta = "3.5.4"
     val forIcebergDeltaMinor = "3.5"

     // Hudi can use a different version of Spark because we bundle a separate Docker image
     // This version of Spark must be compatible with the current version of Hudi
-    val forHudi = "3.5.3"
+    val forHudi = "3.5.4"
     val forHudiMinor = "3.5"
   }

@@ -35,7 +35,7 @@
     val delta = "3.2.1"
     val hudi = "0.15.0"
     val hudiAws = "1.0.0-beta2"
-    val iceberg = "1.6.1"
+    val iceberg = "1.7.1"
     val hadoop = "3.4.1"
     val gcsConnector = "hadoop3-2.2.25"
     val hive = "3.1.3"
