
Commit 56146cb

Repartition by event name before writing to the lake
Previously, our Iceberg writer used the [hash write distribution mode][1], because that is the default for Iceberg. In this mode, Spark repartitions the dataframe immediately before writing to the lake. After this commit, we explicitly repartition the dataframe as part of the existing Spark task that prepares the final dataframe. This means we can change the Iceberg write distribution mode to `none`. Overall this appears to improve the time taken to write a window of events to Iceberg.

This fixes a problem we found, in which the write phase could become too slow under high load (Iceberg only): specifically, a write was taking longer than the loader's "window", causing periods of low CPU usage while the loader's processing phase waited for the write phase to catch up.

This commit also removes the config option `writerParallelismFraction`. Before this commit, making the writer parallelism too high had a downside: it led to smaller file sizes. But now that we partition by event_name, we might as well make the writer parallelism as high as reasonably possible, which also speeds up the write phase of the loader.

Note: this improvement will not help Snowplow users who have changed the partition key to something different from our default. We might want to make a follow-up change in which the loader auto-discovers the lake's partition key. For example, some users might want to partition by `app_id` instead of `event_name`.

[1]: https://iceberg.apache.org/docs/1.7.1/spark-writes/#writing-distribution-modes
1 parent 5ca2239 commit 56146cb
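
To make the change concrete, here is a minimal sketch (not the loader's actual code) of the approach described in the commit message: repartition by event_name while preparing the dataframe, cap the writer parallelism, then append to Iceberg with the write distribution mode set to none so Spark does not shuffle again at write time. The method name, parameters and table identifier below are illustrative only.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

// Illustrative sketch only; names and signatures are hypothetical.
def writeWindow(events: DataFrame, writerParallelism: Int): Unit = {
  val prepared = events
    .repartition(col("event_name"))  // co-locate rows with the same event_name in one Spark partition
    .coalesce(writerParallelism)     // cap the number of concurrent write tasks (and open files)

  prepared.write
    .format("iceberg")
    .mode("append")
    // The dataframe is already partitioned by event_name, so Iceberg's default hash
    // distribution (an extra shuffle immediately before the write) can be turned off.
    .option("distribution-mode", "none")
    .saveAsTable("my_catalog.my_schema.events")  // hypothetical table identifier
}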

8 files changed: +38 −31 lines changed

config/config.aws.reference.hocon

Lines changed: 6 additions & 5 deletions
@@ -127,6 +127,12 @@
 # "icebergTableProperties": {
 # "write.metadata.metrics.column.event_id": "count"
 # }
+#
+# # -- Any valid Iceberg write option
+# # -- This can be blank in most setups because the loader already sets sensible defaults.
+# "icebergWriteOptions": {
+# "write-format": "parquet"
+# }
 # }
 
 "bad": {
@@ -181,11 +187,6 @@
 # -- E.g. to change credentials provider
 "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider"
 }
-
-# -- Controls how many spark tasks run in parallel during writing the events to cloud storage.
-# -- E.g. If there are 8 available processors, and cpuParallelismFraction = 0.5, then we have 4 spark tasks for writing.
-# -- The default value is known to work well. Changing this setting might affect memory usage, file sizes, and/or latency.
-"writerParallelismFraction": 0.5
 }
 
 # Retry configuration for lake operation failures

config/config.azure.reference.hocon

Lines changed: 6 additions & 5 deletions
@@ -94,6 +94,12 @@
 # "icebergTableProperties": {
 # "write.metadata.metrics.column.event_id": "count"
 # }
+#
+# # -- Any valid Iceberg write option
+# # -- This can be blank in most setups because the loader already sets sensible defaults.
+# "icebergWriteOptions": {
+# "write-format": "parquet"
+# }
 # }
 
 "bad": {
@@ -145,11 +151,6 @@
 # -- E.g. to enable the spark ui for debugging:
 "spark.ui.enabled": true
 }
-
-# -- Controls how many spark tasks run in parallel during writing the events to cloud storage.
-# -- E.g. If there are 8 available processors, and cpuParallelismFraction = 0.5, then we have 4 spark tasks for writing.
-# -- The default value is known to work well. Changing this setting might affect memory usage, file sizes, and/or latency.
-"writerParallelismFraction": 0.5
 }
 
 # Retry configuration for lake operation failures

config/config.gcp.reference.hocon

Lines changed: 6 additions & 5 deletions
@@ -116,6 +116,12 @@
 # "icebergTableProperties": {
 # "write.metadata.metrics.column.event_id": "count"
 # }
+#
+# # -- Any valid Iceberg write option
+# # -- This can be blank in most setups because the loader already sets sensible defaults.
+# "icebergWriteOptions": {
+# "write-format": "parquet"
+# }
 # }
 
 "bad": {
@@ -160,11 +166,6 @@
 # -- E.g. to enable the spark ui for debugging:
 "spark.ui.enabled": true
 }
-
-# -- Controls how many spark tasks run in parallel during writing the events to cloud storage.
-# -- E.g. If there are 8 available processors, and cpuParallelismFraction = 0.5, then we have 4 spark tasks for writing.
-# -- The default value is known to work well. Changing this setting might affect memory usage, file sizes, and/or latency.
-"writerParallelismFraction": 0.5
 }
 
 # Retry configuration for lake operation failures

modules/core/src/main/resources/reference.conf

Lines changed: 7 additions & 1 deletion
@@ -41,6 +41,12 @@
 "write.metadata.metrics.column.true_tstamp": "full"
 }
 
+"icebergWriteOptions": {
+"merge-schema": "true"
+"check-ordering": "false"
+"distribution-mode": "none"
+}
+
 "hudiTableProperties": {
 "hoodie.table.name": "events"
 "hoodie.table.keygenerator.class": "org.apache.hudi.keygen.TimestampBasedKeyGenerator"
@@ -121,9 +127,9 @@
 "spark.sql.parquet.datetimeRebaseModeInWrite": "CORRECTED"
 "spark.memory.storageFraction": "0"
 "spark.databricks.delta.autoCompact.enabled": "false"
+"spark.scheduler.mode": "FAIR"
 }
 "gcpUserAgent": ${gcpUserAgent}
-"writerParallelismFraction": 0.5
 }
 
 "retries": {

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Config.scala

Lines changed: 3 additions & 3 deletions
@@ -74,7 +74,8 @@ object Config {
 table: String,
 catalog: IcebergCatalog,
 location: URI,
-icebergTableProperties: Map[String, String]
+icebergTableProperties: Map[String, String],
+icebergWriteOptions: Map[String, String]
 ) extends Target
 
 sealed trait IcebergCatalog
@@ -100,8 +101,7 @@
 case class Spark(
 taskRetries: Int,
 conf: Map[String, String],
-gcpUserAgent: GcpUserAgent,
-writerParallelismFraction: BigDecimal
+gcpUserAgent: GcpUserAgent
 )
 
 case class Metrics(

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/LakeWriter.scala

Lines changed: 7 additions & 9 deletions
@@ -81,7 +81,7 @@ object LakeWriter {
 }
 for {
 session <- SparkUtils.session[F](config, w, target.location)
-writerParallelism = chooseWriterParallelism(config)
+writerParallelism = chooseWriterParallelism()
 mutex1 <- Resource.eval(Mutex[F])
 mutex2 <- Resource.eval(Mutex[F])
 } yield impl(session, w, writerParallelism, mutex1, mutex2)
@@ -175,14 +175,12 @@
 }
 
 /**
-* Converts `writerParallelismFraction` into a suggested number of threads
+* Allow spark to parallelize over _most_ of the available processors for writing to the lake,
+* because this speeds up how quickly we can sink a batch.
 *
-* For bigger instances (more cores) we want more parallelism in the writer. This avoids a
-* situation where writing tasks exceed the length of a window, which causes an unbalanced use of
-* cpu.
+* But leave 1 processor always available, so that we are never blocked when trying to save one of
+* the intermediate dataframes.
 */
-private def chooseWriterParallelism(config: Config.Spark): Int =
-(Runtime.getRuntime.availableProcessors * config.writerParallelismFraction)
-.setScale(0, BigDecimal.RoundingMode.UP)
-.toInt
+private def chooseWriterParallelism(): Int =
+(Runtime.getRuntime.availableProcessors - 1).max(1)
 }

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/SparkUtils.scala

Lines changed: 2 additions & 1 deletion
@@ -18,7 +18,7 @@ import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.sql.functions.current_timestamp
+import org.apache.spark.sql.functions.{col, current_timestamp}
 import org.apache.spark.sql.types.StructType
 
 import com.snowplowanalytics.snowplow.lakes.Config
@@ -101,6 +101,7 @@ private[processing] object SparkUtils {
 spark
 .table(viewName)
 .withColumn("load_tstamp", current_timestamp())
+.repartition(col("event_name"))
 .coalesce(writerParallelism)
 .localCheckpoint()
 }

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/tables/IcebergWriter.scala

Lines changed: 1 addition & 2 deletions
@@ -61,8 +61,7 @@ class IcebergWriter(config: Config.Iceberg) extends Writer {
 df.write
 .format("iceberg")
 .mode("append")
-.option("merge-schema", true)
-.option("check-ordering", false)
+.options(config.icebergWriteOptions)
 .saveAsTable(fqTable)
 }
 