
Commit 0413351

Sort dataframe and disable localcheckpoint of final dataframe
1 parent 56146cb commit 0413351

3 files changed: +26 −26 lines changed

modules/core/src/main/resources/reference.conf

+1

@@ -128,6 +128,7 @@
       "spark.memory.storageFraction": "0"
       "spark.databricks.delta.autoCompact.enabled": "false"
       "spark.scheduler.mode": "FAIR"
+      "spark.sql.adaptive.enabled": "false"
     }
     "gcpUserAgent": ${gcpUserAgent}
   }
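
The new default above disables Spark's adaptive query execution, presumably so that AQE cannot re-plan the shuffle partitioning chosen for the final sorted DataFrame (see SparkUtils.scala below). A minimal sketch of checking the flag on a running session; the local-mode builder settings and object name here are illustrative, not the loader's real session setup:

import org.apache.spark.sql.SparkSession

object CheckAdaptiveDisabled {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("check-aqe")
      .config("spark.sql.adaptive.enabled", "false") // same default the commit adds to reference.conf
      .getOrCreate()

    // Prints "false": adaptive execution will not coalesce or re-split the final sort's partitions
    println(spark.conf.get("spark.sql.adaptive.enabled"))

    spark.stop()
  }
}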

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/LakeWriter.scala

+2 −14

@@ -81,10 +81,9 @@ object LakeWriter {
     }
     for {
       session <- SparkUtils.session[F](config, w, target.location)
-      writerParallelism = chooseWriterParallelism()
       mutex1 <- Resource.eval(Mutex[F])
       mutex2 <- Resource.eval(Mutex[F])
-    } yield impl(session, w, writerParallelism, mutex1, mutex2)
+    } yield impl(session, w, mutex1, mutex2)
   }

   def withHandledErrors[F[_]: Async](
@@ -142,7 +141,6 @@
   private def impl[F[_]: Sync](
     spark: SparkSession,
     w: Writer,
-    writerParallelism: Int,
     mutexForWriting: Mutex[F],
     mutexForUnioning: Mutex[F]
   ): LakeWriter[F] = new LakeWriter[F] {
@@ -165,22 +163,12 @@
     def commit(viewName: String): F[Unit] =
       for {
         df <- mutexForUnioning.lock.surround {
-                SparkUtils.prepareFinalDataFrame(spark, viewName, writerParallelism)
+                SparkUtils.prepareFinalDataFrame(spark, viewName)
              }
         _ <- mutexForWriting.lock
                .surround {
                  w.write(df)
                }
       } yield ()
   }
-
-  /**
-   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
-   * because this speeds up how quickly we can sink a batch.
-   *
-   * But leave 1 processor always available, so that we are never blocked when trying to save one of
-   * the intermediate dataframes.
-   */
-  private def chooseWriterParallelism(): Int =
-    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
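
For context, the commit keeps the two-mutex structure of commit while dropping the writerParallelism argument from impl. Below is a minimal cats-effect sketch of that pattern; the object name and the prepare/write placeholders are hypothetical stand-ins for preparing the final DataFrame and writing it to the lake, not the project's code:

import cats.effect.{IO, IOApp}
import cats.effect.std.Mutex

object TwoPhaseCommitSketch extends IOApp.Simple {

  // One mutex serializes the "prepare the final DataFrame" phase, the other serializes the write,
  // so those phases of different batches never interfere with each other.
  def commit(
    prepare: IO[String],
    write: String => IO[Unit],
    mutexForUnioning: Mutex[IO],
    mutexForWriting: Mutex[IO]
  ): IO[Unit] =
    for {
      df <- mutexForUnioning.lock.surround(prepare)  // only one fiber prepares at a time
      _ <- mutexForWriting.lock.surround(write(df))  // only one fiber writes at a time
    } yield ()

  def run: IO[Unit] =
    for {
      mutexForUnioning <- Mutex[IO]
      mutexForWriting <- Mutex[IO]
      _ <- commit(IO.pure("batch-1"), s => IO.println(s"writing $s"), mutexForUnioning, mutexForWriting)
    } yield ()
}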

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/SparkUtils.scala

+23 −12

@@ -18,7 +18,7 @@ import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger

 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.sql.functions.{col, current_timestamp}
+import org.apache.spark.sql.functions.current_timestamp
 import org.apache.spark.sql.types.StructType

 import com.snowplowanalytics.snowplow.lakes.Config
@@ -63,7 +63,8 @@ private[processing] object SparkUtils {
   private def sparkConfigOptions(config: Config.Spark, writer: Writer): Map[String, String] = {
     val gcpUserAgentKey = "fs.gs.storage.http.headers.user-agent"
     val gcpUserAgentValue = s"${config.gcpUserAgent.productName}/lake-loader (GPN:Snowplow;)"
-    writer.sparkConfig ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
+    val shuffleKey = "spark.sql.shuffle.partitions"
+    writer.sparkConfig + (shuffleKey -> chooseWriterParallelism().show) ++ config.conf + (gcpUserAgentKey -> gcpUserAgentValue)
   }

   def initializeLocalDataFrame[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
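
The merged option map above is built left to right: + and ++ share precedence in Scala, so the computed spark.sql.shuffle.partitions default can still be overridden by anything supplied in config.conf, and the GCP user-agent header is applied last. A small sketch with plain Maps (all names and values hypothetical) showing that override order:

object SparkConfMergeSketch {
  def main(args: Array[String]): Unit = {
    // All values below are made up; only the override order matters.
    val writerSparkConfig = Map("spark.sql.catalog.example" -> "delta")  // stand-in for writer.sparkConfig
    val userConf = Map("spark.sql.shuffle.partitions" -> "16")           // stand-in for config.conf
    val shuffleDefault = "spark.sql.shuffle.partitions" -> "3"           // stand-in for chooseWriterParallelism().show
    val gcpUserAgent = "fs.gs.storage.http.headers.user-agent" -> "example/lake-loader (GPN:Snowplow;)"

    // Same left-to-right shape as sparkConfigOptions above; later operands win on duplicate keys
    val merged = writerSparkConfig + shuffleDefault ++ userConf + gcpUserAgent

    println(merged("spark.sql.shuffle.partitions")) // prints 16: a user-supplied value still overrides the default
  }
}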

@@ -94,21 +95,31 @@

   def prepareFinalDataFrame[F[_]: Sync](
     spark: SparkSession,
-    viewName: String,
-    writerParallelism: Int
+    viewName: String
   ): F[DataFrame] =
-    Sync[F].blocking {
-      spark
-        .table(viewName)
-        .withColumn("load_tstamp", current_timestamp())
-        .repartition(col("event_name"))
-        .coalesce(writerParallelism)
-        .localCheckpoint()
-    }
+    Logger[F].debug(s"Analyzing final DataFrame $viewName") >>
+      Sync[F].blocking {
+        val ret = spark
+          .table(viewName)
+          .withColumn("load_tstamp", current_timestamp())
+          .sort("event_name")
+        ret.queryExecution.assertAnalyzed()
+        ret
+      } <* Logger[F].debug(s"Finished analyzing final DataFrame $viewName")

   def dropView[F[_]: Sync](spark: SparkSession, viewName: String): F[Unit] =
     Logger[F].info(s"Removing Spark data frame $viewName from local disk...") >>
       Sync[F].blocking {
         spark.catalog.dropTempView(viewName)
       }.void
+
+  /**
+   * Allow spark to parallelize over _most_ of the available processors for writing to the lake,
+   * because this speeds up how quickly we can sink a batch.
+   *
+   * But leave 1 processor always available, so that we are never blocked when trying to save one of
+   * the intermediate dataframes.
+   */
+  private def chooseWriterParallelism(): Int =
+    (Runtime.getRuntime.availableProcessors - 1).max(1)
 }
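
The final DataFrame is now globally sorted by event_name instead of being repartitioned, coalesced and locally checkpointed. With adaptive execution disabled, the sort's shuffle is planned with spark.sql.shuffle.partitions partitions, which appears to be why that setting is now pinned to chooseWriterParallelism() in sparkConfigOptions. A standalone sketch of the same transformation on toy data; the local session, object name and values are illustrative, not the loader's code:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.current_timestamp

object SortedFinalFrameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("sorted-final-frame")
      .config("spark.sql.adaptive.enabled", "false")
      .config("spark.sql.shuffle.partitions", "3") // stands in for chooseWriterParallelism()
      .getOrCreate()
    import spark.implicits._

    val events = Seq("page_view", "page_ping", "transaction", "page_view").toDF("event_name")

    // Same shape as the new prepareFinalDataFrame: a load_tstamp column plus a global sort,
    // with no repartition/coalesce/localCheckpoint step.
    val finalDf = events
      .withColumn("load_tstamp", current_timestamp())
      .sort("event_name")

    // The sort shuffles into at most spark.sql.shuffle.partitions partitions
    // (range partitioning may produce fewer for tiny inputs like this one).
    println(finalDf.rdd.getNumPartitions)
    finalDf.show(false)

    spark.stop()
  }
}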
