Skip to content

Commit e4a3e07

Browse files
committed
Parallelize cpu-intensive processing
This allows the loader to better utilize all cpu available on a larger instance. Parallelism is configured by a new config parameter `cpuParallelismFraction`. The actual parallelism is chosen dynamically based on the number of available CPU, so the default value should be appropriate for all sized VMs.
1 parent e65e9d8 commit e4a3e07

File tree

11 files changed

+71
-44
lines changed

11 files changed

+71
-44
lines changed

config/config.azure.reference.hocon

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@
7979
"uploadConcurrency": 1
8080
}
8181

82+
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
83+
# -- E.g. If there are 4 available processors, and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
84+
# -- Adjusting this value can cause the app to use more or less of the available CPU.
85+
"cpuParallelismFraction": 0.75
86+
8287
# Retry configuration for Snowflake operation failures
8388
"retries": {
8489

config/config.kinesis.reference.hocon

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@
9696
"uploadConcurrency": 1
9797
}
9898

99+
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
100+
# -- E.g. If there are 4 available processors, and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
101+
# -- Adjusting this value can cause the app to use more or less of the available CPU.
102+
"cpuParallelismFraction": 0.75
103+
99104
# Retry configuration for Snowflake operation failures
100105
"retries": {
101106

config/config.pubsub.reference.hocon

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@
8585
"uploadConcurrency": 1
8686
}
8787

88+
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
89+
# -- E.g. If there are 4 available processors, and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
90+
# -- Adjusting this value can cause the app to use more or less of the available CPU.
91+
"cpuParallelismFraction": 0.75
92+
8893
# Retry configuration for Snowflake operation failures
8994
"retries": {
9095

modules/core/src/main/resources/reference.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"maxDelay": "1 second"
2424
"uploadConcurrency": 3
2525
}
26+
"cpuParallelismFraction": 0.75
2627

2728
"retries": {
2829
"setupErrors": {

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ case class Config[+Source, +Sink](
2929
input: Source,
3030
output: Config.Output[Sink],
3131
batching: Config.Batching,
32+
cpuParallelismFraction: BigDecimal,
3233
retries: Config.Retries,
3334
skipSchemas: List[SchemaCriterion],
3435
telemetry: Telemetry.Config,

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ import com.snowplowanalytics.snowplow.snowflake.processing.{Channel, TableManage
1919
import com.snowplowanalytics.snowplow.sources.SourceAndAck
2020
import org.http4s.client.Client
2121

22+
/**
23+
* Resources and runtime-derived configuration needed for processing events
24+
*
25+
* @param cpuParallelism
26+
* The processing Pipe involves several steps, some of which are cpu-intensive. We run
27+
* cpu-intensive steps in parallel, so that on big instances we can take advantage of all cores.
28+
* For each of those cpu-intensive steps, `cpuParallelism` controls the parallelism of that step.
29+
*
30+
* Other params are self-explanatory
31+
*/
2232
case class Environment[F[_]](
2333
appInfo: AppInfo,
2434
source: SourceAndAck[F],
@@ -29,6 +39,7 @@ case class Environment[F[_]](
2939
metrics: Metrics[F],
3040
appHealth: AppHealth.Interface[F, Alert, RuntimeService],
3141
batching: Config.Batching,
42+
cpuParallelism: Int,
3243
schemasToSkip: List[SchemaCriterion],
3344
badRowMaxSize: Int
3445
)
@@ -54,17 +65,30 @@ object Environment {
5465
tableManager <- Resource.eval(TableManager.make(config.output.good, appHealth, config.retries))
5566
channelOpener <- Channel.opener(config.output.good, config.batching, config.retries, appHealth)
5667
channelProvider <- Channel.provider(channelOpener, config.retries, appHealth)
68+
cpuParallelism = chooseCpuParallelism(config)
5769
} yield Environment(
58-
appInfo = appInfo,
59-
source = sourceAndAck,
60-
badSink = badSink,
61-
httpClient = httpClient,
62-
tableManager = tableManager,
63-
channel = channelProvider,
64-
metrics = metrics,
65-
appHealth = appHealth,
66-
batching = config.batching,
67-
schemasToSkip = config.skipSchemas,
68-
badRowMaxSize = config.output.bad.maxRecordSize
70+
appInfo = appInfo,
71+
source = sourceAndAck,
72+
badSink = badSink,
73+
httpClient = httpClient,
74+
tableManager = tableManager,
75+
channel = channelProvider,
76+
metrics = metrics,
77+
appHealth = appHealth,
78+
batching = config.batching,
79+
cpuParallelism = cpuParallelism,
80+
schemasToSkip = config.skipSchemas,
81+
badRowMaxSize = config.output.bad.maxRecordSize
6982
)
83+
84+
/**
85+
* See the description of `cpuParallelism` on the [[Environment]] class
86+
*
87+
* For bigger instances (more cores) we want more parallelism, so that cpu-intensive steps can
88+
* take advantage of all the cores.
89+
*/
90+
private def chooseCpuParallelism(config: Config[Any, Any]): Int =
91+
(Runtime.getRuntime.availableProcessors * config.cpuParallelismFraction)
92+
.setScale(0, BigDecimal.RoundingMode.UP)
93+
.toInt
7094
}

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/Processing.scala

Lines changed: 10 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,6 @@ object Processing {
4747

4848
/** Model used between stages of the processing pipeline */
4949

50-
private case class ParsedBatch(
51-
events: List[Event],
52-
parseFailures: List[BadRow],
53-
countBytes: Long,
54-
countItems: Int,
55-
token: Unique.Token
56-
)
57-
5850
private case class TransformedBatch(
5951
events: List[EventWithTransform],
6052
parseFailures: List[BadRow],
@@ -128,8 +120,7 @@ object Processing {
128120
val badProcessor = BadRowProcessor(env.appInfo.name, env.appInfo.version)
129121

130122
in.through(setLatency(env.metrics))
131-
.through(parseBytes(badProcessor))
132-
.through(transform(badProcessor, env.schemasToSkip))
123+
.through(parseAndTransform(env, badProcessor))
133124
.through(BatchUp.withTimeout(env.batching.maxBytes, env.batching.maxDelay))
134125
.through(writeToSnowflake(env, badProcessor))
135126
.through(sendFailedEvents(env, badProcessor))
@@ -152,8 +143,8 @@ object Processing {
152143
}
153144

154145
/** Parse raw bytes into Event using analytics sdk */
155-
private def parseBytes[F[_]: Sync](badProcessor: BadRowProcessor): Pipe[F, TokenedEvents, ParsedBatch] =
156-
_.evalMap { case TokenedEvents(chunk, token, _) =>
146+
private def parseAndTransform[F[_]: Async](env: Environment[F], badProcessor: BadRowProcessor): Pipe[F, TokenedEvents, TransformedBatch] =
147+
_.parEvalMap(env.cpuParallelism) { case TokenedEvents(chunk, token, _) =>
157148
for {
158149
numBytes <- Sync[F].delay(Foldable[Chunk].sumBytes(chunk))
159150
(badRows, events) <- Foldable[Chunk].traverseSeparateUnordered(chunk) { bytes =>
@@ -164,29 +155,20 @@ object Processing {
164155
}
165156
}
166157
}
167-
} yield ParsedBatch(events, badRows, numBytes, chunk.size, token)
168-
}
169-
170-
/** Transform the Event into values compatible with the snowflake ingest sdk */
171-
private def transform[F[_]: Sync](
172-
badProcessor: BadRowProcessor,
173-
schemasToSkip: List[SchemaCriterion]
174-
): Pipe[F, ParsedBatch, TransformedBatch] =
175-
_.evalMap { batch =>
176-
Sync[F].realTimeInstant.flatMap { now =>
177-
val loadTstamp = SnowflakeCaster.timestampValue(now)
178-
transformBatch[F](badProcessor, loadTstamp, batch, schemasToSkip)
179-
}
158+
now <- Sync[F].realTimeInstant
159+
loadTstamp = SnowflakeCaster.timestampValue(now)
160+
(transformBad, transformed) <- transformBatch(badProcessor, loadTstamp, events, env.schemasToSkip)
161+
} yield TransformedBatch(transformed, transformBad, badRows, numBytes, chunk.size, token)
180162
}
181163

182164
private def transformBatch[F[_]: Sync](
183165
badProcessor: BadRowProcessor,
184166
loadTstamp: OffsetDateTime,
185-
batch: ParsedBatch,
167+
events: List[Event],
186168
schemasToSkip: List[SchemaCriterion]
187-
): F[TransformedBatch] =
169+
): F[(List[BadRow], List[EventWithTransform])] =
188170
Foldable[List]
189-
.traverseSeparateUnordered(batch.events) { event =>
171+
.traverseSeparateUnordered(events) { event =>
190172
Sync[F].delay {
191173
Transform
192174
.transformEventUnstructured[AnyRef](badProcessor, SnowflakeCaster, SnowflakeJsonFolder, event, schemasToSkip)
@@ -201,9 +183,6 @@ object Processing {
201183
}
202184
}
203185
}
204-
.map { case (transformFailures, eventsWithTransforms) =>
205-
TransformedBatch(eventsWithTransforms, batch.parseFailures, transformFailures, batch.countBytes, batch.countItems, batch.token)
206-
}
207186

208187
private def writeToSnowflake[F[_]: Async](
209188
env: Environment[F],

modules/core/src/test/scala/com.snowplowanalytics.snowplow.snowflake/MockEnvironment.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ object MockEnvironment {
6464
maxDelay = 10.seconds,
6565
uploadConcurrency = 1
6666
),
67-
schemasToSkip = List.empty,
68-
badRowMaxSize = 1000000
67+
cpuParallelism = 2,
68+
schemasToSkip = List.empty,
69+
badRowMaxSize = 1000000
6970
)
7071
MockEnvironment(state, env)
7172
}

modules/kafka/src/test/scala/com/snowplowanalytics/snowplow/snowflake/KafkaConfigSpec.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ object KafkaConfigSpec {
109109
maxDelay = 1.second,
110110
uploadConcurrency = 3
111111
),
112+
cpuParallelismFraction = BigDecimal(0.75),
112113
retries = Config.Retries(
113114
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
114115
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)
@@ -189,6 +190,7 @@ object KafkaConfigSpec {
189190
maxDelay = 1.second,
190191
uploadConcurrency = 1
191192
),
193+
cpuParallelismFraction = BigDecimal(0.75),
192194
retries = Config.Retries(
193195
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
194196
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)

modules/kinesis/src/test/scala/com/snowplowanalytics/snowplow/snowflake/KinesisConfigSpec.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ object KinesisConfigSpec {
105105
maxDelay = 1.second,
106106
uploadConcurrency = 3
107107
),
108+
cpuParallelismFraction = BigDecimal(0.75),
108109
retries = Config.Retries(
109110
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
110111
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)
@@ -180,6 +181,7 @@ object KinesisConfigSpec {
180181
maxDelay = 1.second,
181182
uploadConcurrency = 1
182183
),
184+
cpuParallelismFraction = BigDecimal(0.75),
183185
retries = Config.Retries(
184186
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
185187
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)

modules/pubsub/src/test/scala/com/snowplowanalytics/snowplow/snowflake/PubsubConfigSpec.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ object PubsubConfigSpec {
105105
maxDelay = 1.second,
106106
uploadConcurrency = 3
107107
),
108+
cpuParallelismFraction = BigDecimal(0.75),
108109
retries = Config.Retries(
109110
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
110111
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)
@@ -179,6 +180,7 @@ object PubsubConfigSpec {
179180
maxDelay = 1.second,
180181
uploadConcurrency = 1
181182
),
183+
cpuParallelismFraction = BigDecimal(0.75),
182184
retries = Config.Retries(
183185
setupErrors = Retrying.Config.ForSetup(delay = 30.seconds),
184186
transientErrors = Retrying.Config.ForTransient(delay = 1.second, attempts = 5)

0 commit comments

Comments
 (0)