Commit ad34bc6

update for common libraries to 0.1.0
1 parent 798b621 commit ad34bc6

13 files changed: +102, -184 lines

config/config.kinesis.minimal.hocon

Lines changed: 3 additions & 2 deletions
@@ -13,7 +13,8 @@
       "schema": "atomic"
     }
 
-    "bad": null # TODO need kinesis sink
-
+    "bad": {
+      "streamName": "bad"
+    }
   }
 }

config/config.kinesis.reference.hocon

Lines changed: 31 additions & 5 deletions
@@ -6,19 +6,30 @@
     # -- name to use for the KCL dynamodb table
     "appName": "snowplow-snowflake-loader"
 
-    # -- TODO explain the options
+    # -- From where the app should start consuming if this is the first time it is run.
+    # -- On subsequent runs, it will always resume from where it last checkpointed.
     "initialPosition": {
+      # -- Options are `TRIM_HORIZON` for the oldest available events, `LATEST` for latest events,
+      # -- or `AT_TIMESTAMP` to start consuming from events written at a particular time.
       "type": "TRIM_HORIZON"
+
+      # -- Only required if `initialPosition.type` is AT_TIMESTAMP
+      "timestamp": "2020-07-17T10:00:00Z" # Required for AT_TIMESTAMP
     }
 
-    # -- TODO explain the options
+    # -- How the underlying Kinesis client should fetch events from the stream
     "retrievalMode": {
+      # -- Options are "Polling" for the client to poll Kinesis for more events when needed
+      # -- or "FanOut" to enable Kinesis's Enhanced Fan Out feature using HTTP/2
       "type": "Polling"
+
+      # -- Only used if retrieval mode is type Polling. How many events the client may fetch in a single poll.
       "maxRecords": 1000
     }
 
-    # -- TODO explain what this is
-    "bufferSize": 3
+    # -- The number of batches of events which are pre-fetched from kinesis.
+    # -- Increasing this above 1 is not known to improve performance.
+    "bufferSize": 1
 
   }
 
@@ -58,7 +69,22 @@
       "jdbcQueryTimeout": "60 seconds"
     }
 
-    "bad": null # TODO need kinesis sink for failed events
+    "bad": {
+      # -- output kinesis stream for emitting failed events that could not be processed
+      "streamName": "bad"
+
+      # -- how to retry sending failed events if we exceed the kinesis write throughput limits
+      "throttledBackoffPolicy": {
+        "minBackoff": "100 milliseconds"
+        "maxBackoff": "1 second"
+      }
+
+      # -- the maximum number of records we are allowed to send to Kinesis in 1 PutRecords request
+      "recordLimit": 500
+
+      # -- the maximum number of bytes we are allowed to send to Kinesis in 1 PutRecords request
+      "byteLimit": 5242880
+    }
 
   }
 
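The new `throttledBackoffPolicy` block bounds how long the loader waits between retries when Kinesis throttles a `PutRecords` call. The real retry schedule lives in the common Kinesis sink; as a rough sketch only, assuming a simple exponential policy clamped between `minBackoff` and `maxBackoff` (an assumption, not the library's documented behaviour):

    import scala.concurrent.duration._

    // Hypothetical helper: the delay grows exponentially from minBackoff, doubling
    // on each attempt, and is capped at maxBackoff. Defaults mirror the reference config.
    def throttledDelay(
      attempt: Int,
      minBackoff: FiniteDuration = 100.millis,
      maxBackoff: FiniteDuration = 1.second
    ): FiniteDuration = {
      val exponential = minBackoff * math.pow(2.0, (attempt - 1).toDouble).toLong
      exponential.min(maxBackoff)
    }

    // throttledDelay(1) == 100.millis, throttledDelay(4) == 800.millis, throttledDelay(5) == 1.second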

config/config.pubsub.reference.hocon

Lines changed: 2 additions & 4 deletions
@@ -60,12 +60,10 @@
     # -- output pubsub topic for emitting failed events that could not be processed
     "topic": "projects/myproject/topics/snowplow-bad"
 
-    # -- bad events are held in memory until we accumulate this batch size, and then sent to pubsub
+    # -- Failed events are sent to pubsub in batches not exceeding this size.
     "batchSize": 100
-    # -- bad events are held in memory until we accumulate this total byte count, and then sent to pubsub
+    # -- Failed events are sent to pubsub in batches not exceeding this number of bytes.
     "requestByteThreshold": 1000000
-    # -- pending bad events are flushed to pubsub after this delay, regardless of whether we reached the max batch size or byte count
-    "delayThreshold": "100 millis"
   }
 
 }
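The rewritten comments describe purely size-based batching: failed events go to pubsub in requests of at most `batchSize` events and at most `requestByteThreshold` bytes, and the time-based `delayThreshold` option is gone. A minimal, library-agnostic sketch of that grouping rule (names and types are illustrative, not the sink's actual API):

    // Hypothetical: split serialized payloads into batches of at most maxCount
    // items and at most maxBytes total bytes per batch.
    def batchBySize(
      payloads: List[Array[Byte]],
      maxCount: Int = 100,      // "batchSize" above
      maxBytes: Long = 1000000L // "requestByteThreshold" above
    ): List[List[Array[Byte]]] =
      payloads
        .foldLeft(List.empty[(List[Array[Byte]], Int, Long)]) {
          case (Nil, p) =>
            List((List(p), 1, p.length.toLong))
          case ((batch, count, bytes) :: rest, p) if count < maxCount && bytes + p.length <= maxBytes =>
            (p :: batch, count + 1, bytes + p.length) :: rest
          case (acc, p) =>
            (List(p), 1, p.length.toLong) :: acc
        }
        .map { case (batch, _, _) => batch.reverse }
        .reverse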

modules/core/src/main/resources/reference.conf

Lines changed: 2 additions & 10 deletions
@@ -18,10 +18,8 @@
 
   "monitoring": {
     "metrics": {
+      "statsd": ${snowplow.defaults.statsd}
       "statsd": {
-        "port": 8125,
-        "tags": {}
-        "period": "1 minute"
         "prefix": "snowplow.snowflake-loader"
       }
     }
@@ -35,11 +33,5 @@
     }
   }
 
-  "telemetry": {
-    "disable": false
-    "interval": "15 minutes"
-    "collectorUri": "collector-g.snowplowanalytics.com"
-    "collectorPort": 443
-    "secure": true
-  }
+  "telemetry": ${snowplow.defaults.telemetry}
 }
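Both substitutions point at defaults that ship with the 0.1.0 common runtime: `${snowplow.defaults.statsd}` and `${snowplow.defaults.telemetry}` are resolved from that library's own reference.conf, and the second `"statsd"` object merges over the substituted one, so only the loader-specific `prefix` remains here. A small sketch of how HOCON resolves this, using the Typesafe Config library directly (the inlined defaults below simply mirror the values deleted above; whether they match the common library's real defaults is an assumption):

    import com.typesafe.config.ConfigFactory

    // Stand-in for the common library's reference.conf
    val defaults = ConfigFactory.parseString(
      """snowplow.defaults.statsd { port: 8125, tags: {}, period: "1 minute" }"""
    )

    // The application's reference.conf after this commit (abridged)
    val app = ConfigFactory.parseString(
      """
      monitoring.metrics.statsd: ${snowplow.defaults.statsd}
      monitoring.metrics.statsd: { prefix: "snowplow.snowflake-loader" }
      """
    )

    // Duplicate object keys merge in HOCON, so the resolved statsd block keeps
    // the defaults and adds the loader-specific prefix on top.
    val resolved = app.withFallback(defaults).resolve()
    resolved.getInt("monitoring.metrics.statsd.port")      // 8125
    resolved.getString("monitoring.metrics.statsd.prefix") // snowplow.snowflake-loader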

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala

Lines changed: 1 addition & 23 deletions
@@ -19,6 +19,7 @@ import scala.concurrent.duration.FiniteDuration
 import scala.util.Try
 
 import com.snowplowanalytics.snowplow.runtime.{Metrics => CommonMetrics, Telemetry}
+import com.snowplowanalytics.snowplow.runtime.HealthProbe.decoders._
 
 case class Config[+Source, +Sink](
   input: Source,
@@ -57,25 +58,6 @@ object Config {
     statsd: Option[CommonMetrics.StatsdConfig]
   )
 
-  private case class StatsdUnresolved(
-    hostname: Option[String],
-    port: Int,
-    tags: Map[String, String],
-    period: FiniteDuration,
-    prefix: String
-  )
-
-  private object Statsd {
-
-    def resolve(statsd: StatsdUnresolved): Option[CommonMetrics.StatsdConfig] =
-      statsd match {
-        case StatsdUnresolved(Some(hostname), port, tags, period, prefix) =>
-          Some(CommonMetrics.StatsdConfig(hostname, port, tags, period, prefix))
-        case StatsdUnresolved(None, _, _, _, _) =>
-          None
-      }
-  }
-
   case class SentryM[M[_]](
     dsn: M[String],
     tags: Map[String, String]
@@ -100,7 +82,6 @@ object Config {
     implicit val output = deriveConfiguredDecoder[Output[Sink]]
     implicit val batching = deriveConfiguredDecoder[Batching]
     implicit val telemetry = deriveConfiguredDecoder[Telemetry.Config]
-    implicit val statsdDecoder = deriveConfiguredDecoder[StatsdUnresolved].map(Statsd.resolve(_))
     implicit val sentryDecoder = deriveConfiguredDecoder[SentryM[Option]]
       .map[Option[Sentry]] {
         case SentryM(Some(dsn), tags) =>
@@ -109,9 +90,6 @@
           None
       }
     implicit val metricsDecoder = deriveConfiguredDecoder[Metrics]
-    implicit val portDecoder = Decoder.decodeInt.emap { port =>
-      Port.fromInt(port).toRight("Invalid port")
-    }
     implicit val healthProbeDecoder = deriveConfiguredDecoder[HealthProbe]
     implicit val monitoringDecoder = deriveConfiguredDecoder[Monitoring]
     deriveConfiguredDecoder[Config[Source, Sink]]
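With the `StatsdUnresolved` shim and the hand-written `Port` decoder gone, those decoders now come in via the new `HealthProbe.decoders._` import from the common runtime. The decoders that remain in this file are still semi-automatically derived with circe-generic-extras; for reference, the derivation pattern looks like this self-contained sketch (the `Batching` fields are illustrative, not the loader's exact case class):

    object DecoderSketch {
      import io.circe.Decoder
      import io.circe.generic.extras.Configuration
      import io.circe.generic.extras.semiauto.deriveConfiguredDecoder
      import io.circe.parser.decode

      // Illustrative shape only; the loader's real case classes live in Config.scala
      case class Batching(maxBytes: Long, uploadConcurrency: Int)

      // deriveConfiguredDecoder requires an implicit Configuration, which controls
      // field naming, handling of defaults, etc.
      implicit val circeConfig: Configuration = Configuration.default
      implicit val batchingDecoder: Decoder[Batching] = deriveConfiguredDecoder[Batching]

      val parsed = decode[Batching]("""{ "maxBytes": 16000000, "uploadConcurrency": 3 }""")
      // parsed == Right(Batching(16000000, 3))
    }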

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/package.scala

Lines changed: 0 additions & 12 deletions
This file was deleted.

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/Processing.scala

Lines changed: 49 additions & 90 deletions
@@ -8,24 +8,25 @@
 package com.snowplowanalytics.snowplow.snowflake.processing
 
 import cats.implicits._
-import cats.{Applicative, Foldable, Monad, Semigroup}
+import cats.{Applicative, Foldable, Monad}
 import cats.effect.{Async, Sync}
 import cats.effect.kernel.Unique
-import fs2.{Pipe, Pull, Stream}
+import fs2.{Chunk, Pipe, Stream}
 import net.snowflake.ingest.utils.{ErrorCode, SFException}
 import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 
 import java.nio.charset.StandardCharsets
 import java.time.OffsetDateTime
-import scala.concurrent.duration.Duration
 
 import com.snowplowanalytics.iglu.schemaddl.parquet.Caster
 import com.snowplowanalytics.snowplow.analytics.scalasdk.Event
 import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload => BadPayload, Processor => BadRowProcessor}
 import com.snowplowanalytics.snowplow.badrows.Payload.{RawPayload => BadRowRawPayload}
 import com.snowplowanalytics.snowplow.sources.{EventProcessingConfig, EventProcessor, TokenedEvents}
-import com.snowplowanalytics.snowplow.snowflake.{Config, Environment, Metrics}
+import com.snowplowanalytics.snowplow.snowflake.{Environment, Metrics}
+import com.snowplowanalytics.snowplow.runtime.syntax.foldable._
+import com.snowplowanalytics.snowplow.runtime.processing.BatchUp
 import com.snowplowanalytics.snowplow.loaders.transform.Transform
 
 object Processing {
@@ -69,7 +70,7 @@ object Processing {
     origBatchBytes: Long,
     badAccumulated: List[BadRow],
     countInserted: Int,
-    tokens: List[Unique.Token]
+    tokens: Vector[Unique.Token]
   )
 
   /**
@@ -110,7 +111,7 @@ object Processing {
       in.through(setLatency(env.metrics))
         .through(parseBytes(badProcessor))
         .through(transform(badProcessor))
-        .through(batchUp(env.batching))
+        .through(BatchUp.withTimeout(env.batching.maxBytes, env.batching.maxDelay))
         .through(writeToSnowflake(env, badProcessor))
        .through(sendFailedEvents(env))
        .through(sendMetrics(env))
@@ -132,58 +133,55 @@ object Processing {
   }
 
   /** Parse raw bytes into Event using analytics sdk */
-  private def parseBytes[F[_]: Monad](badProcessor: BadRowProcessor): Pipe[F, TokenedEvents, ParsedBatch] =
-    _.evalMap { case TokenedEvents(list, token, _) =>
-      Foldable[List].foldM(list, ParsedBatch(Nil, Nil, 0L, token)) { case (acc, bytes) =>
-        Applicative[F].pure {
-          val bytesSize = bytes.capacity
-          val stringified = StandardCharsets.UTF_8.decode(bytes).toString
-          Event.parse(stringified).toEither match {
-            case Right(e) =>
-              acc.copy(events = e :: acc.events, countBytes = acc.countBytes + bytesSize)
-            case Left(failure) =>
-              val payload = BadRowRawPayload(stringified)
-              val bad = BadRow.LoaderParsingError(badProcessor, failure, payload)
-              acc.copy(bad = bad :: acc.bad, countBytes = acc.countBytes + bytesSize)
-          }
-        }
-      }
+  private def parseBytes[F[_]: Sync](badProcessor: BadRowProcessor): Pipe[F, TokenedEvents, ParsedBatch] =
+    _.evalMap { case TokenedEvents(chunk, token, _) =>
+      for {
+        numBytes <- Sync[F].delay(Foldable[Chunk].sumBytes(chunk))
+        (badRows, events) <- Foldable[Chunk].traverseSeparateUnordered(chunk) { bytes =>
+                               Sync[F].delay {
+                                 val stringified = StandardCharsets.UTF_8.decode(bytes).toString
+                                 Event.parse(stringified).toEither.leftMap { case failure =>
+                                   val payload = BadRowRawPayload(stringified)
+                                   BadRow.LoaderParsingError(badProcessor, failure, payload)
+                                 }
+                               }
+                             }
+      } yield ParsedBatch(events, badRows, numBytes, token)
     }
 
   /** Transform the Event into values compatible with the snowflake ingest sdk */
   private def transform[F[_]: Sync](badProcessor: BadRowProcessor): Pipe[F, ParsedBatch, BatchAfterTransform] =
-    in =>
-      for {
-        ParsedBatch(events, bad, bytes, token) <- in
-        loadTstamp <- Stream.eval(Sync[F].realTimeInstant).map(SnowflakeCaster.timestampValue)
-        result <- Stream.eval(transformBatch[F](badProcessor, events, loadTstamp))
-        (moreBad, transformed) = result.separate
-      } yield BatchAfterTransform(
-        toBeInserted = transformed.toVector,
-        origBatchBytes = bytes,
-        badAccumulated = bad ::: moreBad,
-        countInserted = 0,
-        tokens = List(token)
-      )
+    _.evalMap { batch =>
+      Sync[F].realTimeInstant.flatMap { now =>
+        val loadTstamp = SnowflakeCaster.timestampValue(now)
+        transformBatch[F](badProcessor, loadTstamp, batch)
+      }
+    }
 
-  private def transformBatch[F[_]: Monad](
+  private def transformBatch[F[_]: Sync](
     badProcessor: BadRowProcessor,
-    events: List[Event],
-    loadTstamp: OffsetDateTime
-  ): F[List[Either[BadRow, (Event, Map[String, AnyRef])]]] =
-    events
-      .traverse { e =>
-        Applicative[F].pure {
+    loadTstamp: OffsetDateTime,
+    batch: ParsedBatch
+  ): F[BatchAfterTransform] =
+    Foldable[List]
+      .traverseSeparateUnordered(batch.events) { event =>
+        Sync[F].delay {
          Transform
-            .transformEventUnstructured[AnyRef](badProcessor, SnowflakeCaster, SnowflakeJsonFolder, e)
+            .transformEventUnstructured[AnyRef](badProcessor, SnowflakeCaster, SnowflakeJsonFolder, event)
            .map { namedValues =>
-              val asMap = namedValues.map { case Caster.NamedValue(k, v) =>
-                k -> v
-              }.toMap
-              (e, asMap + ("load_tstamp" -> loadTstamp))
+              val map = namedValues
+                .map { case Caster.NamedValue(k, v) =>
+                  k -> v
+                }
+                .toMap
+                .updated("load_tstamp", loadTstamp)
+              event -> map
            }
        }
      }
+      .map { case (badRows, eventsWithTransforms) =>
+        BatchAfterTransform(eventsWithTransforms.toVector, batch.countBytes, badRows ::: batch.bad, 0, Vector(batch.token))
+      }
 
   private def writeToSnowflake[F[_]: Async](
     env: Environment[F],
@@ -342,57 +340,18 @@
 
   private def fastGetByIndex[A](items: Vector[A], index: Long): A = items(index.toInt)
 
-  private implicit def batchedSemigroup: Semigroup[BatchAfterTransform] = new Semigroup[BatchAfterTransform] {
+  private implicit def batchable: BatchUp.Batchable[BatchAfterTransform] = new BatchUp.Batchable[BatchAfterTransform] {
     def combine(x: BatchAfterTransform, y: BatchAfterTransform): BatchAfterTransform =
       BatchAfterTransform(
        toBeInserted = x.toBeInserted ++ y.toBeInserted,
        origBatchBytes = x.origBatchBytes + y.origBatchBytes,
        badAccumulated = x.badAccumulated ::: y.badAccumulated,
        countInserted = x.countInserted + y.countInserted,
-        tokens = x.tokens ::: y.tokens
+        tokens = x.tokens ++ y.tokens
      )
-  }
 
-  private def batchUp[F[_]: Async](config: Config.Batching): Pipe[F, BatchAfterTransform, BatchAfterTransform] = {
-    def go(
-      timedPull: Pull.Timed[F, BatchAfterTransform],
-      unflushed: Option[BatchAfterTransform]
-    ): Pull[F, BatchAfterTransform, Unit] =
-      timedPull.uncons.flatMap {
-        case None => // Upstream stream has finished cleanly
-          unflushed match {
-            case None => Pull.done
-            case Some(b) => Pull.output1(b) *> Pull.done
-          }
-        case Some((Left(_), next)) => // The timer we set has timed out.
-          unflushed match {
-            case None => go(next, None)
-            case Some(b) => Pull.output1(b) >> go(next, None)
-          }
-        case Some((Right(pulled), next)) if pulled.isEmpty =>
-          go(next, unflushed)
-        case Some((Right(nonEmptyChunk), next)) => // Received another batch before the timer timed out
-          val combined = unflushed match {
-            case None => nonEmptyChunk.iterator.reduce(_ |+| _)
-            case Some(b) => nonEmptyChunk.iterator.foldLeft(b)(_ |+| _)
-          }
-          if (combined.origBatchBytes > config.maxBytes)
-            for {
-              _ <- Pull.output1(combined)
-              _ <- next.timeout(Duration.Zero)
-              _ <- go(next, None)
-            } yield ()
-          else {
-            for {
-              _ <- if (unflushed.isEmpty) next.timeout(config.maxDelay) else Pull.pure(())
-              _ <- go(next, Some(combined))
-            } yield ()
-          }
-      }
-    in =>
-      in.pull.timed { timedPull =>
-        go(timedPull, None)
-      }.stream
+    def weightOf(a: BatchAfterTransform): Long =
+      a.origBatchBytes
   }
 
 }
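Most of this diff swaps hand-rolled plumbing for helpers from the 0.1.0 common runtime: `BatchUp.withTimeout` together with the `BatchUp.Batchable` instance (its `combine` and `weightOf`) replaces the ~40-line `Pull.Timed` batching, and `Foldable[Chunk].traverseSeparateUnordered` replaces the manual `foldM` that accumulated bad rows and parsed events side by side. Assuming the latter behaves like an effectful traverse whose `Left`/`Right` results are separated, without promising to preserve input order, the parse step can be approximated with stock cats and fs2 as in this sketch, where a trivial `nonEmpty` check stands in for real Event parsing:

    import cats.effect.Sync
    import cats.implicits._
    import fs2.Chunk
    import java.nio.ByteBuffer
    import java.nio.charset.StandardCharsets

    // Stand-in result types so the sketch is self-contained
    final case class ParsedEvent(tsv: String)
    final case class ParseFailure(message: String, payload: String)

    // Roughly what traverseSeparateUnordered provides, written with plain
    // traverse + separate (this version keeps input order and allocates an
    // intermediate list of Eithers, which the unordered helper can avoid).
    def parseChunk[F[_]: Sync](chunk: Chunk[ByteBuffer]): F[(List[ParseFailure], List[ParsedEvent])] =
      chunk.toList
        .traverse { bytes =>
          Sync[F].delay {
            val stringified = StandardCharsets.UTF_8.decode(bytes).toString
            Either.cond(
              stringified.nonEmpty,
              ParsedEvent(stringified),
              ParseFailure("empty payload", stringified)
            )
          }
        }
        .map(_.separate)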
