Commit 5ca2239

common-streams 0.10.0
Common streams 0.10.0 brings significant changes to the Kinesis and Pubsub sources:

- PubSub source completely re-written to be a wrapper around UnaryPull snowplow-incubator/common-streams#101
- Kinesis source is more efficient when the stream is re-sharded snowplow-incubator/common-streams#102
- Kinesis source better tuned for larger deployments snowplow-incubator/common-streams#99

And improvements to latency metrics:

- Sources should report stream latency of stuck events snowplow-incubator/common-streams#104
1 parent cb3176e commit 5ca2239

File tree: 8 files changed (+50 −16 lines)


config/config.aws.reference.hocon (+11)

@@ -39,6 +39,17 @@

       # -- Duration of shard leases. KCL workers must periodically refresh leases in the dynamodb table before this duration expires.
       "leaseDuration": "10 seconds"
+
+      # -- Controls how to pick the max number of leases to steal at one time.
+      # -- E.g. If there are 4 available processors, and maxLeasesToStealAtOneTimeFactor = 2.0, then allow the KCL to steal up to 8 leases.
+      # -- Allows bigger instances to more quickly acquire the shard-leases they need to combat latency.
+      "maxLeasesToStealAtOneTimeFactor": 2.0
+
+      # -- Configures how to backoff and retry in case of DynamoDB provisioned throughput limits.
+      "checkpointThrottledBackoffPolicy": {
+        "minBackoff": "100 millis"
+        "maxBackoff": "1 second"
+      }
     }

     "output": {

config/config.gcp.reference.hocon (+16 −9)

@@ -14,15 +14,22 @@
       # -- The number of threads is equal to this factor multiplied by the number of available cpu cores
       "parallelPullFactor": 0.5

-      # -- How many bytes can be buffered by the loader app before blocking the pubsub client library
-      # -- from fetching more events.
-      # -- This is a balance between memory usage vs how efficiently the app can operate. The default value works well.
-      "bufferMaxBytes": 10000000
-
-      # -- Sets min/max boundaries on the value by which an ack deadline is extended.
-      # -- The actual value used is guided by runtime statistics collected by the pubsub client library.
-      "minDurationPerAckExtension": "60 seconds"
-      "maxDurationPerAckExtension": "600 seconds"
+      # -- Pubsub ack deadlines are extended for this duration when needed.
+      # -- A sensible value is double the size of the "windowing" config parameter, but no higher than 10 minutes.
+      "durationPerAckExtension": "600 seconds"
+
+      # -- Controls when ack deadlines are re-extended, for a message that is close to exceeding its ack deadline.
+      # -- For example, if `durationPerAckExtension` is `600 seconds` and `minRemainingAckDeadline` is `0.1` then the Source
+      # -- will wait until there is `60 seconds` left of the remaining deadline, before re-extending the message deadline.
+      "minRemainingAckDeadline": 0.1
+
+      # -- How many pubsub messages to pull from the server in a single request.
+      "maxMessagesPerPull": 1000
+
+      # -- Adds an artificial delay between consecutive requests to pubsub for more messages.
+      # -- Under some circumstances, this was found to slightly alleviate a problem in which pubsub might re-deliver
+      # -- the same messages multiple times.
+      "debounceRequests": "100 millis"
     }

     "output": {

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Environment.scala (+1 −1)

@@ -78,7 +78,7 @@ object Environment {
       windowing <- Resource.eval(EventProcessingConfig.TimedWindows.build(config.main.windowing, config.main.numEagerWindows))
       lakeWriter <- LakeWriter.build(config.main.spark, config.main.output.good)
       lakeWriterWrapped = LakeWriter.withHandledErrors(lakeWriter, appHealth, config.main.retries, destinationSetupErrorCheck)
-      metrics <- Resource.eval(Metrics.build(config.main.monitoring.metrics))
+      metrics <- Resource.eval(Metrics.build(config.main.monitoring.metrics, sourceAndAck))
       cpuParallelism = chooseCpuParallelism(config.main)
     } yield Environment(
       appInfo = appInfo,

modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Metrics.scala (+14 −5)

@@ -10,13 +10,15 @@

 package com.snowplowanalytics.snowplow.lakes

+import cats.Functor
 import cats.effect.Async
 import cats.effect.kernel.Ref
 import cats.implicits._
 import fs2.Stream

 import scala.concurrent.duration.{Duration, FiniteDuration}

+import com.snowplowanalytics.snowplow.sources.SourceAndAck
 import com.snowplowanalytics.snowplow.runtime.{Metrics => CommonMetrics}

 trait Metrics[F[_]] {

@@ -32,8 +34,8 @@ trait Metrics[F[_]] {

 object Metrics {

-  def build[F[_]: Async](config: Config.Metrics): F[Metrics[F]] =
-    Ref[F].of(State.empty).map(impl(config, _))
+  def build[F[_]: Async](config: Config.Metrics, sourceAndAck: SourceAndAck[F]): F[Metrics[F]] =
+    Ref.ofEffect(State.initialize(sourceAndAck)).map(impl(config, _, sourceAndAck))

   private case class State(
     received: Int,

@@ -53,11 +55,18 @@ object Metrics {
   }

   private object State {
-    def empty: State = State(0, 0, 0, Duration.Zero, None, None)
+    def initialize[F[_]: Functor](sourceAndAck: SourceAndAck[F]): F[State] =
+      sourceAndAck.currentStreamLatency.map { latency =>
+        State(0, 0, 0, latency.getOrElse(Duration.Zero), None, None)
+      }
   }

-  private def impl[F[_]: Async](config: Config.Metrics, ref: Ref[F, State]): Metrics[F] =
-    new CommonMetrics[F, State](ref, State.empty, config.statsd) with Metrics[F] {
+  private def impl[F[_]: Async](
+    config: Config.Metrics,
+    ref: Ref[F, State],
+    sourceAndAck: SourceAndAck[F]
+  ): Metrics[F] =
+    new CommonMetrics[F, State](ref, State.initialize(sourceAndAck), config.statsd) with Metrics[F] {
       def addReceived(count: Int): F[Unit] =
         ref.update(s => s.copy(received = s.received + count))
       def addBad(count: Int): F[Unit] =
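The notable part of this change is that the initial metrics state is now effectful: `Ref.ofEffect` seeds the latency value from `sourceAndAck.currentStreamLatency`, so a window that processes no events (e.g. a stuck stream) still reports a latency. A minimal, self-contained cats-effect sketch of that seeding pattern, using a stand-in effect rather than the real `SourceAndAck`:

    import cats.effect.{IO, IOApp, Ref}
    import scala.concurrent.duration._

    object SeededRefExample extends IOApp.Simple {

      // Stand-in for sourceAndAck.currentStreamLatency: the latency of the oldest unprocessed message, if known
      val currentStreamLatency: IO[Option[FiniteDuration]] =
        IO.pure(Some(42.seconds))

      val run: IO[Unit] =
        for {
          // Ref.ofEffect evaluates the effect to produce the initial state,
          // unlike Ref[IO].of which only accepts a pure value
          ref    <- Ref.ofEffect(currentStreamLatency.map(_.getOrElse(Duration.Zero)))
          seeded <- ref.get
          _      <- IO.println(s"initial latency gauge: $seeded")
        } yield ()
    }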

modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/MockEnvironment.scala (+3)

@@ -135,6 +135,9 @@ object MockEnvironment {

       def isHealthy(maxAllowedProcessingLatency: FiniteDuration): IO[SourceAndAck.HealthStatus] =
         IO.pure(SourceAndAck.Healthy)
+
+      def currentStreamLatency: IO[Option[FiniteDuration]] =
+        IO.pure(None)
     }

   private def testSink(ref: Ref[IO, Vector[Action]]): Sink[IO] = Sink[IO] { batch =>

modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/TestSparkEnvironment.scala (+3)

@@ -70,6 +70,9 @@ object TestSparkEnvironment {

       def isHealthy(maxAllowedProcessingLatency: FiniteDuration): IO[SourceAndAck.HealthStatus] =
         IO.pure(SourceAndAck.Healthy)
+
+      def currentStreamLatency: IO[Option[FiniteDuration]] =
+        IO.pure(None)
     }

   private def testHttpClient: Client[IO] = Client[IO] { _ =>

modules/gcp/src/main/resources/application.conf (+1)

@@ -10,6 +10,7 @@
   "input": ${snowplow.defaults.sources.pubsub}
   "input": {
     "gcpUserAgent": ${gcpUserAgent}
+    "durationPerAckExtension": "600 seconds"
   }
   "output": {
     "bad": ${snowplow.defaults.sinks.pubsub}

project/Dependencies.scala (+1 −1)

@@ -49,7 +49,7 @@ object Dependencies {
     val awsRegistry = "1.1.20"

     // Snowplow
-    val streams = "0.8.2-M1"
+    val streams = "0.10.0-M2"
     val igluClient = "4.0.0"

     // Transitive overrides
