Skip to content

Commit 306c97a

Browse files
committed
[WIP] common-streams 0.8.x with refactored health monitoring
1 parent 27e44b6 commit 306c97a

File tree

24 files changed

+287
-675
lines changed

24 files changed

+287
-675
lines changed

config/config.azure.reference.hocon

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,15 +136,17 @@
136136
}
137137
}
138138

139-
# -- Report alerts to the webhook
139+
# -- Report alerts and heartbeats to the webhook
140140
"webhook": {
141141
# An actual HTTP endpoint
142142
"endpoint": "https://webhook.acme.com",
143143
# Set of arbitrary key-value pairs attached to the payload
144144
"tags": {
145145
"pipeline": "production"
146146
}
147-
}
147+
# How often to send the heartbeat event
148+
"heartbeat": "60.minutes"
149+
}
148150
}
149151

150152
# -- Optional, configure telemetry

config/config.kinesis.reference.hocon

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,16 @@
157157
}
158158
}
159159

160-
# -- Report alerts to the webhook
160+
# -- Report alerts and heartbeats to the webhook
161161
"webhook": {
162162
# An actual HTTP endpoint
163163
"endpoint": "https://webhook.acme.com",
164164
# Set of arbitrary key-value pairs attached to the payload
165165
"tags": {
166166
"pipeline": "production"
167167
}
168+
# How often to send the heartbeat event
169+
"heartbeat": "60.minutes"
168170
}
169171
}
170172

config/config.pubsub.reference.hocon

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
# -- pubsub subscription for the source of enriched events
44
"subscription": "projects/myproject/subscriptions/snowplow-enriched"
55

6-
# -- How many threads are used by the pubsub client library for fetching events
7-
"parallelPullCount": 3
6+
# -- Controls how many threads are used internally by the pubsub client library for fetching events.
7+
# -- The number of threads is equal to this factor multiplied by the number of available CPU cores
8+
"parallelPullFactor": 0.5
89

910
# -- How many bytes can be buffered by the loader app before blocking the pubsub client library
1011
# -- from fetching more events.
@@ -137,15 +138,17 @@
137138
}
138139
}
139140

140-
# -- Report alerts to the webhook
141+
# -- Report alerts and heartbeats to the webhook
141142
"webhook": {
142143
# An actual HTTP endpoint
143144
"endpoint": "https://webhook.acme.com",
144145
# Set of arbitrary key-value pairs attached to the payload
145146
"tags": {
146147
"pipeline": "production"
147148
}
148-
}
149+
# How often to send the heartbeat event
150+
"heartbeat": "60.minutes"
151+
}
149152
}
150153

151154
# -- Optional, configure telemetry

modules/core/src/main/resources/reference.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
"prefix": "snowplow.snowflake-loader"
4444
}
4545
}
46+
"webhook": ${snowplow.defaults.webhook}
4647
"sentry": {
4748
"tags": {
4849
}

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Alert.scala

Lines changed: 12 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -10,78 +10,27 @@
1010

1111
package com.snowplowanalytics.snowplow.snowflake
1212

13+
import cats.implicits._
1314
import cats.Show
1415
import cats.implicits.showInterpolator
15-
import com.snowplowanalytics.iglu.core.circe.implicits.igluNormalizeDataJson
16-
import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData}
17-
import com.snowplowanalytics.snowplow.runtime.AppInfo
18-
import io.circe.Json
19-
import io.circe.syntax.EncoderOps
2016

21-
import java.sql.SQLException
17+
import com.snowplowanalytics.snowplow.runtime.SetupExceptionMessages
2218

2319
sealed trait Alert
2420
object Alert {
2521

26-
/** Restrict the length of an alert message to be compliant with alert iglu schema */
27-
private val MaxAlertPayloadLength = 4096
28-
29-
final case class FailedToCreateEventsTable(cause: Throwable) extends Alert
30-
final case class FailedToAddColumns(columns: List[String], cause: Throwable) extends Alert
31-
final case class FailedToOpenSnowflakeChannel(cause: Throwable) extends Alert
32-
final case class FailedToParsePrivateKey(cause: Throwable) extends Alert
22+
final case class FailedToCreateEventsTable(cause: SetupExceptionMessages) extends Alert
23+
final case class FailedToAddColumns(columns: List[String], cause: SetupExceptionMessages) extends Alert
24+
final case class FailedToOpenSnowflakeChannel(cause: SetupExceptionMessages) extends Alert
25+
final case class FailedToParsePrivateKey(cause: SetupExceptionMessages) extends Alert
3326
final case class TableIsMissingAtomicColumn(columnName: String) extends Alert
3427

35-
def toSelfDescribingJson(
36-
alert: Alert,
37-
appInfo: AppInfo,
38-
tags: Map[String, String]
39-
): Json =
40-
SelfDescribingData(
41-
schema = SchemaKey("com.snowplowanalytics.monitoring.loader", "alert", "jsonschema", SchemaVer.Full(1, 0, 0)),
42-
data = Json.obj(
43-
"appName" -> appInfo.name.asJson,
44-
"appVersion" -> appInfo.version.asJson,
45-
"message" -> getMessage(alert).asJson,
46-
"tags" -> tags.asJson
47-
)
48-
).normalize
49-
50-
private def getMessage(alert: Alert): String = {
51-
val full = alert match {
52-
case FailedToCreateEventsTable(cause) => show"Failed to create events table: $cause"
53-
case FailedToAddColumns(columns, cause) => show"Failed to add columns: ${columns.mkString("[", ",", "]")}. Cause: $cause"
54-
case FailedToOpenSnowflakeChannel(cause) => show"Failed to open Snowflake channel: $cause"
55-
case FailedToParsePrivateKey(cause) => show"Failed to parse private key: $cause"
56-
case TableIsMissingAtomicColumn(colName) => show"Table is missing required column $colName"
57-
}
58-
59-
full.take(MaxAlertPayloadLength)
28+
implicit def showAlert: Show[Alert] = Show {
29+
case FailedToCreateEventsTable(cause) => show"Failed to create events table: $cause"
30+
case FailedToAddColumns(columns, cause) => show"Failed to add columns: ${columns.mkString("[", ",", "]")}. Cause: $cause"
31+
case FailedToOpenSnowflakeChannel(cause) => show"Failed to open Snowflake channel: $cause"
32+
case FailedToParsePrivateKey(cause) => show"Failed to parse private key: $cause"
33+
case TableIsMissingAtomicColumn(colName) => show"Table is missing required column $colName"
6034
}
6135

62-
private implicit def throwableShow: Show[Throwable] = {
63-
def removeDuplicateMessages(in: List[String]): List[String] =
64-
in match {
65-
case h :: t :: rest =>
66-
if (h.contains(t)) removeDuplicateMessages(h :: rest)
67-
else if (t.contains(h)) removeDuplicateMessages(t :: rest)
68-
else h :: removeDuplicateMessages(t :: rest)
69-
case fewer => fewer
70-
}
71-
72-
def accumulateMessages(t: Throwable): List[String] = {
73-
val nextMessage = t match {
74-
case t: SQLException => Some(s"${t.getMessage} = SqlState: ${t.getSQLState}")
75-
case t => Option(t.getMessage)
76-
}
77-
Option(t.getCause) match {
78-
case Some(cause) => nextMessage.toList ::: accumulateMessages(cause)
79-
case None => nextMessage.toList
80-
}
81-
}
82-
83-
Show.show { t =>
84-
removeDuplicateMessages(accumulateMessages(t)).mkString(": ")
85-
}
86-
}
8736
}

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/AppHealth.scala

Lines changed: 0 additions & 85 deletions
This file was deleted.

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
package com.snowplowanalytics.snowplow.snowflake
1212

1313
import cats.Id
14-
import cats.syntax.either._
1514
import io.circe.Decoder
1615
import io.circe.generic.extras.semiauto._
1716
import io.circe.generic.extras.Configuration
@@ -23,9 +22,8 @@ import com.snowplowanalytics.iglu.core.circe.CirceIgluCodecs.schemaCriterionDeco
2322

2423
import scala.concurrent.duration.FiniteDuration
2524
import scala.util.Try
26-
import com.snowplowanalytics.snowplow.runtime.{Metrics => CommonMetrics, Telemetry}
25+
import com.snowplowanalytics.snowplow.runtime.{Metrics => CommonMetrics, Retrying, Telemetry, Webhook}
2726
import com.snowplowanalytics.snowplow.runtime.HealthProbe.decoders._
28-
import org.http4s.{ParseFailure, Uri}
2927

3028
case class Config[+Source, +Sink](
3129
input: Source,
@@ -90,17 +88,15 @@ object Config {
9088
metrics: Metrics,
9189
sentry: Option[Sentry],
9290
healthProbe: HealthProbe,
93-
webhook: Option[Webhook]
91+
webhook: Webhook.Config
9492
)
9593

96-
final case class Webhook(endpoint: Uri, tags: Map[String, String])
97-
9894
case class SetupErrorRetries(delay: FiniteDuration)
9995
case class TransientErrorRetries(delay: FiniteDuration, attempts: Int)
10096

10197
case class Retries(
102-
setupErrors: SetupErrorRetries,
103-
transientErrors: TransientErrorRetries
98+
setupErrors: Retrying.Config.ForSetup,
99+
transientErrors: Retrying.Config.ForTransient
104100
)
105101

106102
implicit def decoder[Source: Decoder, Sink: Decoder]: Decoder[Config[Source, Sink]] = {
@@ -125,15 +121,9 @@ object Config {
125121
case SentryM(None, _) =>
126122
None
127123
}
128-
implicit val http4sUriDecoder: Decoder[Uri] =
129-
Decoder[String].emap(s => Either.catchOnly[ParseFailure](Uri.unsafeFromString(s)).leftMap(_.toString))
130-
131124
implicit val metricsDecoder = deriveConfiguredDecoder[Metrics]
132125
implicit val healthProbeDecoder = deriveConfiguredDecoder[HealthProbe]
133-
implicit val webhookDecoder = deriveConfiguredDecoder[Webhook]
134126
implicit val monitoringDecoder = deriveConfiguredDecoder[Monitoring]
135-
implicit val setupRetries = deriveConfiguredDecoder[SetupErrorRetries]
136-
implicit val transientRetries = deriveConfiguredDecoder[TransientErrorRetries]
137127
implicit val retriesDecoder = deriveConfiguredDecoder[Retries]
138128
deriveConfiguredDecoder[Config[Source, Sink]]
139129
}

modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111
package com.snowplowanalytics.snowplow.snowflake
1212

13+
import cats.implicits._
1314
import cats.effect.unsafe.implicits.global
1415
import cats.effect.{Async, Resource}
1516
import com.snowplowanalytics.iglu.core.SchemaCriterion
16-
import com.snowplowanalytics.snowplow.runtime.{AppInfo, HealthProbe}
17+
import com.snowplowanalytics.snowplow.runtime.{AppHealth, AppInfo, HealthProbe, Webhook}
1718
import com.snowplowanalytics.snowplow.sinks.Sink
18-
import com.snowplowanalytics.snowplow.snowflake.AppHealth.Service
1919
import com.snowplowanalytics.snowplow.snowflake.processing.{Channel, TableManager}
2020
import com.snowplowanalytics.snowplow.sources.SourceAndAck
2121
import org.http4s.blaze.client.BlazeClientBuilder
@@ -29,19 +29,14 @@ case class Environment[F[_]](
2929
tableManager: TableManager[F],
3030
channel: Channel.Provider[F],
3131
metrics: Metrics[F],
32-
appHealth: AppHealth[F],
32+
appHealth: AppHealth.Interface[F, Alert, RuntimeService],
3333
batching: Config.Batching,
3434
schemasToSkip: List[SchemaCriterion],
3535
badRowMaxSize: Int
3636
)
3737

3838
object Environment {
3939

40-
private val initialAppHealth: Map[Service, Boolean] = Map(
41-
Service.Snowflake -> false,
42-
Service.BadSink -> true
43-
)
44-
4540
def fromConfig[F[_]: Async, SourceConfig, SinkConfig](
4641
config: Config[SourceConfig, SinkConfig],
4742
appInfo: AppInfo,
@@ -51,18 +46,16 @@ object Environment {
5146
for {
5247
_ <- Sentry.capturingAnyException(appInfo, config.monitoring.sentry)
5348
sourceAndAck <- Resource.eval(toSource(config.input))
54-
appHealth <- Resource.eval(AppHealth.init(config.monitoring.healthProbe.unhealthyLatency, sourceAndAck, initialAppHealth))
55-
_ <- HealthProbe.resource(
56-
config.monitoring.healthProbe.port,
57-
appHealth.status()
58-
)
49+
sourceReporter = sourceAndAck.isHealthy(config.monitoring.healthProbe.unhealthyLatency).map(_.showIfUnhealthy)
50+
appHealth <- Resource.eval(AppHealth.init[F, Alert, RuntimeService](List(sourceReporter)))
5951
httpClient <- BlazeClientBuilder[F].withExecutionContext(global.compute).resource
60-
monitoring <- Monitoring.create[F](config.monitoring.webhook, appInfo, httpClient)
52+
_ <- HealthProbe.resource(config.monitoring.healthProbe.port, appHealth)
53+
_ <- Webhook.resource(config.monitoring.webhook, appInfo, httpClient, appHealth)
6154
badSink <- toSink(config.output.bad.sink)
6255
metrics <- Resource.eval(Metrics.build(config.monitoring.metrics))
63-
tableManager <- Resource.eval(TableManager.make(config.output.good, appHealth, config.retries, monitoring))
64-
channelOpener <- Channel.opener(config.output.good, config.batching, config.retries, monitoring, appHealth)
65-
channelProvider <- Channel.provider(channelOpener, config.retries, appHealth, monitoring)
56+
tableManager <- Resource.eval(TableManager.make(config.output.good, appHealth, config.retries))
57+
channelOpener <- Channel.opener(config.output.good, config.batching, config.retries, appHealth)
58+
channelProvider <- Channel.provider(channelOpener, config.retries, appHealth)
6659
} yield Environment(
6760
appInfo = appInfo,
6861
source = sourceAndAck,

0 commit comments

Comments
 (0)