From 46508938b9c123014c2b85b2a06a32bbf13df4da Mon Sep 17 00:00:00 2001
From: Ian Streeter
Date: Wed, 27 Mar 2024 13:19:15 +0000
Subject: [PATCH 1/4] Use TIMESTAMP_NTZ types when creating the table (#36)

Previously we used the `TIMESTAMP` column type. In Snowflake, this is
_probably_ an alias for `TIMESTAMP_NTZ`, but actually it depends on whether
the account owner has changed the default of the `TIMESTAMP_TYPE_MAPPING`
session parameter.

We should pick a concrete timestamp type and use it explicitly for all new
tables. This will prevent future confusion where we are not 100% sure which
timestamp type was used when creating the table.

This change was supposed to happen in the first release, but it got
overlooked.
---
 .../processing/TableManager.scala | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
index 80c4233..5bc21c0 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
@@ -108,16 +108,16 @@ object TableManager {
       CREATE TABLE IF NOT EXISTS identifier($tableName) (
         app_id VARCHAR,
         platform VARCHAR,
-        etl_tstamp TIMESTAMP,
-        collector_tstamp TIMESTAMP NOT NULL,
-        dvce_created_tstamp TIMESTAMP,
+        etl_tstamp TIMESTAMP_NTZ,
+        collector_tstamp TIMESTAMP_NTZ NOT NULL,
+        dvce_created_tstamp TIMESTAMP_NTZ,
         event VARCHAR,
         event_id VARCHAR NOT NULL UNIQUE,
         txn_id INTEGER,
         name_tracker VARCHAR,
         v_tracker VARCHAR,
-        v_collector VARCHAR NOT NULL,
-        v_etl VARCHAR NOT NULL,
+        v_collector VARCHAR NOT NULL,
+        v_etl VARCHAR NOT NULL,
         user_id VARCHAR,
         user_ipaddress VARCHAR,
         user_fingerprint VARCHAR,
@@ -223,18 +223,18 @@ object TableManager {
         mkt_clickid VARCHAR,
         mkt_network VARCHAR,
         etl_tags VARCHAR,
-        dvce_sent_tstamp TIMESTAMP,
+        dvce_sent_tstamp TIMESTAMP_NTZ,
         refr_domain_userid VARCHAR,
-        refr_dvce_tstamp TIMESTAMP,
+        refr_dvce_tstamp TIMESTAMP_NTZ,
         domain_sessionid VARCHAR,
-        derived_tstamp TIMESTAMP,
+        derived_tstamp TIMESTAMP_NTZ,
         event_vendor VARCHAR,
         event_name VARCHAR,
         event_format VARCHAR,
         event_version VARCHAR,
         event_fingerprint VARCHAR,
-        true_tstamp TIMESTAMP,
-        load_tstamp TIMESTAMP,
+        true_tstamp TIMESTAMP_NTZ,
+        load_tstamp TIMESTAMP_NTZ,
         CONSTRAINT event_id_pk PRIMARY KEY(event_id)
       )
     """

From 885125db3be6bdfb7b06a6a5cd68b62bb9f00923 Mon Sep 17 00:00:00 2001
From: Ian Streeter
Date: Fri, 5 Apr 2024 11:13:00 +0100
Subject: [PATCH 2/4] Missing atomic column should be treated as a setup error (#38)

Before this PR, the loader would crash immediately if the events table was
missing a required column like `load_tstamp`. It is better if the loader
stays alive long enough to send an informative message to the monitoring
webhook.
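The handling added here boils down to the following pattern (a minimal
sketch, assuming the `AppHealth` and `Monitoring` interfaces from this
codebase; the helper name is illustrative, not part of the patch):

```scala
import cats.effect.Async
import cats.syntax.all._

// Mark the Snowflake service unhealthy, send one alert to the monitoring
// webhook, then block the fiber instead of crashing, so the health probe
// reports unhealthy without spamming Sentry with exceptions.
def blockOnSetupError[F[_]: Async](
  appHealth: AppHealth[F],
  monitoring: Monitoring[F],
  alert: Alert
): F[Nothing] =
  appHealth.setServiceHealth(AppHealth.Service.Snowflake, false) *>
    monitoring.alert(alert) *>
    Async[F].never
```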
---
 .../Alert.scala                   | 27 +++++++---
 .../processing/TableManager.scala | 52 +++++++++++++++----
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Alert.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Alert.scala
index 949a039..ba88cca 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Alert.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Alert.scala
@@ -30,6 +30,7 @@ object Alert {
   final case class FailedToAddColumns(columns: List[String], cause: Throwable) extends Alert
   final case class FailedToOpenSnowflakeChannel(cause: Throwable) extends Alert
   final case class FailedToParsePrivateKey(cause: Throwable) extends Alert
+  final case class TableIsMissingAtomicColumn(columnName: String) extends Alert
 
   def toSelfDescribingJson(
     alert: Alert,
@@ -52,25 +53,35 @@ object Alert {
       case FailedToAddColumns(columns, cause) => show"Failed to add columns: ${columns.mkString("[", ",", "]")}. Cause: $cause"
       case FailedToOpenSnowflakeChannel(cause) => show"Failed to open Snowflake channel: $cause"
       case FailedToParsePrivateKey(cause) => show"Failed to parse private key: $cause"
+      case TableIsMissingAtomicColumn(colName) => show"Table is missing required column $colName"
     }
 
     full.take(MaxAlertPayloadLength)
   }
 
   private implicit def throwableShow: Show[Throwable] = {
-    def go(acc: List[String], next: Throwable): String = {
-      val nextMessage = next match {
+    def removeDuplicateMessages(in: List[String]): List[String] =
+      in match {
+        case h :: t :: rest =>
+          if (h.contains(t)) removeDuplicateMessages(h :: rest)
+          else if (t.contains(h)) removeDuplicateMessages(t :: rest)
+          else h :: removeDuplicateMessages(t :: rest)
+        case fewer => fewer
+      }
+
+    def accumulateMessages(t: Throwable): List[String] = {
+      val nextMessage = t match {
         case t: SQLException => Some(s"${t.getMessage} = SqlState: ${t.getSQLState}")
         case t => Option(t.getMessage)
       }
-      val msgs = nextMessage.filterNot(msg => acc.headOption.contains(msg)) ++: acc
-
-      Option(next.getCause) match {
-        case Some(cause) => go(msgs, cause)
-        case None => msgs.reverse.mkString(": ")
+      Option(t.getCause) match {
+        case Some(cause) => nextMessage.toList ::: accumulateMessages(cause)
+        case None => nextMessage.toList
       }
     }
 
-    Show.show(go(Nil, _))
+    Show.show { t =>
+      removeDuplicateMessages(accumulateMessages(t)).mkString(": ")
+    }
   }
 }
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
index 5bc21c0..7d6b650 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
@@ -19,6 +19,8 @@ import net.snowflake.client.jdbc.SnowflakeSQLException
 import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 
+import com.snowplowanalytics.snowplow.loaders.transform.AtomicFields
+
 import scala.util.matching.Regex
 
 trait TableManager[F[_]] {
@@ -50,8 +52,11 @@ object TableManager {
 
       override def addColumns(columns: List[String]): F[Unit] =
         SnowflakeRetrying.withRetries(appHealth, retriesConfig, monitoring, Alert.FailedToAddColumns(columns, _)) {
-          Logger[F].info(s"Altering table to add columns [${columns.mkString(", ")}]") *>
-            executeAddColumnsQuery(columns)
+          for {
+            _ <- Logger[F].info(s"Altering table to add columns [${columns.mkString(", ")}]")
+            addable <- columns.traverse(addableColumn(appHealth, monitoring, _))
+            _ <- executeAddColumnsQuery(addable)
+          } yield ()
         }
 
       def executeInitTableQuery(): F[Unit] = {
@@ -68,31 +73,58 @@ object TableManager {
         }
       }
 
-      def executeAddColumnsQuery(columns: List[String]): F[Unit] =
+      def executeAddColumnsQuery(columns: List[AddableColumn]): F[Unit] =
         transactor.rawTrans.apply {
           columns.traverse_ { column =>
             sqlAlterTable(config, column).update.run.void
               .recoverWith {
                 case e: SnowflakeSQLException if e.getErrorCode === 1430 =>
-                  Logger[ConnectionIO].info(show"Column already exists: $column")
+                  Logger[ConnectionIO].info(show"Column already exists: ${column.name}")
               }
           }
         }
     }
 
+  private sealed trait AddableColumn {
+    def name: String
+  }
+  private object AddableColumn {
+    case class UnstructEvent(name: String) extends AddableColumn
+    case class Contexts(name: String) extends AddableColumn
+  }
+
+  private def addableColumn[F[_]: Async](
+    appHealth: AppHealth[F],
+    monitoring: Monitoring[F],
+    name: String
+  ): F[AddableColumn] =
+    name match {
+      case reUnstruct() => Sync[F].pure(AddableColumn.UnstructEvent(name))
+      case reContext() => Sync[F].pure(AddableColumn.Contexts(name))
+      case other if AtomicFields.withLoadTstamp.exists(_.name === other) =>
+        Logger[F].error(s"Table is missing required field $name. Will do nothing but wait for loader to be killed") *>
+          appHealth.setServiceHealth(AppHealth.Service.Snowflake, false) *>
+          // This is a type of "setup" error, so we send a monitoring alert
+          monitoring.alert(Alert.TableIsMissingAtomicColumn(name)) *>
+          // We don't want to crash and exit, because we don't want to spam Sentry with exceptions about setup errors.
+          // But there's no point in continuing or retrying. Instead we just block the fiber so the health probe appears unhealthy.
+          Async[F].never
+      case other =>
+        Sync[F].raiseError(new IllegalStateException(s"Cannot alter table to add unrecognized column $other"))
+    }
+
   private val reUnstruct: Regex = "^unstruct_event_.*$".r
   private val reContext: Regex = "^contexts_.*$".r
 
-  private def sqlAlterTable(config: Config.Snowflake, colName: String): Fragment = {
+  private def sqlAlterTable(config: Config.Snowflake, addableColumn: AddableColumn): Fragment = {
     val tableName = fqTableName(config)
-    val colType = colName match {
-      case reUnstruct() => "OBJECT"
-      case reContext() => "ARRAY"
-      case other => throw new IllegalStateException(s"Cannot alter table to add column $other")
+    val colType = addableColumn match {
+      case AddableColumn.UnstructEvent(_) => "OBJECT"
+      case AddableColumn.Contexts(_) => "ARRAY"
     }
     val colTypeFrag = Fragment.const0(colType)
-    val colNameFrag = Fragment.const0(colName)
+    val colNameFrag = Fragment.const0(addableColumn.name)
     sql"""
       ALTER TABLE identifier($tableName)
       ADD COLUMN $colNameFrag $colTypeFrag

From 7f245df59d372dba27152c944847f1fa510a7620 Mon Sep 17 00:00:00 2001
From: Ian Streeter
Date: Fri, 5 Apr 2024 11:13:12 +0100
Subject: [PATCH 3/4] Treat all client errors as setup errors when opening channel (#37)

Snowflake responds with a 4xx response code for various reasons when opening
a channel:

- User does not have permissions (403)
- This table type does not support channels (400)
- There are probably others I have not seen yet!

Before this PR, we treated only the 403 as a setup error, but actually all
4xx responses are probably setup errors. We should treat them gracefully by
staying alive and sending an informative message to the monitoring webhook.
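The classification reduces to a small predicate over the response code from
the Snowflake ingest SDK. A minimal sketch, reusing the
`CausedByIngestResponseException` extractor already present in this codebase:

```scala
// Any 4xx from the ingest SDK counts as a setup error; 5xx and anything
// else falls through to the normal retry path.
def isClientError(t: Throwable): Boolean =
  t match {
    case CausedByIngestResponseException(ire) =>
      ire.getErrorCode >= 400 && ire.getErrorCode < 500
    case _ => false
  }
```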
---
 .../processing/SnowflakeRetrying.scala | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
index 57de785..b2efd78 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
@@ -59,8 +59,11 @@ object SnowflakeRetrying {
 
   /** Is an error associated with setting up Snowflake as a destination */
   private def isSetupError[F[_]: Sync](t: Throwable): F[Boolean] = t match {
-    case CausedByIngestResponseException(ire) if ire.getErrorCode === 403 =>
-      true.pure[F]
+    case CausedByIngestResponseException(ire) =>
+      if (ire.getErrorCode >= 400 && ire.getErrorCode < 500)
+        true.pure[F]
+      else
+        false.pure[F]
     case _: SecurityException =>
       // Authentication failure, i.e. user unrecognized or bad private key
       true.pure[F]

From 5c378624da88c5143f1f28ef811b398abd06c107 Mon Sep 17 00:00:00 2001
From: Ian Streeter
Date: Fri, 5 Apr 2024 15:14:59 +0100
Subject: [PATCH 4/4] Prepare for 0.2.2 release

---
 CHANGELOG | 6 ++++++
 README.md | 8 ++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index cf783fd..1e055c5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+Version 0.2.2 (2024-04-05)
+--------------------------
+Treat all client errors as setup errors when opening channel (#37)
+Missing atomic column should be treated as a setup error (#38)
+Use TIMESTAMP_NTZ types when creating the table (#36)
+
 Version 0.2.1 (2024-02-26)
 --------------------------
 Change the channel’s app name to Snowplow_Streaming (#34)
diff --git a/README.md b/README.md
index dca0c41..eaf2c58 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Basic usage:
 ```bash
 docker run \
   -v /path/to/config.hocon:/var/config.hocon \
-  snowplow/snowflake-loader-kafka:0.2.1 \
+  snowplow/snowflake-loader-kafka:0.2.2 \
   --config /var/config.hocon
 ```
 
@@ -30,7 +30,7 @@ The GCP snowflake loader reads the stream of enriched events from Pubsub.
 ```bash
 docker run \
   -v /path/to/config.hocon:/var/config.hocon \
-  snowplow/snowflake-loader-pubsub:0.2.1 \
+  snowplow/snowflake-loader-pubsub:0.2.2 \
   --config /var/config.hocon
 ```
 
@@ -41,7 +41,7 @@ The AWS snowflake loader reads the stream of enriched events from Kinesis.
 ```bash
 docker run \
   -v /path/to/config.hocon:/var/config.hocon \
-  snowplow/snowflake-loader-kinesis:0.2.1 \
+  snowplow/snowflake-loader-kinesis:0.2.2 \
   --config /var/config.hocon
 ```
 
@@ -71,7 +71,7 @@ Licensed under the [Snowplow Limited Use License Agreement][license]. _(If you a
 
 [build-image]: https://github.com/snowplow-incubator/snowplow-snowflake-streaming-loader/workflows/CI/badge.svg
 [build]: https://github.com/snowplow-incubator/snowplow-snowflake-streaming-loader/actions/workflows/ci.yml
-[release-image]: https://img.shields.io/badge/release-0.2.1-blue.svg?style=flat
+[release-image]: https://img.shields.io/badge/release-0.2.2-blue.svg?style=flat
 [releases]: https://github.com/snowplow-incubator/snowplow-snowflake-streaming-loader/releases
 [license]: https://docs.snowplow.io/limited-use-license-1.0