
Commit e658a2f

jrconlin and pjenvey authored

feat: Improve endpoint reliability check (#991)

* Add some hinting to `health` to report vapid key signatures (to ensure that key values are propagating)
* Add db check to the autoendpoint `health` check
* Try a fix for off counts in `internal_record`
* Wrap the action in a transaction (with retries)
* Add missing old-state removal (not sure when/how that got dropped)
* Add a unit test for `internal_record`
* Add a `LOCK_` prefix to the redis lock record (because otherwise it's confusing)

---------

Co-authored-by: Philip Jenvey <[email protected]>
1 parent 4e1e708 commit e658a2f
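The "hinting" in the first bullet means reporting only a short prefix of each tracking key's base64url encoding: enough to confirm which keys a node is running, without disclosing the key material. A minimal standalone sketch of the idea follows; the helper name `hint_key` is hypothetical, the real change uses `autopush_common::util::b64_encode_url` (see the health.rs diff below), and this sketch substitutes the `base64` crate's URL-safe engine and assumes unpadded output.

use base64::engine::general_purpose::URL_SAFE_NO_PAD;
use base64::Engine;

/// Hypothetical stand-in for the real helper: return the first 8 characters
/// of a key's URL-safe base64 encoding. Enough to tell keys apart across
/// nodes, not enough to reveal the key itself.
fn hint_key(key: &[u8]) -> String {
    URL_SAFE_NO_PAD.encode(key).chars().take(8).collect()
}

fn main() {
    let keys: &[&[u8]] = &[b"example-tracking-key-1", b"example-tracking-key-2"];
    let hints: Vec<String> = keys.iter().map(|k| hint_key(k)).collect();
    // Mirrors the "OK: <hint>,<hint>" string the health endpoint now emits.
    println!("OK: {}", hints.join(","));
}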

File tree

4 files changed: +202 -48 lines changed

Cargo.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default.

autoendpoint/src/routes/health.rs

Lines changed: 46 additions & 11 deletions
@@ -11,9 +11,10 @@ use serde_json::json;
 
 use crate::error::{ApiErrorKind, ApiResult};
 use crate::server::AppState;
-use autopush_common::db::error::DbResult;
 use autopush_common::metric_name::MetricName;
 use autopush_common::metrics::StatsdClientExt;
+use autopush_common::util::b64_encode_url;
+use autopush_common::{db::error::DbResult, errors::ApcError};
 
 /// Handle the `/health` and `/__heartbeat__` routes
 pub async fn health_route(state: Data<AppState>) -> Json<serde_json::Value> {
@@ -24,7 +25,19 @@ pub async fn health_route(state: Data<AppState>) -> Json<serde_json::Value> {
     routers.insert("fcm", state.fcm_router.active());
 
     let mut health = json!({
-        "status": "OK",
+        "status": if state
+            .db
+            .health_check()
+            .await
+            .map_err(|e| {
+                error!("Autoendpoint health error: {:?}", e);
+                e
+            })
+            .is_ok() {
+            "OK"
+        } else {
+            "ERROR"
+        },
         "version": env!("CARGO_PKG_VERSION"),
         "router_table": router_health,
         "message_table": message_health,
@@ -33,15 +46,37 @@
 
     #[cfg(feature = "reliable_report")]
     {
-        health["reliability"] = json!(state.reliability.health_check().await.unwrap_or_else(|e| {
-            state
-                .metrics
-                .incr_with_tags(MetricName::ReliabilityErrorRedisUnavailable)
-                .with_tag("application", "autoendpoint")
-                .send();
-            error!("🔍🟥 Reliability reporting down: {:?}", e);
-            "ERROR"
-        }));
+        let reliability_health: Result<String, ApcError> = state
+            .reliability
+            .health_check()
+            .await
+            .map(|_| {
+                let keys: Vec<String> = state
+                    .settings
+                    .tracking_keys()
+                    .unwrap_or_default()
+                    .iter()
+                    .map(|k|
+                        // Hint the key values
+                        b64_encode_url(k)[..8].to_string())
+                    .collect();
+                if keys.is_empty() {
+                    Ok("NO_TRACKING_KEYS".to_owned())
+                } else {
+                    Ok(format!("OK: {}", keys.join(",")))
+                }
+            })
+            .unwrap_or_else(|e| {
+                // Record that Redis is down.
+                state
+                    .metrics
+                    .incr_with_tags(MetricName::ReliabilityErrorRedisUnavailable)
+                    .with_tag("application", "autoendpoint")
+                    .send();
+                error!("🔍🟥 Reliability reporting down: {:?}", e);
+                Ok("STORE_ERROR".to_owned())
+            });
+        health["reliability"] = json!(reliability_health);
     }
     Json(health)
 }
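For reference, a healthy response from this route would now look roughly as follows. This is a hand-built illustration, not output captured from the service: the version, table values, and key hints are placeholders, and the `{"Ok": ...}` wrapper assumes serde's default externally-tagged serialization of `Result`.

use serde_json::json;

// Hypothetical payload shape for GET /health after this change.
fn example_health_payload() -> serde_json::Value {
    json!({
        "status": "OK",        // "ERROR" when state.db.health_check() fails
        "version": "1.0.0",    // env!("CARGO_PKG_VERSION") at build time
        "router_table": true,  // placeholder for router_health
        "message_table": true, // placeholder for message_health
        // With reliable_report enabled: "NO_TRACKING_KEYS", "STORE_ERROR",
        // or "OK:" followed by the 8-character hint of each tracking key.
        "reliability": {"Ok": "OK: BAbCdEfG,BHiJkLmN"},
    })
}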

autopush-common/src/reliability.rs

Lines changed: 147 additions & 28 deletions
@@ -126,7 +126,7 @@ impl PushReliability {
         })
     }
 
-    // Record the record state change to storage.
+    // Record the message state change to storage.
     pub async fn record(
         &self,
         reliability_id: &Option<String>,
@@ -146,46 +146,81 @@
                 new
             );
             match pool.get().await {
-                Ok(mut conn) => self.internal_record(&mut conn, old, new, expr, id).await,
-                Err(e) => warn!("🔍⚠️ Unable to record reliability state, {:?}", e),
-            }
+                Ok(mut conn) => {
+                    let _ = self
+                        .internal_record(&mut conn, old, new, expr, id)
+                        .await
+                        .inspect_err(|e| {
+                            warn!("🔍⚠️ Unable to record reliability state: {:?}", e);
+                        });
+                }
+                Err(e) => warn!("🔍⚠️ Unable to get reliability state pool, {:?}", e),
+            };
         };
         // Errors are not fatal, and should not impact message flow, but
         // we should record them somewhere.
         let _ = self.db.log_report(id, new).await.inspect_err(|e| {
-            warn!("🔍⚠️ Unable to record reliability state: {:?}", e);
+            warn!("🔍⚠️ Unable to record reliability state log: {:?}", e);
         });
         Some(new)
     }
 
-    pub(crate) async fn internal_record<C: ConnectionLike>(
+    pub(crate) async fn internal_record<C: ConnectionLike + AsyncCommands>(
         &self,
         conn: &mut C,
         old: &Option<ReliabilityState>,
         new: ReliabilityState,
         expr: Option<u64>,
         id: &str,
-    ) {
-        let mut pipeline = redis::pipe();
-        pipeline.hincr(COUNTS, new.to_string(), 1);
-
-        if let Some(old) = old {
-            pipeline.hincr(COUNTS, old.to_string(), -1);
-        };
-        // Errors are not fatal, and should not impact message flow, but
-        // we should record them somewhere.
-        if !new.is_terminal() {
-            // Write the expiration only if the state is non-terminal. Otherwise we run the risk of
-            // messages reporting a false "expired" state even if they were "successful".
-            let cc = pipeline
-                .zadd(EXPIRY, format!("{}#{}", new, id), expr.unwrap_or_default())
-                .exec_async(conn)
-                .await
-                .inspect_err(|e| {
-                    warn!("🔍 Failed to write to storage: {:?}", e);
-                });
-            trace!("🔍 internal record result: {:?}", cc);
-        }
+    ) -> Result<()> {
+        trace!(
+            "🔍 internal record: {} from {} to {}",
+            id,
+            old.map(|v| v.to_string())
+                .unwrap_or_else(|| "None".to_owned()),
+            new
+        );
+        crate::redis_util::transaction(
+            conn,
+            &[COUNTS, EXPIRY],
+            self.retries,
+            || ApcErrorKind::GeneralError("Exceeded reliability record retry attempts".to_owned()),
+            async |conn, pipe| {
+                // First, increment the new state.
+                pipe.hincr(COUNTS, new.to_string(), 1);
+                // If there is an old state, decrement it.
+                if let Some(old_state) = old {
+                    pipe.hincr(COUNTS, old_state.to_string(), -1);
+                    // Remove the old state from the expiry set, if it exists.
+                    // There should only be one message at a given state in the `expiry`
+                    // table, since we only use that table to track messages that may expire.
+                    // (We decrement "expired" messages in the `gc` function, so having
+                    // messages in multiple states may decrement counts incorrectly.)
+                    let key = format!("{}#{}", id, old_state);
+                    pipe.zrem(EXPIRY, &key);
+                    trace!("🔍 internal remove old state: {:?}", key);
+                }
+                if !new.is_terminal() {
+                    // Write the expiration only if the state is non-terminal. Otherwise we run the risk of
+                    // messages reporting a false "expired" state even if they were "successful".
+                    let key = format!("{}#{}", id, new);
+                    let _ = pipe.zadd(EXPIRY, &key, expr.unwrap_or_default());
+                    trace!("🔍 internal record result: {:?}", key);
+                }
+                // `exec_async` returns `RedisResult<()>`;
+                // `query_async` returns `RedisResult<Option<redis::Value>>`.
+                // We really don't care about the returned result here, but `transaction`
+                // retries if we return Ok(None), so we run the exec and return
+                // a nonce `Some` value.
+                // The turbofish is a fallback for edition 2024.
+                pipe.query_async::<()>(conn).await.inspect_err(|e| {
+                    warn!("🔍 Redis internal storage error: {:?}", e);
+                })?;
+                Ok(Some(redis::Value::Okay))
+            },
+        )
+        .await?;
+        Ok(())
     }
 
     /// Perform a garbage collection cycle on a reliability object.
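The diff calls into `crate::redis_util::transaction`, whose implementation is not part of this commit. For readers unfamiliar with the pattern, here is a minimal sketch of the optimistic WATCH/MULTI/EXEC retry loop such a helper conventionally wraps; the signature, `max_retries` handling, and error mapping below are illustrative, not the actual `redis_util` API (which, per the diff, also threads an async closure and maps exhaustion to `ApcErrorKind::GeneralError`).

use redis::aio::ConnectionLike;
use redis::{Pipeline, RedisResult, Value};

// Sketch of an optimistic Redis transaction with retries. WATCH marks the
// keys; if another client writes them before EXEC, EXEC replies Nil and we
// loop around for another attempt.
async fn transaction_sketch<C, F>(
    conn: &mut C,
    keys: &[&str],
    max_retries: usize,
    mut body: F,
) -> RedisResult<Value>
where
    C: ConnectionLike,
    F: FnMut(&mut Pipeline),
{
    for _ in 0..max_retries {
        redis::cmd("WATCH").arg(keys).query_async::<()>(conn).await?;
        let mut pipe = redis::pipe();
        pipe.atomic(); // wrap the queued commands in MULTI ... EXEC
        body(&mut pipe);
        // Nil from EXEC (None here) means a watched key changed: retry.
        if let Some(result) = pipe.query_async::<Option<Value>>(conn).await? {
            return Ok(result);
        }
    }
    // Give up: clear any outstanding WATCH and report the failure.
    redis::cmd("UNWATCH").query_async::<()>(conn).await?;
    Err((redis::ErrorKind::TryAgain, "exceeded retry attempts").into())
}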
@@ -443,12 +478,96 @@ mod tests {
         .await;
 
         // and mock the redis call.
-        pr.internal_record(&mut conn, &old, new, Some(expr), &test_id)
+        let _ = pr
+            .internal_record(&mut conn, &old, new, Some(expr), &test_id)
             .await;
 
         Ok(())
     }
 
+    //*
+    #[actix_rt::test]
+    async fn test_push_reliabilty_record() -> Result<()> {
+        let db = crate::db::mock::MockDbClient::new();
+        let test_id = format!("TEST_VALUE_{}", Uuid::new_v4());
+        let new = ReliabilityState::Stored;
+        let old = ReliabilityState::Received;
+        let expr = 1;
+
+        let metrics = Arc::new(StatsdClient::builder("", cadence::NopMetricSink).build());
+        let new_key = format!("{}#{}", &test_id, &new);
+        let old_key = format!("{}#{}", &test_id, &old);
+        let mut mock_pipe = redis::Pipeline::new();
+        mock_pipe
+            .cmd("MULTI")
+            .ignore()
+            .cmd("HINCRBY")
+            .arg(COUNTS)
+            .arg(new.to_string())
+            .arg(1)
+            .ignore()
+            .cmd("HINCRBY")
+            .arg(COUNTS)
+            .arg(old.to_string())
+            .arg(-1)
+            .ignore()
+            .cmd("ZREM")
+            .arg(EXPIRY)
+            .arg(old_key)
+            .ignore()
+            .cmd("ZADD")
+            .arg(EXPIRY)
+            .arg(new_key)
+            .ignore()
+            .cmd("EXEC")
+            .ignore();
+
+        let mut conn = MockRedisConnection::new(vec![
+            MockCmd::new(
+                redis::cmd("WATCH").arg(COUNTS).arg(EXPIRY),
+                Ok(redis::Value::Okay),
+            ),
+            // NOTE: Technically, since we `.ignore()` these, we could just have a
+            // Vec containing just `Okay`. I'm being a bit pedantic here because I know
+            // that this will come back to haunt me if I'm not, and because figuring out
+            // the proper response for this was annoying.
+            MockCmd::new(
+                mock_pipe,
+                Ok(redis::Value::Array(vec![
+                    redis::Value::Okay,
+                    // Match the number of commands that are being held for processing.
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    // the exec has been called, return an array containing the results.
+                    redis::Value::Array(vec![
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                    ]),
+                ])),
+            ),
+            // If the transaction fails, this should return a redis::Value::Nil
+            MockCmd::new(redis::cmd("UNWATCH"), Ok(redis::Value::Okay)),
+        ]);
+
+        // test the main report function (note, this does not test redis)
+        let pr = PushReliability::new(
+            &None,
+            Box::new(Arc::new(db)),
+            &metrics,
+            crate::redis_util::MAX_TRANSACTION_LOOP,
+        )
+        .unwrap();
+        let _ = pr
+            .internal_record(&mut conn, &Some(old), new, Some(expr), &test_id)
+            .await;
+
+        Ok(())
+    }
+    // */
     #[actix_rt::test]
     async fn test_push_reliability_gc() -> Result<()> {
         let db = crate::db::mock::MockDbClient::new();
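A note on the mock replies above: Redis acknowledges each command issued between MULTI and EXEC with +QUEUED and then has EXEC return an array with one result per queued command, which is why the mock supplies four QUEUED strings (for the HINCRBY, HINCRBY, ZREM, and ZADD) followed by a four-element result array. Had a WATCHed key changed in the meantime, EXEC would instead reply Nil, which is the signal that drives the retry loop in `internal_record`.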

scripts/reliability/reliability_report.py

Lines changed: 1 addition & 1 deletion
@@ -386,7 +386,7 @@ async def terminal_snapshot(self) -> Dict[str, int]:
 
     async def get_lock(self) -> bool:
         """Use RedLock locking"""
-        lock_name = datetime.now().isoformat()
+        lock_name = f"LOCK_{datetime.now().isoformat()}"
         # set the default hold time fairly short, we'll extend the lock later if we succeed.
         self.lock = self.redis.lock(lock_name, timeout=self.settings.lock_acquire_time)
         # Fail the lock check quickly.
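The prefix is purely for operability: a RedLock-style lock named after a bare ISO-8601 timestamp is indistinguishable from a data key when inspecting Redis, whereas a key like `LOCK_2025-01-01T00:00:00` (format shown for illustration) is self-describing. The locking behavior itself is unchanged.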
