
Commit e658a2f

jrconlin and pjenvey authored

feat: Improve endpoint reliability check (#991)

* Add some hinting to `health` to report vapid key signatures (to ensure that key values are propagating)
* Add db check to the autoendpoint `health` check
* Try a fix for off counts in `internal_record`
* Wrap the action in a transaction (with retries)
* Add missing old-state removal (not sure when/how that got dropped)
* Add a unit test for `internal_record`
* Add a `LOCK_` prefix to the redis lock record (because otherwise it's confusing)

---------

Co-authored-by: Philip Jenvey <[email protected]>
1 parent 4e1e708 commit e658a2f
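The "hinting" in the first bullet means reporting only a short prefix of each tracking key's base64url encoding: enough to confirm which keys a node is running, without disclosing the key material. A minimal standalone sketch of the idea follows; the helper name `hint_key` is hypothetical, the real change uses `autopush_common::util::b64_encode_url` (see the health.rs diff below), and this sketch substitutes the `base64` crate's URL-safe engine and assumes unpadded output.

use base64::engine::general_purpose::URL_SAFE_NO_PAD;
use base64::Engine;

/// Hypothetical stand-in for the real helper: return the first 8 characters
/// of a key's URL-safe base64 encoding. Enough to tell keys apart across
/// nodes, not enough to reveal the key itself.
fn hint_key(key: &[u8]) -> String {
    URL_SAFE_NO_PAD.encode(key).chars().take(8).collect()
}

fn main() {
    let keys: &[&[u8]] = &[b"example-tracking-key-1", b"example-tracking-key-2"];
    let hints: Vec<String> = keys.iter().map(|k| hint_key(k)).collect();
    // Mirrors the "OK: <hint>,<hint>" string the health endpoint now emits.
    println!("OK: {}", hints.join(","));
}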

File tree

4 files changed: +202 -48 lines changed

Cargo.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default.

autoendpoint/src/routes/health.rs

Lines changed: 46 additions & 11 deletions
@@ -11,9 +11,10 @@ use serde_json::json;
 
 use crate::error::{ApiErrorKind, ApiResult};
 use crate::server::AppState;
-use autopush_common::db::error::DbResult;
 use autopush_common::metric_name::MetricName;
 use autopush_common::metrics::StatsdClientExt;
+use autopush_common::util::b64_encode_url;
+use autopush_common::{db::error::DbResult, errors::ApcError};
 
 /// Handle the `/health` and `/__heartbeat__` routes
 pub async fn health_route(state: Data<AppState>) -> Json<serde_json::Value> {
@@ -24,7 +25,19 @@ pub async fn health_route(state: Data<AppState>) -> Json<serde_json::Value> {
     routers.insert("fcm", state.fcm_router.active());
 
     let mut health = json!({
-        "status": "OK",
+        "status": if state
+            .db
+            .health_check()
+            .await
+            .map_err(|e| {
+                error!("Autoendpoint health error: {:?}", e);
+                e
+            })
+            .is_ok() {
+            "OK"
+        } else {
+            "ERROR"
+        },
         "version": env!("CARGO_PKG_VERSION"),
         "router_table": router_health,
         "message_table": message_health,
@@ -33,15 +46,37 @@
 
     #[cfg(feature = "reliable_report")]
     {
-        health["reliability"] = json!(state.reliability.health_check().await.unwrap_or_else(|e| {
-            state
-                .metrics
-                .incr_with_tags(MetricName::ReliabilityErrorRedisUnavailable)
-                .with_tag("application", "autoendpoint")
-                .send();
-            error!("🔍🟥 Reliability reporting down: {:?}", e);
-            "ERROR"
-        }));
+        let reliability_health: Result<String, ApcError> = state
+            .reliability
+            .health_check()
+            .await
+            .map(|_| {
+                let keys: Vec<String> = state
+                    .settings
+                    .tracking_keys()
+                    .unwrap_or_default()
+                    .iter()
+                    .map(|k|
+                        // Hint the key values
+                        b64_encode_url(k)[..8].to_string())
+                    .collect();
+                if keys.is_empty() {
+                    Ok("NO_TRACKING_KEYS".to_owned())
+                } else {
+                    Ok(format!("OK: {}", keys.join(",")))
+                }
+            })
+            .unwrap_or_else(|e| {
+                // Record that Redis is down.
+                state
+                    .metrics
+                    .incr_with_tags(MetricName::ReliabilityErrorRedisUnavailable)
+                    .with_tag("application", "autoendpoint")
+                    .send();
+                error!("🔍🟥 Reliability reporting down: {:?}", e);
+                Ok("STORE_ERROR".to_owned())
+            });
+        health["reliability"] = json!(reliability_health);
     }
     Json(health)
 }
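For reference, a healthy response from this route would now look roughly as follows. This is a hand-built illustration, not output captured from the service: the version, table values, and key hints are placeholders, and the `{"Ok": ...}` wrapper assumes serde's default externally-tagged serialization of `Result`.

use serde_json::json;

// Hypothetical payload shape for GET /health after this change.
fn example_health_payload() -> serde_json::Value {
    json!({
        "status": "OK",        // "ERROR" when state.db.health_check() fails
        "version": "1.0.0",    // env!("CARGO_PKG_VERSION") at build time
        "router_table": true,  // placeholder for router_health
        "message_table": true, // placeholder for message_health
        // With reliable_report enabled: "NO_TRACKING_KEYS", "STORE_ERROR",
        // or "OK:" followed by the 8-character hint of each tracking key.
        "reliability": {"Ok": "OK: BAbCdEfG,BHiJkLmN"},
    })
}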

autopush-common/src/reliability.rs

Lines changed: 147 additions & 28 deletions
@@ -126,7 +126,7 @@ impl PushReliability {
         })
     }
 
-    // Record the record state change to storage.
+    // Record the message state change to storage.
     pub async fn record(
         &self,
         reliability_id: &Option<String>,
@@ -146,46 +146,81 @@
                 new
             );
             match pool.get().await {
-                Ok(mut conn) => self.internal_record(&mut conn, old, new, expr, id).await,
-                Err(e) => warn!("🔍⚠️ Unable to record reliability state, {:?}", e),
-            }
+                Ok(mut conn) => {
+                    let _ = self
+                        .internal_record(&mut conn, old, new, expr, id)
+                        .await
+                        .inspect_err(|e| {
+                            warn!("🔍⚠️ Unable to record reliability state: {:?}", e);
+                        });
+                }
+                Err(e) => warn!("🔍⚠️ Unable to get reliability state pool, {:?}", e),
+            };
         };
         // Errors are not fatal, and should not impact message flow, but
         // we should record them somewhere.
         let _ = self.db.log_report(id, new).await.inspect_err(|e| {
-            warn!("🔍⚠️ Unable to record reliability state: {:?}", e);
+            warn!("🔍⚠️ Unable to record reliability state log: {:?}", e);
         });
         Some(new)
     }
 
-    pub(crate) async fn internal_record<C: ConnectionLike>(
+    pub(crate) async fn internal_record<C: ConnectionLike + AsyncCommands>(
         &self,
         conn: &mut C,
         old: &Option<ReliabilityState>,
         new: ReliabilityState,
         expr: Option<u64>,
         id: &str,
-    ) {
-        let mut pipeline = redis::pipe();
-        pipeline.hincr(COUNTS, new.to_string(), 1);
-
-        if let Some(old) = old {
-            pipeline.hincr(COUNTS, old.to_string(), -1);
-        };
-        // Errors are not fatal, and should not impact message flow, but
-        // we should record them somewhere.
-        if !new.is_terminal() {
-            // Write the expiration only if the state is non-terminal. Otherwise we run the risk of
-            // messages reporting a false "expired" state even if they were "successful".
-            let cc = pipeline
-                .zadd(EXPIRY, format!("{}#{}", new, id), expr.unwrap_or_default())
-                .exec_async(conn)
-                .await
-                .inspect_err(|e| {
-                    warn!("🔍 Failed to write to storage: {:?}", e);
-                });
-            trace!("🔍 internal record result: {:?}", cc);
-        }
+    ) -> Result<()> {
+        trace!(
+            "🔍 internal record: {} from {} to {}",
+            id,
+            old.map(|v| v.to_string())
+                .unwrap_or_else(|| "None".to_owned()),
+            new
+        );
+        crate::redis_util::transaction(
+            conn,
+            &[COUNTS, EXPIRY],
+            self.retries,
+            || ApcErrorKind::GeneralError("Exceeded reliability record retry attempts".to_owned()),
+            async |conn, pipe| {
+                // First, increment the new state.
+                pipe.hincr(COUNTS, new.to_string(), 1);
+                // If there is an old state, decrement it.
+                if let Some(old_state) = old {
+                    pipe.hincr(COUNTS, old_state.to_string(), -1);
+                    // Remove the old state from the expiry set, if it exists.
+                    // There should only be one message at a given state in the `expiry`
+                    // table, since we only use that table to track messages that may expire.
+                    // (We decrement "expired" messages in the `gc` function, so having
+                    // messages in multiple states may decrement counts incorrectly.)
+                    let key = format!("{}#{}", id, old_state);
+                    pipe.zrem(EXPIRY, &key);
+                    trace!("🔍 internal remove old state: {:?}", key);
+                }
+                if !new.is_terminal() {
+                    // Write the expiration only if the state is non-terminal. Otherwise we run the risk of
+                    // messages reporting a false "expired" state even if they were "successful".
+                    let key = format!("{}#{}", id, new);
+                    let _ = pipe.zadd(EXPIRY, &key, expr.unwrap_or_default());
+                    trace!("🔍 internal record result: {:?}", key);
+                }
+                // `exec_async` returns `RedisResult<()>`;
+                // `query_async` returns `RedisResult<Option<redis::Value>>`.
+                // We really don't care about the returned result here, but `transaction`
+                // retries if we return Ok(None), so we run the exec and return
+                // a nonce `Some` value.
+                // The turbofish is a fallback for edition 2024.
+                pipe.query_async::<()>(conn).await.inspect_err(|e| {
+                    warn!("🔍 Redis internal storage error: {:?}", e);
+                })?;
+                Ok(Some(redis::Value::Okay))
+            },
+        )
+        .await?;
+        Ok(())
     }
 
     /// Perform a garbage collection cycle on a reliability object.
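The diff calls into `crate::redis_util::transaction`, whose implementation is not part of this commit. For readers unfamiliar with the pattern, here is a minimal sketch of the optimistic WATCH/MULTI/EXEC retry loop such a helper conventionally wraps; the signature, `max_retries` handling, and error mapping below are illustrative, not the actual `redis_util` API (which, per the diff, also threads an async closure and maps exhaustion to `ApcErrorKind::GeneralError`).

use redis::aio::ConnectionLike;
use redis::{Pipeline, RedisResult, Value};

// Sketch of an optimistic Redis transaction with retries. WATCH marks the
// keys; if another client writes them before EXEC, EXEC replies Nil and we
// loop around for another attempt.
async fn transaction_sketch<C, F>(
    conn: &mut C,
    keys: &[&str],
    max_retries: usize,
    mut body: F,
) -> RedisResult<Value>
where
    C: ConnectionLike,
    F: FnMut(&mut Pipeline),
{
    for _ in 0..max_retries {
        redis::cmd("WATCH").arg(keys).query_async::<()>(conn).await?;
        let mut pipe = redis::pipe();
        pipe.atomic(); // wrap the queued commands in MULTI ... EXEC
        body(&mut pipe);
        // Nil from EXEC (None here) means a watched key changed: retry.
        if let Some(result) = pipe.query_async::<Option<Value>>(conn).await? {
            return Ok(result);
        }
    }
    // Give up: clear any outstanding WATCH and report the failure.
    redis::cmd("UNWATCH").query_async::<()>(conn).await?;
    Err((redis::ErrorKind::TryAgain, "exceeded retry attempts").into())
}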
@@ -443,12 +478,96 @@ mod tests {
         .await;
 
         // and mock the redis call.
-        pr.internal_record(&mut conn, &old, new, Some(expr), &test_id)
+        let _ = pr
+            .internal_record(&mut conn, &old, new, Some(expr), &test_id)
             .await;
 
         Ok(())
     }
 
+    //*
+    #[actix_rt::test]
+    async fn test_push_reliabilty_record() -> Result<()> {
+        let db = crate::db::mock::MockDbClient::new();
+        let test_id = format!("TEST_VALUE_{}", Uuid::new_v4());
+        let new = ReliabilityState::Stored;
+        let old = ReliabilityState::Received;
+        let expr = 1;
+
+        let metrics = Arc::new(StatsdClient::builder("", cadence::NopMetricSink).build());
+        let new_key = format!("{}#{}", &test_id, &new);
+        let old_key = format!("{}#{}", &test_id, &old);
+        let mut mock_pipe = redis::Pipeline::new();
+        mock_pipe
+            .cmd("MULTI")
+            .ignore()
+            .cmd("HINCRBY")
+            .arg(COUNTS)
+            .arg(new.to_string())
+            .arg(1)
+            .ignore()
+            .cmd("HINCRBY")
+            .arg(COUNTS)
+            .arg(old.to_string())
+            .arg(-1)
+            .ignore()
+            .cmd("ZREM")
+            .arg(EXPIRY)
+            .arg(old_key)
+            .ignore()
+            .cmd("ZADD")
+            .arg(EXPIRY)
+            .arg(new_key)
+            .ignore()
+            .cmd("EXEC")
+            .ignore();
+
+        let mut conn = MockRedisConnection::new(vec![
+            MockCmd::new(
+                redis::cmd("WATCH").arg(COUNTS).arg(EXPIRY),
+                Ok(redis::Value::Okay),
+            ),
+            // NOTE: Technically, since we `.ignore()` these, we could just have a
+            // Vec containing just `Okay`. I'm being a bit pedantic here because I know
+            // that this will come back to haunt me if I'm not, and because figuring out
+            // the proper response for this was annoying.
+            MockCmd::new(
+                mock_pipe,
+                Ok(redis::Value::Array(vec![
+                    redis::Value::Okay,
+                    // Match the number of commands that are being held for processing.
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    redis::Value::SimpleString("QUEUED".to_owned()),
+                    // the exec has been called, return an array containing the results.
+                    redis::Value::Array(vec![
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                        redis::Value::Okay,
+                    ]),
+                ])),
+            ),
+            // If the transaction fails, this should return a redis::Value::Nil
+            MockCmd::new(redis::cmd("UNWATCH"), Ok(redis::Value::Okay)),
+        ]);
+
+        // test the main report function (note, this does not test redis)
+        let pr = PushReliability::new(
+            &None,
+            Box::new(Arc::new(db)),
+            &metrics,
+            crate::redis_util::MAX_TRANSACTION_LOOP,
+        )
+        .unwrap();
+        let _ = pr
+            .internal_record(&mut conn, &Some(old), new, Some(expr), &test_id)
+            .await;
+
+        Ok(())
+    }
+    // */
     #[actix_rt::test]
     async fn test_push_reliability_gc() -> Result<()> {
         let db = crate::db::mock::MockDbClient::new();
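A note on the mock replies above: Redis acknowledges each command issued between MULTI and EXEC with +QUEUED and then has EXEC return an array with one result per queued command, which is why the mock supplies four QUEUED strings (for the HINCRBY, HINCRBY, ZREM, and ZADD) followed by a four-element result array. Had a WATCHed key changed in the meantime, EXEC would instead reply Nil, which is the signal that drives the retry loop in `internal_record`.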

scripts/reliability/reliability_report.py

Lines changed: 1 addition & 1 deletion
@@ -386,7 +386,7 @@ async def terminal_snapshot(self) -> Dict[str, int]:
 
     async def get_lock(self) -> bool:
         """Use RedLock locking"""
-        lock_name = datetime.now().isoformat()
+        lock_name = f"LOCK_{datetime.now().isoformat()}"
         # set the default hold time fairly short, we'll extend the lock later if we succeed.
         self.lock = self.redis.lock(lock_name, timeout=self.settings.lock_acquire_time)
         # Fail the lock check quickly.
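The prefix is purely for operability: a RedLock-style lock named after a bare ISO-8601 timestamp is indistinguishable from a data key when inspecting Redis, whereas a key like `LOCK_2025-01-01T00:00:00` (format shown for illustration) is self-describing. The locking behavior itself is unchanged.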
