Skip to content

Commit 3b014e4

Browse files
authored
fix: filter non-state entries from report (#1022)
Due to a previous bug, invalid states were being recorded into redis. This was causing too many metrics being created in prometheus. Add a filter to prevent those invalid states from being reported. In addition, autoendpoint would not properly start up due to timeouts possibly related to frequent retries for reliability data. Instead of retrying, just fail the operation and report the state transition to the logs for further analysis. Closes [PUSH-570](https://mozilla-hub.atlassian.net/browse/PUSH-570) [PUSH-570]: https://mozilla-hub.atlassian.net/browse/PUSH-570?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent 236244e commit 3b014e4

File tree

2 files changed

+28
-10
lines changed

2 files changed

+28
-10
lines changed

autoendpoint/src/extractors/notification.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ pub struct Notification {
3030
/// The current state the message was in (if tracked)
3131
pub reliable_state: Option<autopush_common::reliability::ReliabilityState>,
3232
#[cfg(feature = "reliable_report")]
33-
#[cfg(feature = "reliable_report")]
3433
pub reliability_id: Option<String>,
3534
}
3635

autopush-common/src/reliability.rs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ pub const COUNTS: &str = "state_counts";
2929
pub const EXPIRY: &str = "expiry";
3030

3131
const CONNECTION_EXPIRATION: TimeDelta = TimeDelta::seconds(10);
32-
const NO_EXPIRATION: u64 = 0;
32+
// Minimum expiration period of 1 second.
33+
// This was set to `0`, but there was some confusion whether that would not set an
34+
// expiration time for a record or would set a record not to expire.
35+
const MIN_EXPIRATION: u64 = 1;
3336

3437
/// The various states that a message may transit on the way from reception to delivery.
3538
// Note: "Message" in this context refers to the Subscription Update.
@@ -270,15 +273,15 @@ impl PushReliability {
270273
if new == ReliabilityState::Received {
271274
trace!(
272275
"🔍 Creating new record {state_key} ex {:?}",
273-
expr.unwrap_or(NO_EXPIRATION)
276+
expr.unwrap_or(MIN_EXPIRATION)
274277
);
275278
// we can't perform this in a transaction because we can only increment if the set succeeds,
276279
// and values aren't returned when creating values in transactions. In order to do this
277280
// from inside the transaction, you would need to create a function, and that feels a bit
278281
// too heavy for this.
279282
// Create the new `state.{id}` key if it does not exist, and set the expiration.
280283
let options = redis::SetOptions::default()
281-
.with_expiration(redis::SetExpiry::EX(expr.unwrap_or(NO_EXPIRATION)))
284+
.with_expiration(redis::SetExpiry::EX(expr.unwrap_or(MIN_EXPIRATION)))
282285
.conditional_set(redis::ExistenceCheck::NX);
283286
trace!("🔍 ⭕ SET {state_key} NX EX {:?}", new);
284287
let result = conn
@@ -290,10 +293,14 @@ impl PushReliability {
290293
ApcErrorKind::GeneralError("Could not create the state key".to_owned())
291294
})?;
292295
if result != redis::Value::Okay {
293-
error!(
294-
"🔍⚠️ Tried to recreate state_key {state_key}: {:?}",
295-
&result
296-
);
296+
// Redis returned a `Nil`, indicating that there was some error. The only thing that should cause that
297+
// would be if the `old` state was reset to `None` and we thought we needed to create a new state.
298+
// Since the message carries it's prior state, it shouldn't be set to `None` unless there's something
299+
// strange going on.
300+
// TODO: It's worth noting that when restarting autoendpoint, we get a large number of these in the logs.
301+
// Need to figure out the reason for that.
302+
// The `result` is always `nil` so that's not helpful.
303+
error!("🔍⚠️ Tried to recreate state_key {state_key}: {old:?} => {new:?}",);
297304
return Err(
298305
ApcErrorKind::GeneralError("Tried to recreate state_key".to_string()).into(),
299306
);
@@ -383,9 +390,14 @@ impl PushReliability {
383390
// in the pipe, which may vary due to the current state).
384391
// This could also be strung together as a cascade of functions, but it's broken
385392
// out to discrete steps for readability.
393+
/* On prod, we get a large number of these errors, which I think might be clogging
394+
up the redis connections, causing servers to report as degraded.
395+
*/
386396
if result == redis::Value::Nil {
387-
warn!("🔍⚠🪈 {id} - Pipe failed, retry.");
388-
return Ok(None);
397+
warn!("🔍⚠🪈 {id} - Pipe failed, skipping retry.");
398+
// temporarily just let things fail to handle autoendpoint degradation.
399+
// return Ok(None);
400+
return Ok(Some(redis::Value::Okay));
389401
}
390402
if let Some(operations) = result.as_sequence() {
391403
// We have responses, the first items report the state of the commands,
@@ -566,6 +578,11 @@ pub fn gen_report(values: HashMap<String, i32>) -> Result<String> {
566578
family.clone(),
567579
);
568580
for (milestone, value) in values.into_iter() {
581+
// prevent any stray leakage of invalid state data
582+
if ReliabilityState::from_str(&milestone).is_err() {
583+
trace!("🔍 skipping invalid state {milestone:?}");
584+
continue;
585+
}
569586
// Specify the static "state" label name with the given milestone, and add the
570587
// value as the gauge value.
571588
family
@@ -609,13 +626,15 @@ mod tests {
609626
report.insert(ReliabilityState::Stored.to_string(), 222);
610627
report.insert(ReliabilityState::Retrieved.to_string(), 333);
611628
report.insert(trns.clone(), 444);
629+
report.insert("biginvalid".to_string(), -1);
612630

613631
let generated = gen_report(report).unwrap();
614632
// We don't really care if the `Created` or `HELP` lines are included
615633
assert!(generated.contains(&format!("# TYPE {METRIC_NAME}")));
616634
// sample the first and last values.
617635
assert!(generated.contains(&format!("{METRIC_NAME}{{state=\"{recv}\"}} 111")));
618636
assert!(generated.contains(&format!("{METRIC_NAME}{{state=\"{trns}\"}} 444")));
637+
assert!(!generated.contains(&format!("{METRIC_NAME}{{state=\"biginvalid\"}} -1")));
619638
}
620639

621640
#[test]

0 commit comments

Comments
 (0)