@@ -29,7 +29,10 @@ pub const COUNTS: &str = "state_counts";
29
29
pub const EXPIRY : & str = "expiry" ;
30
30
31
31
const CONNECTION_EXPIRATION : TimeDelta = TimeDelta :: seconds ( 10 ) ;
32
- const NO_EXPIRATION : u64 = 0 ;
32
+ // Minimum expiration period of 1 second.
33
+ // This was set to `0`, but there was some confusion whether that would not set an
34
+ // expiration time for a record or would set a record not to expire.
35
+ const MIN_EXPIRATION : u64 = 1 ;
33
36
34
37
/// The various states that a message may transit on the way from reception to delivery.
35
38
// Note: "Message" in this context refers to the Subscription Update.
@@ -270,15 +273,15 @@ impl PushReliability {
270
273
if new == ReliabilityState :: Received {
271
274
trace ! (
272
275
"🔍 Creating new record {state_key} ex {:?}" ,
273
- expr. unwrap_or( NO_EXPIRATION )
276
+ expr. unwrap_or( MIN_EXPIRATION )
274
277
) ;
275
278
// we can't perform this in a transaction because we can only increment if the set succeeds,
276
279
// and values aren't returned when creating values in transactions. In order to do this
277
280
// from inside the transaction, you would need to create a function, and that feels a bit
278
281
// too heavy for this.
279
282
// Create the new `state.{id}` key if it does not exist, and set the expiration.
280
283
let options = redis:: SetOptions :: default ( )
281
- . with_expiration ( redis:: SetExpiry :: EX ( expr. unwrap_or ( NO_EXPIRATION ) ) )
284
+ . with_expiration ( redis:: SetExpiry :: EX ( expr. unwrap_or ( MIN_EXPIRATION ) ) )
282
285
. conditional_set ( redis:: ExistenceCheck :: NX ) ;
283
286
trace ! ( "🔍 ⭕ SET {state_key} NX EX {:?}" , new) ;
284
287
let result = conn
@@ -290,10 +293,14 @@ impl PushReliability {
290
293
ApcErrorKind :: GeneralError ( "Could not create the state key" . to_owned ( ) )
291
294
} ) ?;
292
295
if result != redis:: Value :: Okay {
293
- error ! (
294
- "🔍⚠️ Tried to recreate state_key {state_key}: {:?}" ,
295
- & result
296
- ) ;
296
+ // Redis returned a `Nil`, meaning the `NX` set was not applied because the key already exists. The only thing that should cause that
297
+ // would be if the `old` state was reset to `None` and we thought we needed to create a new state.
298
+ // Since the message carries its prior state, it shouldn't be set to `None` unless there's something
299
+ // strange going on.
300
+ // TODO: It's worth noting that when restarting autoendpoint, we get a large number of these in the logs.
301
+ // Need to figure out the reason for that.
302
+ // The `result` is always `nil` so that's not helpful.
303
+ error ! ( "🔍⚠️ Tried to recreate state_key {state_key}: {old:?} => {new:?}" , ) ;
297
304
return Err (
298
305
ApcErrorKind :: GeneralError ( "Tried to recreate state_key" . to_string ( ) ) . into ( ) ,
299
306
) ;
@@ -383,9 +390,14 @@ impl PushReliability {
383
390
// in the pipe, which may vary due to the current state).
384
391
// This could also be strung together as a cascade of functions, but it's broken
385
392
// out to discrete steps for readability.
393
+ /* On prod, we get a large number of these errors, which I think might be clogging
394
+ up the redis connections, causing servers to report as degraded.
395
+ */
386
396
if result == redis:: Value :: Nil {
387
- warn ! ( "🔍⚠🪈 {id} - Pipe failed, retry." ) ;
388
- return Ok ( None ) ;
397
+ warn ! ( "🔍⚠🪈 {id} - Pipe failed, skipping retry." ) ;
398
+ // Temporarily report success instead of signalling a retry, to work around autoendpoint degradation.
399
+ // return Ok(None);
400
+ return Ok ( Some ( redis:: Value :: Okay ) ) ;
389
401
}
390
402
if let Some ( operations) = result. as_sequence ( ) {
391
403
// We have responses, the first items report the state of the commands,
@@ -566,6 +578,11 @@ pub fn gen_report(values: HashMap<String, i32>) -> Result<String> {
566
578
family. clone ( ) ,
567
579
) ;
568
580
for ( milestone, value) in values. into_iter ( ) {
581
+ // prevent any stray leakage of invalid state data
582
+ if ReliabilityState :: from_str ( & milestone) . is_err ( ) {
583
+ trace ! ( "🔍 skipping invalid state {milestone:?}" ) ;
584
+ continue ;
585
+ }
569
586
// Specify the static "state" label name with the given milestone, and add the
570
587
// value as the gauge value.
571
588
family
@@ -609,13 +626,15 @@ mod tests {
609
626
report. insert ( ReliabilityState :: Stored . to_string ( ) , 222 ) ;
610
627
report. insert ( ReliabilityState :: Retrieved . to_string ( ) , 333 ) ;
611
628
report. insert ( trns. clone ( ) , 444 ) ;
629
+ report. insert ( "biginvalid" . to_string ( ) , -1 ) ;
612
630
613
631
let generated = gen_report ( report) . unwrap ( ) ;
614
632
// We don't really care if the `Created` or `HELP` lines are included
615
633
assert ! ( generated. contains( & format!( "# TYPE {METRIC_NAME}" ) ) ) ;
616
634
// sample the first and last values.
617
635
assert ! ( generated. contains( & format!( "{METRIC_NAME}{{state=\" {recv}\" }} 111" ) ) ) ;
618
636
assert ! ( generated. contains( & format!( "{METRIC_NAME}{{state=\" {trns}\" }} 444" ) ) ) ;
637
+ assert ! ( !generated. contains( & format!( "{METRIC_NAME}{{state=\" biginvalid\" }} -1" ) ) ) ;
619
638
}
620
639
621
640
#[ test]
0 commit comments