Skip to content

Commit 791e875

Browse files
authored
feat: Clean up terminal states (#899)
Address potential problem where final message states may grow without bounds. This also updates the `reliability_report.py` to write to a Google Storage Bucket. Closes: #PUSH-12 / #895
1 parent 1c47ad6 commit 791e875

File tree

17 files changed

+829
-545
lines changed

17 files changed

+829
-545
lines changed

.circleci/config.yml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ executors:
7171
auth:
7272
username: $DOCKER_USER
7373
password: $DOCKER_PASS
74+
build-reliability-cron:
75+
docker:
76+
- image: docker:18.03.0-ce
77+
auth:
78+
username: $DOCKER_USER
79+
password: $DOCKER_PASS
7480
build-test-container-executor:
7581
docker:
7682
- image: cimg/base:2025.02
@@ -363,6 +369,39 @@ jobs:
363369
paths:
364370
- <<parameters.image>>.tar
365371

372+
# Create the reliability cron docker image. This is a singleton that does some
373+
# clean-up and reporting for the Push Reliability task.
374+
build-reliability-cron:
375+
executor: build-reliability-cron
376+
resource_class: small
377+
working_directory: /app
378+
parameters:
379+
image:
380+
type: string
381+
tag:
382+
type: string
383+
steps:
384+
# Install these packages before checkout because git may not exist or work
385+
- run:
386+
name: Install Docker build dependencies
387+
command: apk add --no-cache openssh-client git clang
388+
- checkout
389+
- setup_remote_docker
390+
- docker_login
391+
- run:
392+
name: Build cron docker image
393+
command: |
394+
docker build --tag <<parameters.image>>:<<parameters.tag>> -f ./scripts/reliability/Dockerfile ./scripts/reliability
395+
- run:
396+
name: Save cron docker image
397+
command: |
398+
mkdir -p /tmp/cron_cache
399+
docker save -o /tmp/cron_cache/<<parameters.image>>.tag <<parameters.image>>
400+
- persist_to_workspace:
401+
root: /tmp/cron_cache
402+
paths:
403+
- <<parameters.image>>.tag
404+
366405
build-test-container:
367406
executor: build-test-container-executor
368407
parameters:
@@ -515,6 +554,17 @@ workflows:
515554
filters:
516555
tags:
517556
only: /.*/
557+
558+
- build-reliability-cron:
559+
name: build-reliability-cron
560+
image: autopush-reliability-cron
561+
tag: latest
562+
filters:
563+
tags:
564+
only: /.*/
565+
# branches:
566+
# only: master
567+
518568
- build-test-container:
519569
name: Build Load Test Image
520570
image: autopush-load-tests

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,8 @@ autoconnect_ws = { path = "./autoconnect/autoconnect-ws" }
108108
autoconnect_ws_clientsm = { path = "./autoconnect/autoconnect-ws/autoconnect-ws-clientsm" }
109109
autopush_common = { path = "./autopush-common", features = ["bigtable"] }
110110

111+
[workspace.lints.clippy]
112+
result_large_err = "allow" # 1.87.0 will flag ApiError as too large.
113+
111114
[profile.release]
112115
debug = 1

autoconnect/autoconnect-settings/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ autopush_common.workspace = true
2626
bigtable = ["autopush_common/bigtable"]
2727
emulator = ["bigtable"]
2828
reliable_report = ["autopush_common/reliable_report"]
29+
30+
[lints]
31+
workspace = true

autoconnect/autoconnect-ws/autoconnect-ws-sm/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,6 @@ autoconnect_common = { workspace = true, features = ["test-support"] }
3333

3434
[features]
3535
reliable_report = []
36+
37+
[lints]
38+
workspace = true

autoendpoint/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,6 @@ stub = []
6868
log_vapid = []
6969

7070
reliable_report = ["autopush_common/reliable_report"]
71+
72+
[lints]
73+
workspace = true

autoendpoint/src/routers/apns/router.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ impl Router for ApnsRouter {
501501
// mutable, but we are also essentially consuming the
502502
// notification nothing else should modify it.
503503
notification
504-
.record_reliability(&self.reliability, ReliabilityState::Transmitted)
504+
.record_reliability(&self.reliability, ReliabilityState::BridgeTransmitted)
505505
.await;
506506
}
507507

autoendpoint/src/routers/fcm/router.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ impl Router for FcmRouter {
199199
// mutable, but we are also essentially consuming the
200200
// notification nothing else should modify it.
201201
notification
202-
.record_reliability(&self.reliability, ReliabilityState::Transmitted)
202+
.record_reliability(&self.reliability, ReliabilityState::BridgeTransmitted)
203203
.await;
204204
// Sent successfully, update metrics and make response
205205
trace!("Send request was successful");

autoendpoint/src/settings.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ impl Default for Settings {
9999
stub: StubSettings::default(),
100100
#[cfg(feature = "reliable_report")]
101101
reliability_dsn: None,
102+
#[cfg(feature = "reliable_report")]
102103
reliability_retry_count: autopush_common::redis_util::MAX_TRANSACTION_LOOP,
103104
}
104105
}

autopush-common/src/reliability.rs

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ use deadpool_redis::Config;
1414
use prometheus_client::{
1515
encoding::text::encode, metrics::family::Family, metrics::gauge::Gauge, registry::Registry,
1616
};
17-
use redis::aio::ConnectionLike;
18-
use redis::AsyncCommands;
17+
use redis::{aio::ConnectionLike, AsyncCommands};
1918

2019
use crate::db::client::DbClient;
2120
use crate::errors::{ApcErrorKind, Result};
@@ -46,18 +45,36 @@ const CONNECTION_EXPIRATION: TimeDelta = TimeDelta::seconds(10);
4645
#[serde(rename_all = "snake_case")]
4746
#[strum(serialize_all = "snake_case")]
4847
pub enum ReliabilityState {
49-
Received, // Message was received by the Push Server
50-
Stored, // Message was stored because it could not be delivered immediately
51-
Retrieved, // Message was taken from storage for delivery
52-
IntTransmitted, // Message was handed off between autoendpoint and autoconnect
53-
IntAccepted, // Message was accepted by autoconnect from autopendpoint
54-
Transmitted, // Message was handed off for delivery to the UA
55-
Accepted, // Message was accepted for delivery by the UA
56-
Delivered, // Message was provided to the WebApp recipient by the UA
57-
DecryptionError, // Message was provided to the UA and it reported a decryption error
58-
NotDelivered, // Message was provided to the UA and it reported a not delivered error
59-
Expired, // Message expired naturally (e.g. TTL=0)
60-
Errored, // Message resulted in an Error state and failed to be delivered.
48+
Received, // Message was received by the Push Server
49+
Stored, // Message was stored because it could not be delivered immediately
50+
Retrieved, // Message was taken from storage for delivery
51+
IntTransmitted, // Message was handed off between autoendpoint and autoconnect
52+
IntAccepted, // Message was accepted by autoconnect from autopendpoint
53+
BridgeTransmitted, // Message was handed off to a mobile bridge for eventual delivery
54+
Transmitted, // Message was handed off for delivery to the UA
55+
Accepted, // Message was accepted for delivery by the UA
56+
Delivered, // Message was provided to the WebApp recipient by the UA
57+
DecryptionError, // Message was provided to the UA and it reported a decryption error
58+
NotDelivered, // Message was provided to the UA and it reported a not delivered error
59+
Expired, // Message expired naturally (e.g. TTL=0)
60+
Errored, // Message resulted in an Error state and failed to be delivered.
61+
}
62+
63+
impl ReliabilityState {
64+
/// Has the message reached a state where no further transitions should be possible?
65+
pub fn is_terminal(self) -> bool {
66+
// NOTE: this list should match the daily snapshot captured by `reliability_cron.py`
67+
// which will trim these counts after the max message TTL has expired.
68+
matches!(
69+
self,
70+
ReliabilityState::DecryptionError
71+
| ReliabilityState::BridgeTransmitted
72+
| ReliabilityState::Delivered
73+
| ReliabilityState::Errored
74+
| ReliabilityState::Expired
75+
| ReliabilityState::NotDelivered
76+
)
77+
}
6178
}
6279

6380
#[derive(Clone)]
@@ -157,10 +174,18 @@ impl PushReliability {
157174
};
158175
// Errors are not fatal, and should not impact message flow, but
159176
// we should record them somewhere.
160-
pipeline.zadd(EXPIRY, format!("{}#{}", new, id), expr.unwrap_or_default());
161-
let _ = pipeline.exec_async(conn).await.inspect_err(|e| {
162-
warn!("🔍 Failed to write to storage: {:?}", e);
163-
});
177+
if !new.is_terminal() {
178+
// Write the expiration only if the state is non-terminal. Otherwise we run the risk of
179+
// messages reporting a false "expired" state even if they were "successful".
180+
let cc = pipeline
181+
.zadd(EXPIRY, format!("{}#{}", new, id), expr.unwrap_or_default())
182+
.exec_async(conn)
183+
.await
184+
.inspect_err(|e| {
185+
warn!("🔍 Failed to write to storage: {:?}", e);
186+
});
187+
trace!("🔍 internal record result: {:?}", cc);
188+
}
164189
}
165190

166191
/// Perform a garbage collection cycle on a reliability object.

scripts/reliability/Dockerfile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Docker build for Reliability Reporter.
2+
3+
FROM python:3.12-alpine
4+
5+
ADD . /app
6+
WORKDIR /app
7+
# should this be templated? Some have a path here I presume gets loaded via a Docker build.
8+
# (e.g. fxa uses `/app/keys/keyfile_name.json`)
9+
# loaded va the webservices-infra cronjob.yaml:: containers::volumeMounts
10+
VOLUME [ "keys" "/app/keys" ]
11+
ENV GOOGLE_APPLICATION_CREDENTIALS = /app/keys/service-account-key.json
12+
13+
# these will need to be templated in the calling k8s config.
14+
# ENV AUTOEND__DB_SETTINGS = '{"table_name":"projects/test/instance/test/tables/autopush"}'
15+
# ENV AUTOEND__DB_DSN = "grpc:://localhost:8086"
16+
# ENV AUTOEND__RELIABILITY_DSN = "redis:://localhost"
17+
18+
# Generate the report
19+
ENV AUTOTRACK_REPORT_BUCKET_NAME = "autopush_reliability"
20+
ENV AUTOTRACK_OUTPUT="md json"
21+
RUN pip install .
22+
CMD ["python3", "reliability_report.py"]
23+

0 commit comments

Comments
 (0)