Skip to content

Commit 93eb002

Browse files
everton-dematos authored and brianmcgillion committed
logging: add clock-jump recovery and tighten Alloy service ordering
- Add ghaf.logging.recovery options and a shared clock-jump watcher plus a recovery oneshot service.
- Ensure alloy.service is ordered after, and requires, systemd-journald on both client and server.
- Server pipeline: route journald through loki.process, drop entries older than 168h, and align the WAL max_segment_age.

Signed-off-by: Everton de Matos <[email protected]>
1 parent 23c038f commit 93eb002

File tree

4 files changed

+148
-16
lines changed

4 files changed

+148
-16
lines changed

modules/common/logging/client.nix

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,19 +124,24 @@ in
124124

125125
services.alloy.enable = true;
126126

127-
# Copy certs/keys (and optional CA) into /run/credentials/alloy.service/…
128-
systemd.services.alloy.serviceConfig.LoadCredential = [
129-
"client_cert:${cfg.tls.certFile}"
130-
"client_key:${cfg.tls.keyFile}"
131-
]
132-
++ lib.optionals (cfg.tls.caFile != null) [
133-
"client_ca:${cfg.tls.caFile}"
134-
];
127+
systemd.services.alloy = {
128+
after = [ "systemd-journald.service" ]; # unit-level ordering ([Unit] After=); ignored if placed in serviceConfig
129+
requires = [ "systemd-journald.service" ]; # unit-level dependency ([Unit] Requires=); ignored if placed in serviceConfig
130+
131+
# Once alloy.service in admin-vm has stopped, this service keeps
132+
# retrying to send its log batch, so cap the stop timeout to
133+
# force it down quickly.
134+
serviceConfig.TimeoutStopSec = 4;
135135

136-
# Once alloy.service in admin-vm stopped this service will
137-
# still keep on retrying to send logs batch, so we need to
138-
# stop it forcefully.
139-
systemd.services.alloy.serviceConfig.TimeoutStopSec = 4;
136+
# Copy certs/keys (and optional CA) into /run/credentials/alloy.service/…
137+
serviceConfig.LoadCredential = [
138+
"client_cert:${cfg.tls.certFile}"
139+
"client_key:${cfg.tls.keyFile}"
140+
]
141+
++ lib.optionals (cfg.tls.caFile != null) [
142+
"client_ca:${cfg.tls.caFile}"
143+
];
144+
};
140145

141146
ghaf.security.audit.extraRules = [
142147
"-w /etc/alloy/client.alloy -p rwxa -k alloy_client_config"

modules/common/logging/common.nix

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,73 @@
11
# SPDX-FileCopyrightText: 2022-2026 TII (SSRC) and the Ghaf contributors
22
# SPDX-License-Identifier: Apache-2.0
3-
{ lib, ... }:
3+
{
4+
config,
5+
lib,
6+
pkgs,
7+
...
8+
}:
49
let
5-
inherit (lib) mkOption types;
10+
inherit (lib) mkIf mkOption types;
11+
recCfg = config.ghaf.logging.recovery;
12+
13+
ghafClockJumpWatcher = pkgs.writeShellApplication {
14+
name = "ghaf-clock-jump-watcher";
15+
runtimeInputs = with pkgs; [
16+
coreutils
17+
gawk
18+
systemd
19+
];
20+
text = ''
21+
threshold="${toString recCfg.thresholdSeconds}"
22+
interval="${toString recCfg.intervalSeconds}"
23+
24+
last_real="$(date +%s)"
25+
last_up="$(cut -d' ' -f1 /proc/uptime)"
26+
27+
while true; do
28+
sleep "$interval"
29+
real="$(date +%s)"
30+
up="$(cut -d' ' -f1 /proc/uptime)"
31+
32+
drift="$(awk -v r1="$last_real" -v r2="$real" -v u1="$last_up" -v u2="$up" \
33+
'BEGIN{print (r2-r1) - (u2-u1)}')"
34+
35+
abs="$(awk -v d="$drift" 'BEGIN{print (d<0)?-d:d}')"
36+
37+
if awk -v a="$abs" -v t="$threshold" 'BEGIN{exit !(a>=t)}'; then
38+
systemctl start ghaf-journal-alloy-recover.service || true
39+
fi
40+
41+
last_real="$real"
42+
last_up="$up"
43+
done
44+
'';
45+
};
46+
47+
ghafJournalAlloyRecover = pkgs.writeShellApplication {
48+
name = "ghaf-journal-alloy-recover";
49+
runtimeInputs = with pkgs; [
50+
coreutils
51+
systemd
52+
];
53+
text = ''
54+
stamp="/run/ghaf-journal-alloy-recover.stamp" # holds the uptime (whole seconds) of the last recovery run
55+
now="$(cut -d. -f1 /proc/uptime)" # seconds since boot; the wall clock is what just jumped, so it cannot gate the cooldown
56+
cooldown="${toString recCfg.cooldownSeconds}"
57+
58+
if [ -e "$stamp" ]; then
59+
last="$(cat "$stamp" 2>/dev/null || echo 0)"
60+
if [ "$now" -ge "$last" ] && [ "$((now-last))" -lt "$cooldown" ]; then # a stamp in the future (stale/foreign) never suppresses recovery
61+
exit 0
62+
fi
63+
fi
64+
echo "$now" > "$stamp"
65+
66+
systemd-tmpfiles --create --prefix /var/log/journal
67+
systemctl restart systemd-journald.service
68+
systemctl restart alloy.service
69+
'';
70+
};
671
in
772
{
873
# Creating logging configuration options needed across the host and vms
@@ -78,5 +143,60 @@ in
78143
default = "1day";
79144
};
80145
};
146+
147+
recovery = {
148+
enable = mkOption {
149+
description = "Recover journald/alloy after a realtime clock jump (e.g., manual clock change).";
150+
type = types.bool;
151+
default = false;
152+
};
153+
154+
thresholdSeconds = mkOption {
155+
description = "Only act on clock jumps >= this many seconds.";
156+
type = types.int;
157+
default = 30;
158+
};
159+
160+
intervalSeconds = mkOption {
161+
description = "Polling interval used by the clock-jump watcher.";
162+
type = types.int;
163+
default = 5;
164+
};
165+
166+
cooldownSeconds = mkOption {
167+
description = "Minimum time between recover executions.";
168+
type = types.int;
169+
default = 60;
170+
};
171+
};
172+
};
173+
174+
config = mkIf (config.ghaf.logging.enable && recCfg.enable) {
175+
176+
# Watcher: detects realtime jumps by comparing realtime vs monotonic progression
177+
systemd.services.ghaf-clock-jump-watcher = {
178+
description = "Detect realtime clock jumps and trigger journald/alloy recovery";
179+
wantedBy = [ "multi-user.target" ];
180+
181+
serviceConfig = {
182+
Type = "simple";
183+
Restart = "always";
184+
RestartSec = 2;
185+
ExecStart = lib.getExe ghafClockJumpWatcher;
186+
};
187+
};
188+
189+
systemd.services.ghaf-journal-alloy-recover = {
190+
description = "Recover journald/alloy after time jump";
191+
192+
unitConfig = {
193+
StartLimitIntervalSec = "0";
194+
};
195+
196+
serviceConfig = {
197+
Type = "oneshot";
198+
ExecStart = lib.getExe ghafJournalAlloyRecover;
199+
};
200+
};
81201
};
82202
}

modules/common/logging/server.nix

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,12 +177,15 @@ in
177177
stage.drop {
178178
expression = "(GatewayAuthenticator::login|Gateway login succeeded|csd-wrapper|nmcli)"
179179
}
180+
stage.drop {
181+
older_than = "168h"
182+
}
180183
}
181184
182185
loki.source.journal "journal" {
183186
path = "/var/log/journal"
184187
relabel_rules = discovery.relabel.adminJournal.rules
185-
forward_to = [loki.write.remote.receiver]
188+
forward_to = [loki.process.system.receiver]
186189
}
187190
188191
loki.write "remote" {
@@ -205,7 +208,7 @@ in
205208
// system in order to guarantee persistence of acknowledged data.
206209
wal {
207210
enabled = true
208-
max_segment_age = "240h"
211+
max_segment_age = "168h"
209212
drain_timeout = "4s"
210213
}
211214
external_labels = { machine = local.file.macAddress.content }
@@ -229,6 +232,9 @@ in
229232
services.alloy.enable = true;
230233

231234
systemd.services.alloy.serviceConfig = {
235+
after = [ "systemd-journald.service" ];
236+
requires = [ "systemd-journald.service" ];
237+
232238
# If there is no internet connection , shutdown/reboot will take around 100sec
233239
# So, to fix that problem we need to add stop timeout
234240
# https://github.com/grafana/loki/issues/6533

modules/microvm/sysvms/adminvm.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ let
9191
};
9292
};
9393
};
94+
recovery.enable = true;
9495
};
9596

9697
security.fail2ban.enable = configHost.ghaf.development.ssh.daemon.enable;

0 commit comments

Comments
 (0)