1+ # Naming conventions: https://prometheus.io/docs/practices/naming/
2+ # Label rules with `group: longterm-metrics` to archive them by the optional second prometheus instance.
3+
14groups:
25 - name: longterm-metrics-hourly
36 interval: 1h
47 rules:
58
69 # --- CPU ---
7- # Percentage of the time, over the last hour, that all CPUs were working
10+ # Fraction of the time, over the last hour, that all CPUs were working (0-1)
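811 # 1 means all CPUs were working all the time, 0 means they were all idle all the time
# NOTE(review): avg by(instance) also averages across the non-idle `mode` series, so a fully busy host presumably records well below 1 - confirm; `1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))` would match this description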
9- - record: instance:cpu_load:avg_rate1h
12+ - record: instance:node_cpu_seconds:avg_rate1h_not_idle
1013 expr: avg by(instance) (rate(node_cpu_seconds_total{mode!="idle"}[1h]))
1114 labels:
1215 group: longterm-metrics
1316
14- # Percentage of the time that CPUs are waiting for IO
15- - record: instance:cpu_load_iowait:avg_rate1h
17+ # Fraction of the time that CPUs are waiting for IO (0-1)
18+ - record: instance:node_cpu_seconds:avg_rate1h_iowait
1619 expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[1h]))
1720 labels:
1821 group: longterm-metrics
1922
20- # Number of threads in the last hour
21- - record: instance:node_threads_count:last_1h
23+ # Number of threads in the last hour (1)
24+ - record: instance:go_threads:avg1h
2225 expr: avg by(instance) (avg_over_time(go_threads[1h]))
2326 labels:
2427 group: longterm-metrics
2528
2629 # --- Network ---
27- # Bytes sent
28- - record: instance:node_network_transmit_bytes_total :sum_rate1h
30+ # Network transmit rate, averaged over the last hour and summed over all series of the instance (bytes per second)
31+ - record: instance:node_network_transmit_bytes :sum_rate1h
2932 expr: sum by(instance) (rate(node_network_transmit_bytes_total[1h]))
3033 labels:
3134 group: longterm-metrics
3235
33- # Bytes received
34- - record: instance:node_network_receive_bytes_total :sum_rate1h
36+ # Network receive rate, averaged over the last hour and summed over all series of the instance (bytes per second)
37+ - record: instance:node_network_receive_bytes :sum_rate1h
3538 expr: sum by(instance) (rate(node_network_receive_bytes_total[1h]))
3639 labels:
3740 group: longterm-metrics
3841
39- # Total download volume in the last 1 hour logged by the prometheus-log-exporter counter
40- - record: thredds:kb_transfer_size_kb:increase_1h
41- expr: increase(thredds_transfer_size_kb_total [1h])
42+ # Total download volume in the last 1 hour logged by the prometheus-log-exporter counter (bytes)
43+ - record: instance:thredds_transfer_size_bytes:increase1h
44+ expr: increase(thredds_transfer_size_bytes_total [1h])
4245 labels:
4346 group: longterm-metrics
4447
4548 # --- Memory ---
46- # Total memory available in the last hour
47- - record: instance:node_memory_MemAvailable_bytes:last_1h
49+ # Available memory, averaged over the last hour (bytes)
50+ - record: instance:node_memory_MemAvailable_bytes:avg1h
4851 expr: avg by(instance) (avg_over_time(node_memory_MemAvailable_bytes[1h]))
4952 labels:
5053 group: longterm-metrics
5154
52- # Swap memory use
53- - record: instance:node_memory_SwapUsed_percent:last_1h
55+ # Free swap memory, averaged over the last hour (bytes)
56+ - record: instance:node_memory_SwapFree_bytes:avg1h
5457 expr: avg by(instance) (avg_over_time(node_memory_SwapFree_bytes[1h]))
5558 labels:
5659 group: longterm-metrics
5760
5861 # -- Disks ---
59- # Bytes read
62+ # Disk read rate, averaged over the last hour and summed over all series of the instance (bytes per second)
6063 - record: instance:node_disk_read_bytes:sum_rate1h
6164 expr: sum by(instance) (rate(node_disk_read_bytes_total[1h]))
6265 labels:
6366 group: longterm-metrics
6467
65- # Bytes written
68+ # Disk write rate, averaged over the last hour and summed over all series of the instance (bytes per second)
6669 - record: instance:node_disk_written_bytes:sum_rate1h
6770 expr: sum by(instance) (rate(node_disk_written_bytes_total[1h]))
6871 labels:
@@ -71,14 +74,14 @@ groups:
7174 # --- Users ---
7275 # To aggregate user numbers over a month or a year, we need to keep the name labels.
7376
74- # JupyterLab container kb writes to disk in the last hour
75- - record: name:jupyter_writes_to_disk_kb:last_1h
76- expr: (sum by(name) (increase(container_fs_writes_bytes_total{name=~"jupyter-.+"}[1h])) > 0) / 1024
77+ # JupyterLab container writes to disk in the last hour (bytes, filtered on > 0)
78+ - record: jupyter:container_fs_writes_bytes:sum_increase1h
79+ expr: (sum by(name) (increase(container_fs_writes_bytes_total{name=~"jupyter-.+"}[1h])) > 0)
7780 labels:
7881 group: longterm-metrics
7982
80- # JupyterLab container CPU usage (> 1 minute per hour)
81- - record: name:jupyter_cpu_seconds:last_1h
83+ # JupyterLab container CPU usage (seconds, filtered on > 60 seconds per hour)
84+ - record: jupyter:container_cpu_user_seconds:sum_increase1h
8285 expr: (sum by(name) (increase(container_cpu_user_seconds_total{name=~"jupyter-.+"}[1h])) > 60)
8386 labels:
8487 group: longterm-metrics
@@ -87,43 +90,45 @@ groups:
8790 - name: longterm-metrics-daily
8891 interval: 1d
8992 rules:
90- # Note: `avg_over_time ` could be replace by `last_over_time`, the latter having been added in Prometheus 2.26
93+ # Note: `max_over_time` could be replaced by `last_over_time`, the latter having been added in Prometheus 2.26
9194
9295 # --- System ---
93- # Uptime
94- - record: instance:node_boot_time_seconds:last_1d
96+ # Uptime (seconds)
97+ - record: instance:node_boot_time_seconds:max_over_time1d
9598 expr: max by(instance) (time() - max_over_time(node_boot_time_seconds[1d]))
9699 labels:
97100 group: longterm-metrics
98101
99102 # --- Disk space ---
100- # Free disk space in the last 24 hours
101- - record: instance:node_filesystem_free_bytes:last_1d
103+ # Free disk space (bytes)
104+ - record: instance:node_filesystem_free_bytes:avg_min_over_time1d
102105 expr: avg by(instance) (min_over_time(node_filesystem_free_bytes[1d]))
103106 labels:
104107 group: longterm-metrics
105108
106- # Disk size in the last 24 hours
107- - record: instance:node_filesystem_size_bytes:last_1d
109+ # Disk size (bytes)
110+ - record: instance:node_filesystem_size_bytes:avg_max_over_time1d
108111 expr: avg by(instance) (max_over_time(node_filesystem_size_bytes[1d]))
109112 labels:
110113 group: longterm-metrics
111114
112115 # --- RAM ---
113- # Total memory available
114- - record: instance:node_memory_MemTotal_bytes:last_1d
116+ # Total memory size (bytes)
117+ - record: instance:node_memory_MemTotal_bytes:avg_max_over_time1d
115118 expr: avg by(instance) (max_over_time(node_memory_MemTotal_bytes[1d]))
116119 labels:
117120 group: longterm-metrics
118121
119- # Swap memory size
120- - record: instance:node_memory_SwapTotal_bytes:last_1d
122+ # Swap memory size (bytes)
123+ - record: instance:node_memory_SwapTotal_bytes:avg_min_over_time1d
121124 expr: avg by(instance) (min_over_time(node_memory_SwapTotal_bytes[1d]))
122125 labels:
123126 group: longterm-metrics
124127
125- # JupyterLab container open
126- - record: name:jupyter_container_seen:last_1d
127- expr: round(sum by(name) (rate(container_last_seen{name=~"jupyter-.+"}[1d]) > 0.9))
128+ # --- Users ---
129+ # Fraction of time each JupyterLab container was open in the last day (0-1)
130+ # 1: container open all day
131+ - record: jupyter:container_last_seen:sum_rate1d
132+ expr: sum by(name) (rate(container_last_seen{name=~"jupyter-.+"}[1d]) > 0)
128133 labels:
129134 group: longterm-metrics
0 commit comments