@@ -35,7 +35,6 @@ import json
35
35
import logging
36
36
import os
37
37
import re
38
- import shlex
39
38
import subprocess
40
39
import sys
41
40
import time
@@ -65,6 +64,7 @@ OTEL_COLLECTOR_HEALTH_ENDPOINT = "http://localhost:13133/health"
65
64
# String identifiers shared by the checks in this file. EARLYOOM is the process name
# that check_earlyoom passes to get_process_pid_by_name; the other names presumably
# serve the same purpose for their services (confirm against their call sites).
YB_CONTROLLER = "yb-controller-server"
NODE_EXPORTER = "node_exporter"
CONNECTION_POOLING_MANAGER = "odyssey"
EARLYOOM = "earlyoom"
TOTAL = "total"

ALERT_ENHANCEMENTS_RELEASE_BUILD = "2.6.0.0-b0"
@@ -269,6 +269,9 @@ YB_NODE_UNEXPECTED_MASTER_RUNNING = MetricDefinition(
269
269
# Metric definition (name, help text) for the clockbound synchronization check.
YB_NODE_CLOCKBOUND_STATUS = MetricDefinition(
    "yb_node_clockbound_status",
    "Check whether clockbound is synchronized")
272
# Metric definition (name, help text) populated by NodeChecker.count_earlyoom_kills.
YB_EARLYOOM_KILLS_COUNT = MetricDefinition(
    "yb_earlyoom_kills_count",
    "Number of process kills initiated by earlyoom during last hour")
272
275
273
276
###################################################################################################
274
277
# Reporting
@@ -560,7 +563,8 @@ class NodeChecker():
560
563
master_http_port , tserver_http_port , ysql_server_http_port , node_version ,
561
564
is_ybc_enabled , ybc_port , time_drift_wrn_threshold , time_drift_err_threshold ,
562
565
otel_enabled , temp_output_file , ddl_atomicity_check , master_leader_url ,
563
- master_rpc_port , tserver_rpc_port , verbose , clock_service_required ):
566
+ master_rpc_port , tserver_rpc_port , verbose , clock_service_required ,
567
+ enable_earlyoom ):
564
568
self .node = node
565
569
self .node_name = node_name
566
570
self .node_identifier = node_identifier
@@ -597,6 +601,7 @@ class NodeChecker():
597
601
self .temp_output_file = temp_output_file
598
602
self .ddl_atomicity_check = ddl_atomicity_check
599
603
self .master_leader_url = master_leader_url
604
+ self .enable_earlyoom = enable_earlyoom
600
605
self .verbose = verbose
601
606
self .prev_process_results = self ._load_previous_per_process_results (temp_output_file )
602
607
self .current_process_results = {}
@@ -2214,6 +2219,41 @@ class NodeChecker():
2214
2219
metric .add_value (0 if has_errors else 1 )
2215
2220
return e .fill_and_return_entry (errors , has_error = has_errors , metrics = [metric ])
2216
2221
2222
def count_earlyoom_kills(self):
    """Report how many kill signals earlyoom logged during the last hour.

    Runs a shell pipeline through ``self._check_output`` that reads the earlyoom
    journal for the past hour and counts the lines matching
    "sending SIGTERM/SIGKILL to process".

    Returns:
        A metric entry carrying the count both as its detail and as the
        YB_EARLYOOM_KILLS_COUNT metric. Output that is not a plain number is
        reported as 0 kills. If anything raises, an error entry holding the
        exception text is returned instead.
    """
    entry = self._new_metric_entry("Earlyoom kills count")
    try:
        # NOTE(review): "--user -u" reads the per-user journal; this presumably matches
        # how earlyoom is installed on the node - confirm against the provisioning code.
        journal_cmd = ('journalctl --since "1 hour ago" --user -u earlyoom | '
                       'grep -E "sending SIG(TERM|KILL) to process" | wc -l')
        raw_count = self._check_output(journal_cmd).strip()
        # Anything other than a digit string is treated as "no kills".
        kills = int(raw_count) if raw_count.isdigit() else 0
        kills_metric = Metric.from_definition(YB_EARLYOOM_KILLS_COUNT).add_value(kills)
        return entry.fill_and_return_entry([kills], has_error=False, metrics=[kills_metric])
    except Exception as ex:
        return entry.fill_and_return_entry([str(ex)], has_error=True)
2237
+
2238
def check_earlyoom(self, enable_earlyoom):
    """Check that the earlyoom process state matches what is configured for the node.

    Args:
        enable_earlyoom: truthy when earlyoom is expected to be running.

    Returns:
        The "Earlyoom service status" entry, filled as:
        - error when earlyoom is expected but no process is found;
        - warning when it is running but holds no locked memory;
        - error when it is running although it should be disabled;
        - a clean (no error) entry otherwise.
    """
    e = self._new_entry("Earlyoom service status")
    pid = self.get_process_pid_by_name(EARLYOOM)

    if not enable_earlyoom:
        if pid is not None:
            return e.fill_and_return_entry(
                ["Found running process: {}, should be disabled".format(pid)], has_error=True)
        return e.fill_and_return_entry([], has_error=False)

    if pid is None:
        return e.fill_and_return_entry(["No process running"], has_error=True)

    cmd = "cat /proc/{}/status | grep VmLck".format(pid)
    output = self._check_output(cmd)
    # /proc/<pid>/status normally reports a "VmLck: <n> kB" line even when nothing is
    # locked (n == 0), so the presence of the line alone does not prove mlock worked.
    # Treat a missing line and a zero value the same way.
    locked = re.search(r"VmLck:\s*(\d+)", output)
    if locked is None or int(locked.group(1)) == 0:
        return e.fill_and_return_warning_entry(
            ["Unable to do mlock (required for better performance)"])
    return e.fill_and_return_entry([], has_error=False)
2256
+
2217
2257
def verbose_log (self , message ):
2218
2258
if self .verbose :
2219
2259
logging .info (message )
@@ -2501,6 +2541,7 @@ class NodeInfo:
2501
2541
self .is_ybc_enabled = data ["enableYbc" ]
2502
2542
self .ybc_port = data ["ybcPort" ]
2503
2543
self .otel_enabled = data ["otelCollectorEnabled" ]
2544
+ self .enable_earlyoom = data ["earlyoomEnabled" ]
2504
2545
self .clockSyncServiceRequired = data .get ("clockSyncServiceRequired" , True )
2505
2546
self .clockbound_enabled = data ["clockboundEnabled" ]
2506
2547
@@ -2551,10 +2592,13 @@ def main():
2551
2592
n .master_http_port , n .tserver_http_port , n .ysql_server_http_port , n .yb_version ,
2552
2593
n .is_ybc_enabled , n .ybc_port , n .time_drift_wrn_threshold , n .time_drift_err_threshold ,
2553
2594
n .otel_enabled , args .temp_output_file , args .ddl_atomicity_check , args .master_leader_url ,
2554
- n .master_rpc_port , n .tserver_rpc_port , args .verbose , n .clockSyncServiceRequired )
2595
+ n .master_rpc_port , n .tserver_rpc_port , args .verbose , n .clockSyncServiceRequired ,
2596
+ n .enable_earlyoom )
2555
2597
2556
2598
coordinator .add_precheck (checker , "check_openssl_availability" )
2557
-
2599
+ coordinator .add_check (checker , "check_earlyoom" , n .enable_earlyoom )
2600
+ if n .enable_earlyoom :
2601
+ coordinator .add_check (checker , "count_earlyoom_kills" )
2558
2602
coordinator .add_check (checker , "check_node_metrics_collection" )
2559
2603
coordinator .add_check (checker , "check_disk_utilization" )
2560
2604
coordinator .add_check (checker , "check_for_core_files" )
0 commit comments