Skip to content

Commit d9ffda4

Browse files
committed
[PLAT-16591] YBA Health checks for earlyoom
Summary: Added a health check to verify that earlyoom is working as expected. If earlyoom is enabled, we also check whether it is able to mlock its memory (and show a warning if it cannot). There is also a metric that counts the number of kills performed by earlyoom during the last hour. Test Plan: 1) Create a universe with earlyoom enabled but without mlock ability. 2) Verify that we have a warning about mlock. 3) Kill earlyoom and verify that we have an error. 4) Check the number-of-kills metric. 5) Create a universe without earlyoom and verify that there are no alerts. Reviewers: anijhawan Reviewed By: anijhawan Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D43339
1 parent 8a8f512 commit d9ffda4

File tree

2 files changed

+54
-4
lines changed

2 files changed

+54
-4
lines changed

managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java

+6
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,10 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
735735
Map<UUID, NodeInstance> nodeInstanceMap =
736736
NodeInstance.listByUuids(nodeUuids).stream()
737737
.collect(Collectors.toMap(NodeInstance::getNodeUuid, Function.identity()));
738+
boolean earlyoomEnabled =
739+
details.additionalServicesStateData != null
740+
&& details.additionalServicesStateData.getEarlyoomConfig() != null
741+
&& details.additionalServicesStateData.getEarlyoomConfig().isEnabled();
738742
for (NodeDetails nodeDetails : sortedDetails) {
739743
NodeInstance nodeInstance = nodeInstanceMap.get(nodeDetails.getNodeUuid());
740744
String nodeIdentifier = StringUtils.EMPTY;
@@ -762,6 +766,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
762766
.setTestYsqlshConnectivity(testYsqlshConnectivity)
763767
.setTestCqlshConnectivity(testCqlshConnectivity)
764768
.setUniverseUuid(params.universe.getUniverseUUID())
769+
.setEarlyoomEnabled(earlyoomEnabled)
765770
.setNodeDetails(nodeDetails);
766771
if (nodeDetails.isMaster) {
767772
nodeInfo
@@ -1308,6 +1313,7 @@ public static class NodeInfo {
13081313
private boolean clockSyncServiceRequired = true;
13091314
private boolean clockboundEnabled = false;
13101315
@JsonIgnore @EqualsAndHashCode.Exclude private NodeDetails nodeDetails;
1316+
private boolean earlyoomEnabled = false;
13111317
}
13121318

13131319
@Data

managed/src/main/resources/health/node_health.py.template

+48-4
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import json
3535
import logging
3636
import os
3737
import re
38-
import shlex
3938
import subprocess
4039
import sys
4140
import time
@@ -65,6 +64,7 @@ OTEL_COLLECTOR_HEALTH_ENDPOINT = "http://localhost:13133/health"
6564
YB_CONTROLLER = "yb-controller-server"
6665
NODE_EXPORTER = "node_exporter"
6766
CONNECTION_POOLING_MANAGER = "odyssey"
67+
EARLYOOM = "earlyoom"
6868
TOTAL = "total"
6969

7070
ALERT_ENHANCEMENTS_RELEASE_BUILD = "2.6.0.0-b0"
@@ -269,6 +269,9 @@ YB_NODE_UNEXPECTED_MASTER_RUNNING = MetricDefinition(
269269
YB_NODE_CLOCKBOUND_STATUS = MetricDefinition(
270270
"yb_node_clockbound_status",
271271
"Check whether clockbound is synchronized")
272+
YB_EARLYOOM_KILLS_COUNT = MetricDefinition(
273+
"yb_earlyoom_kills_count",
274+
"Number of process kills initiated by earlyoom during last hour")
272275

273276
###################################################################################################
274277
# Reporting
@@ -560,7 +563,8 @@ class NodeChecker():
560563
master_http_port, tserver_http_port, ysql_server_http_port, node_version,
561564
is_ybc_enabled, ybc_port, time_drift_wrn_threshold, time_drift_err_threshold,
562565
otel_enabled, temp_output_file, ddl_atomicity_check, master_leader_url,
563-
master_rpc_port, tserver_rpc_port, verbose, clock_service_required):
566+
master_rpc_port, tserver_rpc_port, verbose, clock_service_required,
567+
enable_earlyoom):
564568
self.node = node
565569
self.node_name = node_name
566570
self.node_identifier = node_identifier
@@ -597,6 +601,7 @@ class NodeChecker():
597601
self.temp_output_file = temp_output_file
598602
self.ddl_atomicity_check = ddl_atomicity_check
599603
self.master_leader_url = master_leader_url
604+
self.enable_earlyoom = enable_earlyoom
600605
self.verbose = verbose
601606
self.prev_process_results = self._load_previous_per_process_results(temp_output_file)
602607
self.current_process_results = {}
@@ -2214,6 +2219,41 @@ class NodeChecker():
22142219
metric.add_value(0 if has_errors else 1)
22152220
return e.fill_and_return_entry(errors, has_error=has_errors, metrics=[metric])
22162221

2222+
def count_earlyoom_kills(self):
    """Report how many kill signals earlyoom logged during the last hour.

    Runs a journalctl/grep/wc pipeline over the earlyoom user unit and
    publishes the resulting line count through YB_EARLYOOM_KILLS_COUNT.
    Output that is not a plain non-negative integer is counted as zero.

    Returns whatever the metric entry's fill_and_return_entry() produces:
    a non-error entry carrying the metric on success, or an error entry
    holding the exception text if anything in the pipeline raises.
    """
    entry = self._new_metric_entry("Earlyoom kills count")
    try:
        cmd = ('journalctl --since "1 hour ago" --user -u earlyoom | '
               'grep -E "sending SIG(TERM|KILL) to process" | wc -l')
        raw_count = self._check_output(cmd).strip()
        # Anything that is not purely digits (e.g. an error message from the
        # command runner) is treated as "no kills observed".
        kills = int(raw_count) if raw_count.isdigit() else 0
        kills_metric = Metric.from_definition(YB_EARLYOOM_KILLS_COUNT).add_value(kills)
        return entry.fill_and_return_entry([kills], has_error=False, metrics=[kills_metric])
    except Exception as ex:
        return entry.fill_and_return_entry([str(ex)], has_error=True)
2237+
2238+
def check_earlyoom(self, enable_earlyoom):
    """Verify that the earlyoom process state matches the desired configuration.

    Args:
        enable_earlyoom: True when earlyoom is expected to be running on this
            node, False when it is expected to be absent.

    Returns:
        The entry produced by the "Earlyoom service status" check:
        - error if earlyoom should run but no process is found;
        - error if earlyoom should not run but a process is found;
        - warning if earlyoom runs but has no locked memory (mlock failed);
        - success otherwise.
    """
    e = self._new_entry("Earlyoom service status")
    pid = self.get_process_pid_by_name(EARLYOOM)

    if not enable_earlyoom:
        if pid is not None:
            return e.fill_and_return_entry(
                ["Found running process: {}, should be disabled".format(pid)], has_error=True)
        return e.fill_and_return_entry([], has_error=False)

    if pid is None:
        return e.fill_and_return_entry(["No process running"], has_error=True)

    cmd = "cat /proc/{}/status | grep VmLck".format(pid)
    output = self._check_output(cmd)
    # /proc/<pid>/status reports a "VmLck: <n> kB" line even when nothing is
    # locked, so the mere presence of the field says nothing about mlock.
    # Treat a missing field or a zero size as "memory is not locked".
    locked_kb = re.search(r"VmLck:\s*(\d+)", output)
    if locked_kb is None or int(locked_kb.group(1)) == 0:
        return e.fill_and_return_warning_entry(
            ["Unable to do mlock (required for better performance)"])
    return e.fill_and_return_entry([], has_error=False)
2256+
22172257
def verbose_log(self, message):
22182258
if self.verbose:
22192259
logging.info(message)
@@ -2501,6 +2541,7 @@ class NodeInfo:
25012541
self.is_ybc_enabled = data["enableYbc"]
25022542
self.ybc_port = data["ybcPort"]
25032543
self.otel_enabled = data["otelCollectorEnabled"]
2544+
self.enable_earlyoom = data["earlyoomEnabled"]
25042545
self.clockSyncServiceRequired = data.get("clockSyncServiceRequired", True)
25052546
self.clockbound_enabled = data["clockboundEnabled"]
25062547

@@ -2551,10 +2592,13 @@ def main():
25512592
n.master_http_port, n.tserver_http_port, n.ysql_server_http_port, n.yb_version,
25522593
n.is_ybc_enabled, n.ybc_port, n.time_drift_wrn_threshold, n.time_drift_err_threshold,
25532594
n.otel_enabled, args.temp_output_file, args.ddl_atomicity_check, args.master_leader_url,
2554-
n.master_rpc_port, n.tserver_rpc_port, args.verbose, n.clockSyncServiceRequired)
2595+
n.master_rpc_port, n.tserver_rpc_port, args.verbose, n.clockSyncServiceRequired,
2596+
n.enable_earlyoom)
25552597

25562598
coordinator.add_precheck(checker, "check_openssl_availability")
2557-
2599+
coordinator.add_check(checker, "check_earlyoom", n.enable_earlyoom)
2600+
if n.enable_earlyoom:
2601+
coordinator.add_check(checker, "count_earlyoom_kills")
25582602
coordinator.add_check(checker, "check_node_metrics_collection")
25592603
coordinator.add_check(checker, "check_disk_utilization")
25602604
coordinator.add_check(checker, "check_for_core_files")

0 commit comments

Comments
 (0)