Skip to content

Commit c7adb73

Browse files
committed
[PLAT-16454] Configuring HA alert frequency based on config
Summary: The HA alert for a standby being down defaults to a 15-minute threshold and does not change unless the user configures it. This diff updates the threshold value according to the HA replication frequency, which is itself a runtime config value that we update. We set the threshold to twice the replication frequency or 15 minutes, whichever is greater. Test Plan: Configure HA, change the HA replication frequency, and ensure the alert threshold also changes. Reviewers: amalyshev Reviewed By: amalyshev Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D41363
1 parent 1f8781d commit c7adb73

File tree

10 files changed

+64
-17
lines changed

10 files changed

+64
-17
lines changed

managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationService.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
package com.yugabyte.yw.common.alerts;
1111

1212
import static com.yugabyte.yw.common.Util.doubleToString;
13+
import static com.yugabyte.yw.common.ha.PlatformReplicationHelper.REPLICATION_FREQUENCY_KEY;
1314
import static com.yugabyte.yw.models.AlertConfiguration.createQueryByFilter;
1415
import static com.yugabyte.yw.models.helpers.CommonUtils.nowWithoutMillis;
1516
import static com.yugabyte.yw.models.helpers.CommonUtils.performPagedQuery;
@@ -374,6 +375,22 @@ private AlertConfiguration validate(
374375
+ doubleToString(templateDescription.getThresholdMaxValue()))
375376
.throwError();
376377
}
378+
if (configuration.getTemplate() == AlertTemplate.HA_STANDBY_SYNC) {
379+
long replicationFrequency =
380+
runtimeConfigFactory
381+
.globalRuntimeConf()
382+
.getDuration(REPLICATION_FREQUENCY_KEY)
383+
.toMillis();
384+
// threshold is in minutes; replication frequency (RF) is in milliseconds -- compare both in seconds
385+
if ((threshold.getThreshold() * 60.0) < (2.0 * replicationFrequency / 1000.0)) {
386+
beanValidator
387+
.error()
388+
.forField(
389+
"HA Standby Sync[" + severity.name() + "].threshold",
390+
"can't be less than twice replication frequency")
391+
.throwError();
392+
}
393+
}
377394
});
378395
if (before != null) {
379396
if (!configuration.getCustomerUUID().equals(before.getCustomerUUID())) {

managed/src/main/java/com/yugabyte/yw/common/ha/PlatformReplicationHelper.java

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,25 @@
1717
import com.typesafe.config.ConfigException;
1818
import com.typesafe.config.ConfigValue;
1919
import com.typesafe.config.ConfigValueFactory;
20+
import com.yugabyte.yw.common.AlertTemplate;
2021
import com.yugabyte.yw.common.AppConfigHelper;
2122
import com.yugabyte.yw.common.PrometheusConfigHelper;
2223
import com.yugabyte.yw.common.PrometheusConfigManager;
2324
import com.yugabyte.yw.common.ShellProcessHandler;
2425
import com.yugabyte.yw.common.ShellResponse;
2526
import com.yugabyte.yw.common.SwamperHelper;
27+
import com.yugabyte.yw.common.alerts.AlertConfigurationService;
2628
import com.yugabyte.yw.common.config.GlobalConfKeys;
2729
import com.yugabyte.yw.common.config.RuntimeConfGetter;
2830
import com.yugabyte.yw.common.config.impl.SettableRuntimeConfigFactory;
2931
import com.yugabyte.yw.common.ha.PlatformReplicationManager.PlatformBackupParams;
3032
import com.yugabyte.yw.common.utils.FileUtils;
3133
import com.yugabyte.yw.metrics.MetricUrlProvider;
34+
import com.yugabyte.yw.models.AlertConfiguration;
35+
import com.yugabyte.yw.models.AlertConfigurationThreshold;
3236
import com.yugabyte.yw.models.HighAvailabilityConfig;
3337
import com.yugabyte.yw.models.PlatformInstance;
38+
import com.yugabyte.yw.models.filters.AlertConfigurationFilter;
3439
import java.io.BufferedWriter;
3540
import java.io.File;
3641
import java.io.FileWriter;
@@ -47,6 +52,7 @@
4752
import java.util.List;
4853
import java.util.Map;
4954
import java.util.Optional;
55+
import java.util.stream.Collectors;
5056
import lombok.extern.slf4j.Slf4j;
5157
import org.apache.pekko.actor.Cancellable;
5258
import org.apache.velocity.Template;
@@ -69,7 +75,7 @@ public class PlatformReplicationHelper {
6975
private static final String REPLICATION_SCHEDULE_ENABLED_KEY =
7076
"yb.ha.replication_schedule_enabled";
7177
private static final String NUM_BACKUP_RETENTION_KEY = "yb.ha.num_backup_retention";
72-
static final String REPLICATION_FREQUENCY_KEY = "yb.ha.replication_frequency";
78+
public static final String REPLICATION_FREQUENCY_KEY = "yb.ha.replication_frequency";
7379
static final String DB_USERNAME_CONFIG_KEY = "db.default.username";
7480
static final String DB_PASSWORD_CONFIG_KEY = "db.default.password";
7581
static final String DB_HOST_CONFIG_KEY = "db.default.host";
@@ -92,6 +98,8 @@ public class PlatformReplicationHelper {
9298

9399
private final PrometheusConfigManager prometheusConfigManager;
94100

101+
private final AlertConfigurationService alertConfigurationService;
102+
95103
@VisibleForTesting ShellProcessHandler shellProcessHandler;
96104

97105
@Inject
@@ -102,14 +110,16 @@ public PlatformReplicationHelper(
102110
ShellProcessHandler shellProcessHandler,
103111
MetricUrlProvider metricUrlProvider,
104112
PrometheusConfigHelper prometheusConfigHelper,
105-
PrometheusConfigManager prometheusConfigManager) {
113+
PrometheusConfigManager prometheusConfigManager,
114+
AlertConfigurationService alertConfigurationService) {
106115
this.confGetter = confGetter;
107116
this.runtimeConfigFactory = runtimeConfigFactory;
108117
this.remoteClientFactory = remoteClientFactory;
109118
this.shellProcessHandler = shellProcessHandler;
110119
this.metricUrlProvider = metricUrlProvider;
111120
this.prometheusConfigHelper = prometheusConfigHelper;
112121
this.prometheusConfigManager = prometheusConfigManager;
122+
this.alertConfigurationService = alertConfigurationService;
113123
}
114124

115125
Path getBackupDir() {
@@ -203,6 +213,26 @@ public void setReplicationFrequency(Duration duration) {
203213
runtimeConfigFactory
204214
.globalRuntimeConf()
205215
.setValue(REPLICATION_FREQUENCY_KEY, String.format("%d ms", duration.toMillis()));
216+
List<AlertConfiguration> haStandbyAlertConfigs =
217+
alertConfigurationService.list(
218+
AlertConfigurationFilter.builder().template(AlertTemplate.HA_STANDBY_SYNC).build());
219+
haStandbyAlertConfigs.forEach(
220+
haStandbyAlertConfig -> {
221+
haStandbyAlertConfig.setThresholds(
222+
haStandbyAlertConfig.getThresholds().entrySet().stream()
223+
.collect(
224+
Collectors.toMap(
225+
Map.Entry::getKey,
226+
e ->
227+
new AlertConfigurationThreshold()
228+
.setCondition(e.getValue().getCondition())
229+
.setThreshold(
230+
Math.max(
231+
// Convert to minutes
232+
2 * duration.toMillis() / 1000.0 / 60.0,
233+
e.getValue().getThreshold())))));
234+
alertConfigurationService.save(haStandbyAlertConfig);
235+
});
206236
}
207237

208238
JsonNode getBackupInfoJson(long frequency, boolean isRunning) {

managed/src/main/resources/alert/alert_templates.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,15 @@ templates:
8787
createForNewCustomer: true
8888
defaultThresholdMap:
8989
WARNING:
90-
threshold: 900.0
90+
threshold: 15
9191
targetType: PLATFORM
9292
defaultThresholdCondition: GREATER_THAN
93-
defaultThresholdUnit: SECOND
93+
defaultThresholdUnit: MINUTE
9494
thresholdReadOnly: false
95-
thresholdUnitName: seconds
95+
thresholdUnitName: min
9696
labels:
9797
affected_instance_addrs: >-
98-
{{ range $index, $element := query "count by (instance_address) (time() - last_over_time(yba_ha_last_backup_seconds[1m]) > {{ query_threshold }}) > 0"}}
98+
{{ range $index, $element := query "count by (instance_address) (((time() - last_over_time(yba_ha_last_backup_seconds[1m])) / 60) > {{ query_threshold }}) > 0"}}
9999
{{ if $index }},{{ end }}{{ $element.Labels.instance_address }}{{ end }}
100100
annotations:
101101
summary: >-
@@ -1293,15 +1293,15 @@ templates:
12931293
thresholdUnitName: ms
12941294
labels:
12951295
affected_node_names: >-
1296-
{{ range $index, $element := query
1296+
{{ range $index, $element := query
12971297
"max by (universe_uuid, node_name)
1298-
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
1298+
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
12991299
{{ query_condition }} {{ query_threshold }})" }}
13001300
{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
13011301
affected_namespace_ids: >-
1302-
{{ range $index, $element := query
1302+
{{ range $index, $element := query
13031303
"max by (universe_uuid, namespace_id)
1304-
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
1304+
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
13051305
{{ query_condition }} {{ query_threshold }})" }}
13061306
{{if $index}},{{end}}{{ $element.Labels.namespace_id }}{{ end }}
13071307
annotations:

managed/src/main/resources/reference.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,7 @@ yb {
768768
num_backup_retention = 10
769769
prometheus_config_dir = "/prometheus_configs"
770770
replication_schedule_enabled = false
771-
replication_frequency = 30 minutes
771+
replication_frequency = 1 minutes
772772
# 0 - never shutdown
773773
# 1 - only shutdown promoted instance
774774
# 2 - shutdown both promoted and demoted instance

managed/src/test/resources/dev.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ yb.docker.release = ""
107107
yb.grafana.accessKey = changeme
108108
yb.ha.num_backup_retention = 10
109109
yb.ha.prometheus_config_dir = "/etc/prometheus"
110-
yb.ha.replication_frequency = "30 minutes"
110+
yb.ha.replication_frequency = "1 minutes"
111111
yb.ha.replication_schedule_enabled = false
112112
yb.health.check_interval_ms = 300000
113113
yb.health.default_email = "RESOLVED_YB_ALERTS_EMAIL"

managed/src/test/resources/helm.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ yb.docker.network = bridge
108108
yb.docker.release = "/opt/yugabyte/release"
109109
yb.ha.num_backup_retention = 10
110110
yb.ha.prometheus_config_dir = "/prometheus_configs"
111-
yb.ha.replication_frequency = "30 minutes"
111+
yb.ha.replication_frequency = "1 minutes"
112112
yb.ha.replication_schedule_enabled = false
113113
yb.health.check_interval_ms = 300000
114114
yb.health.default_email = "[email protected]"

managed/src/test/resources/replicated.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ yb.docker.network = bridge
106106
yb.docker.release = "/opt/yugabyte/release"
107107
yb.ha.num_backup_retention = 10
108108
yb.ha.prometheus_config_dir = "/prometheus_configs"
109-
yb.ha.replication_frequency = "30 minutes"
109+
yb.ha.replication_frequency = "1 minutes"
110110
yb.ha.replication_schedule_enabled = false
111111
yb.health.check_interval_ms = 300000
112112
yb.health.default_email = "[email protected]"

managed/src/test/resources/test.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ yb.aws.default_volume_count = 1
7878
yb.aws.default_volume_size_gb = 250
7979
yb.ha.num_backup_retention = 10
8080
yb.ha.prometheus_config_dir = "/prometheus_configs"
81-
yb.ha.replication_frequency = "30 minutes"
81+
yb.ha.replication_frequency = "1 minutes"
8282
yb.ha.replication_schedule_enabled = false
8383
yb.health.default_email = ""
8484
yb.metrics.host = localhost

managed/src/test/resources/yugabundle.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ yb.aws.default_volume_count = 1
7575
yb.aws.default_volume_size_gb = 250
7676
yb.ha.num_backup_retention = 10
7777
yb.ha.prometheus_config_dir = "/prometheus_configs"
78-
yb.ha.replication_frequency = "30 minutes"
78+
yb.ha.replication_frequency = "1 minutes"
7979
yb.ha.replication_schedule_enabled = false
8080
yb.health.default_email = "RESOLVED_YB_ALERTS_EMAIL"
8181
yb.metrics.host = localhost

managed/src/test/resources/yugabyted.expected.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ yb.aws.default_volume_count = 1
8383
yb.aws.default_volume_size_gb = 250
8484
yb.ha.num_backup_retention = 10
8585
yb.ha.prometheus_config_dir = "/prometheus_configs"
86-
yb.ha.replication_frequency = "30 minutes"
86+
yb.ha.replication_frequency = "1 minutes"
8787
yb.ha.replication_schedule_enabled = false
8888
yb.health.default_email = ""
8989
yb.metrics.host = localhost

0 commit comments

Comments
 (0)