Skip to content

Commit c374649

Browse files
ViktorZiegBugra KocabaybugrakocabaynosovkViktorZiegTI8M
authored
feat(alerting): Implement alert reminders (#1138)
* feat(alerting): add reminder-interval feature which allows setting an interval to run alert consecutively * feat(test): add tests for reminder-interval feature * feat(docs): modify documentation for reminder-interval feature * chore: change "due" to "TRIGGERED" for easier log look through * chore: update "reminder-interval" to "repeat-interval" * chore: update reminder-interval to repeat-interval * chore: adapt repeat interval feature after merge * chore: adapt repeat interval feature after merge * RepeatInterval => MinimumRepeatInterval * fix merge issues (cherry picked from commit 9b2161556bddf01d385f97dafac2515857190ae5) * rename and move MinimumRepeatInterval * move MinimumRepeatInterval (again) --------- Co-authored-by: Bugra Kocabay <[email protected]> Co-authored-by: Bugra Kocabay <[email protected]> Co-authored-by: Konstantin Nosov <[email protected]> Co-authored-by: Viktor Ziegler <[email protected]> Co-authored-by: TwiN <[email protected]>
1 parent f6e9387 commit c374649

File tree

6 files changed

+83
-16
lines changed

6 files changed

+83
-16
lines changed

README.md

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -553,16 +553,17 @@ individual endpoints with configurable descriptions and thresholds.
553553

554554
Alerts are configured at the endpoint level like so:
555555

556-
| Parameter | Description | Default |
557-
|:-----------------------------|:-------------------------------------------------------------------------------|:--------------|
558-
| `alerts` | List of all alerts for a given endpoint. | `[]` |
559-
| `alerts[].type` | Type of alert. <br />See table below for all valid types. | Required `""` |
560-
| `alerts[].enabled` | Whether to enable the alert. | `true` |
561-
| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
562-
| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
563-
| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
564-
| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
565-
| `alerts[].provider-override` | Alerting provider configuration override for the given alert type | `{}` |
556+
| Parameter | Description | Default |
557+
|:-------------------------------------|:-------------------------------------------------------------------------------|:--------------|
558+
| `alerts` | List of all alerts for a given endpoint. | `[]` |
559+
| `alerts[].type` | Type of alert. <br />See table below for all valid types. | Required `""` |
560+
| `alerts[].enabled` | Whether to enable the alert. | `true` |
561+
| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
562+
| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
563+
| `alerts[].minimum-reminder-interval` | Minimum interval between reminders for an alert that is still triggered (e.g. `30m`). Reminders are only sent when this is greater than `0`. | `""` |
564+
| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
565+
| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
566+
| `alerts[].provider-override` | Alerting provider configuration override for the given alert type | `{}` |
566567

567568
Here's an example of what an alert configuration might look like at the endpoint level:
568569
```yaml

alerting/alert/alert.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"strconv"
88
"strings"
9+
"time"
910

1011
"github.com/TwiN/logr"
1112
"gopkg.in/yaml.v3"
@@ -35,6 +36,9 @@ type Alert struct {
3536
// SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved
3637
SuccessThreshold int `yaml:"success-threshold"`
3738

39+
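// MinimumReminderInterval is the minimum interval between reminders sent while the alert remains triggered.
// Reminders are only sent when this value is greater than 0.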
40+
MinimumReminderInterval time.Duration `yaml:"minimum-reminder-interval,omitempty"`
41+
3842
// Description of the alert. Will be included in the alert sent.
3943
//
4044
// This is a pointer, because it is populated by YAML and we need to know whether it was explicitly set to a value

alerting/provider/ilert/ilert_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,21 +174,21 @@ func TestAlertProvider_BuildRequestBody(t *testing.T) {
174174
Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}},
175175
Alert: alert.Alert{Description: &firstDescription, SuccessThreshold: 3, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved},
176176
Resolved: false,
177-
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":3,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`,
177+
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":3,"MinimumReminderInterval":0,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`,
178178
},
179179
{
180180
Name: "resolved",
181181
Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}},
182182
Alert: alert.Alert{Description: &firstDescription, SuccessThreshold: 4, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved},
183183
Resolved: true,
184-
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":4,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"resolved","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":true},{"condition":"[STATUS] == 200","success":true}],"url":""}`,
184+
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":4,"MinimumReminderInterval":0,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"resolved","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":true},{"condition":"[STATUS] == 200","success":true}],"url":""}`,
185185
},
186186
{
187187
Name: "group-override",
188188
Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}, Overrides: []Override{{Group: "g", Config: Config{IntegrationKey: "different-integration-key"}}}},
189189
Alert: alert.Alert{Description: &secondDescription, SuccessThreshold: 5, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved},
190190
Resolved: false,
191-
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":5,"Description":"description-2","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-2","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`,
191+
ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":5,"MinimumReminderInterval":0,"Description":"description-2","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-2","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`,
192192
},
193193
}
194194

config/endpoint/endpoint.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ type Endpoint struct {
131131

132132
// NumberOfSuccessesInARow is the number of successful evaluations in a row
133133
NumberOfSuccessesInARow int `yaml:"-"`
134+
135+
// LastReminderSent is the time at which the last alert notification (initial alert or reminder) was successfully sent for this endpoint.
136+
LastReminderSent time.Time `yaml:"-"`
134137
}
135138

136139
// IsEnabled returns whether the endpoint is enabled or not

watchdog/alerting.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ package watchdog
22

33
import (
44
"errors"
5+
"log"
56
"os"
7+
"time"
68

79
"github.com/TwiN/gatus/v5/alerting"
810
"github.com/TwiN/gatus/v5/config/endpoint"
@@ -30,14 +32,24 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
3032
if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > ep.NumberOfFailuresInARow {
3133
continue
3234
}
33-
if endpointAlert.Triggered {
34-
logr.Debugf("[watchdog.handleAlertsToTrigger] Alert for endpoint with key=%s with description='%s' has already been TRIGGERED, skipping", ep.Key(), endpointAlert.GetDescription())
35+
// Determine if an initial alert should be sent
36+
sendInitialAlert := !endpointAlert.Triggered
37+
// Determine if a reminder should be sent
38+
sendReminder := endpointAlert.Triggered && endpointAlert.MinimumReminderInterval > 0 && time.Since(ep.LastReminderSent) >= endpointAlert.MinimumReminderInterval
39+
// If neither initial alert nor reminder needs to be sent, skip to the next alert
40+
if !sendInitialAlert && !sendReminder {
41+
logr.Debugf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", ep.Name, endpointAlert.GetDescription())
3542
continue
3643
}
3744
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
3845
if alertProvider != nil {
3946
logr.Infof("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint with key=%s with description='%s' has been TRIGGERED", endpointAlert.Type, ep.Key(), endpointAlert.GetDescription())
4047
var err error
48+
alertType := "reminder"
49+
if sendInitialAlert {
50+
alertType = "initial"
51+
}
52+
log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
4153
if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
4254
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" {
4355
err = errors.New("error")
@@ -48,7 +60,11 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
4860
if err != nil {
4961
logr.Errorf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint with key=%s: %s", ep.Key(), err.Error())
5062
} else {
51-
endpointAlert.Triggered = true
63+
// Mark initial alert as triggered and update last reminder time
64+
if sendInitialAlert {
65+
endpointAlert.Triggered = true
66+
}
67+
ep.LastReminderSent = time.Now()
5268
if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil {
5369
logr.Errorf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
5470
}

watchdog/alerting_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package watchdog
33
import (
44
"os"
55
"testing"
6+
"time"
67

78
"github.com/TwiN/gatus/v5/alerting"
89
"github.com/TwiN/gatus/v5/alerting/alert"
@@ -517,6 +518,48 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
517518
verify(t, ep, 0, 2, false, "")
518519
}
519520

521+
func TestHandleAlertingWithMinimumReminderInterval(t *testing.T) {
522+
_ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
523+
defer os.Clearenv()
524+
525+
cfg := &config.Config{
526+
Alerting: &alerting.Config{
527+
Custom: &custom.AlertProvider{
528+
DefaultConfig: custom.Config{
529+
URL: "https://twin.sh/health",
530+
Method: "GET",
531+
},
532+
},
533+
},
534+
}
535+
enabled := true
536+
ep := &endpoint.Endpoint{
537+
URL: "https://example.com",
538+
Alerts: []*alert.Alert{
539+
{
540+
Type: alert.TypeCustom,
541+
Enabled: &enabled,
542+
FailureThreshold: 2,
543+
SuccessThreshold: 3,
544+
SendOnResolved: &enabled,
545+
Triggered: false,
546+
MinimumReminderInterval: 1 * time.Second,
547+
},
548+
},
549+
}
550+
551+
verify(t, ep, 0, 0, false, "The alert shouldn't start triggered")
552+
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting)
553+
verify(t, ep, 1, 0, false, "The alert shouldn't have triggered")
554+
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting)
555+
verify(t, ep, 2, 0, true, "The alert should've triggered")
556+
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting)
557+
verify(t, ep, 3, 0, true, "The alert should still be triggered")
558+
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting)
559+
verify(t, ep, 4, 0, true, "The alert should still be triggered")
560+
HandleAlerting(ep, &endpoint.Result{Success: true}, cfg.Alerting)
561+
}
562+
520563
func verify(t *testing.T, ep *endpoint.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
521564
if ep.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
522565
t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, ep.NumberOfFailuresInARow)

0 commit comments

Comments
 (0)