Skip to content

Commit b78927b

Browse files
authored
Replace stuck on removal metrics with new stuck_on_removal_current (#978)
* Replace existing stuck on removal metrics
* Use new current stuck metric
1 parent 5c235e3 commit b78927b

File tree

5 files changed

+45
-108
lines changed

5 files changed

+45
-108
lines changed

controllers/disruption_controller.go

Lines changed: 12 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -24,16 +24,6 @@ import (
2424
"strings"
2525
"time"
2626

27-
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
28-
"github.com/DataDog/chaos-controller/cloudservice"
29-
cLog "github.com/DataDog/chaos-controller/log"
30-
"github.com/DataDog/chaos-controller/o11y/metrics"
31-
"github.com/DataDog/chaos-controller/o11y/tracer"
32-
"github.com/DataDog/chaos-controller/safemode"
33-
"github.com/DataDog/chaos-controller/services"
34-
"github.com/DataDog/chaos-controller/targetselector"
35-
chaostypes "github.com/DataDog/chaos-controller/types"
36-
"github.com/DataDog/chaos-controller/watchers"
3727
"go.opentelemetry.io/otel"
3828
"go.opentelemetry.io/otel/attribute"
3929
"go.opentelemetry.io/otel/trace"
@@ -56,6 +46,17 @@ import (
5646
"sigs.k8s.io/controller-runtime/pkg/predicate"
5747
"sigs.k8s.io/controller-runtime/pkg/reconcile"
5848
"sigs.k8s.io/controller-runtime/pkg/source"
49+
50+
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
51+
"github.com/DataDog/chaos-controller/cloudservice"
52+
cLog "github.com/DataDog/chaos-controller/log"
53+
"github.com/DataDog/chaos-controller/o11y/metrics"
54+
"github.com/DataDog/chaos-controller/o11y/tracer"
55+
"github.com/DataDog/chaos-controller/safemode"
56+
"github.com/DataDog/chaos-controller/services"
57+
"github.com/DataDog/chaos-controller/targetselector"
58+
chaostypes "github.com/DataDog/chaos-controller/types"
59+
"github.com/DataDog/chaos-controller/watchers"
5960
)
6061

6162
// DisruptionReconciler reconciles a Disruption object
@@ -1054,7 +1055,7 @@ func (r *DisruptionReconciler) ReportMetrics(ctx context.Context) {
10541055
if d.Status.IsStuckOnRemoval {
10551056
stuckOnRemoval++
10561057

1057-
if err := r.MetricsSink.MetricStuckOnRemoval([]string{"disruptionName:" + d.Name, "namespace:" + d.Namespace}); err != nil {
1058+
if err := r.MetricsSink.MetricStuckOnRemovalCurrent(1, []string{"disruptionName:" + d.Name, "disruptionNamespace:" + d.Namespace}); err != nil {
10581059
r.BaseLog.Errorw("error sending stuck_on_removal metric", "error", err)
10591060
}
10601061
}
@@ -1071,11 +1072,6 @@ func (r *DisruptionReconciler) ReportMetrics(ctx context.Context) {
10711072
namespaces[d.Namespace]++
10721073
}
10731074

1074-
// send metrics
1075-
if err := r.MetricsSink.MetricStuckOnRemovalGauge(float64(stuckOnRemoval)); err != nil {
1076-
r.BaseLog.Errorw("error sending stuck_on_removal_total metric", "error", err)
1077-
}
1078-
10791075
if len(namespaces) > 0 {
10801076
for namespace, count := range namespaces {
10811077
if err := r.MetricsSink.MetricDisruptionsGauge(float64(count), []string{fmt.Sprintf("namespace:%s", namespace)}); err != nil {

o11y/metrics/datadog/datadog.go

Lines changed: 6 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,10 @@ import (
1010
"os"
1111
"time"
1212

13+
"github.com/DataDog/datadog-go/statsd"
14+
1315
"github.com/DataDog/chaos-controller/o11y/metrics/types"
1416
chaostypes "github.com/DataDog/chaos-controller/types"
15-
"github.com/DataDog/datadog-go/statsd"
1617
)
1718

1819
const (
@@ -163,17 +164,11 @@ func (d Sink) MetricPodsCreated(target, instanceName, namespace string, succeed
163164
return d.metricWithStatus(d.prefix+"pods.created", tags)
164165
}
165166

166-
// MetricStuckOnRemoval is emitted once per minute per disruption, if that disruption is "stuck on removal", i.e.,
167+
// MetricStuckOnRemovalCurrent is emitted once per minute counting the number of disruptions _per namespace_
168+
// that are "stuck on removal", i.e.,
167169
// we have attempted to clean and delete the disruption, but that has not worked, and a human needs to intervene.
168-
func (d Sink) MetricStuckOnRemoval(tags []string) error {
169-
return d.metricWithStatus(d.prefix+"disruptions.stuck_on_removal", tags)
170-
}
171-
172-
// MetricStuckOnRemovalGauge is emitted once per minute counting the total number of disruptions that are
173-
// "stuck on removal", i.e., we have attempted to clean and delete the disruption, but that has not worked,
174-
// and a human needs to intervene.
175-
func (d Sink) MetricStuckOnRemovalGauge(gauge float64) error {
176-
return d.client.Gauge(d.prefix+"disruptions.stuck_on_removal_total", gauge, []string{}, 1)
170+
func (d Sink) MetricStuckOnRemovalCurrent(gauge float64, tags []string) error {
171+
return d.client.Gauge(d.prefix+"disruptions.stuck_on_removal_current", gauge, tags, 1)
177172
}
178173

179174
// MetricDisruptionsGauge is emitted once per minute counting the total number of ongoing disruptions per namespace,

o11y/metrics/metrics.go

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,12 @@ import (
99
"fmt"
1010
"time"
1111

12+
"go.uber.org/zap"
13+
1214
"github.com/DataDog/chaos-controller/o11y/metrics/datadog"
1315
"github.com/DataDog/chaos-controller/o11y/metrics/noop"
1416
"github.com/DataDog/chaos-controller/o11y/metrics/types"
1517
chaostypes "github.com/DataDog/chaos-controller/types"
16-
"go.uber.org/zap"
1718
)
1819

1920
// Sink describes a metric sink
@@ -56,13 +57,10 @@ type Sink interface {
5657
// MetricDisruptionOngoingDuration indicates the duration between a Disruption's creation timestamp, and the current time.
5758
// This is emitted approximately every one minute
5859
MetricDisruptionOngoingDuration(duration time.Duration, tags []string) error
59-
// MetricStuckOnRemoval is emitted once per minute per disruption, if that disruption is "stuck on removal", i.e.,
60+
// MetricStuckOnRemovalCurrent is emitted once per minute counting the number of disruptions _per namespace_
61+
// that are "stuck on removal", i.e.,
6062
// we have attempted to clean and delete the disruption, but that has not worked, and a human needs to intervene.
61-
MetricStuckOnRemoval(tags []string) error
62-
// MetricStuckOnRemovalGauge is emitted once per minute counting the total number of disruptions that are
63-
// "stuck on removal", i.e., we have attempted to clean and delete the disruption, but that has not worked,
64-
// and a human needs to intervene.
65-
MetricStuckOnRemovalGauge(gauge float64) error
63+
MetricStuckOnRemovalCurrent(gauge float64, tags []string) error
6664
// MetricDisruptionsGauge is emitted once per minute counting the total number of ongoing disruptions per namespace,
6765
// or if we fail to determine the namespaced metrics, simply the total number of disruptions found
6866
MetricDisruptionsGauge(gauge float64, tags []string) error

o11y/metrics/noop/noop.go

Lines changed: 6 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,10 @@ import (
99
"fmt"
1010
"time"
1111

12+
"go.uber.org/zap"
13+
1214
"github.com/DataDog/chaos-controller/o11y/metrics/types"
1315
chaostypes "github.com/DataDog/chaos-controller/types"
14-
"go.uber.org/zap"
1516
)
1617

1718
// Sink describes a no-op sink
@@ -128,19 +129,11 @@ func (n Sink) MetricPodsCreated(target, instanceName, namespace string, succeed
128129
return nil
129130
}
130131

131-
// MetricStuckOnRemoval is emitted once per minute per disruption, if that disruption is "stuck on removal", i.e.,
132+
// MetricStuckOnRemovalCurrent is emitted once per minute counting the number of disruptions _per namespace_
133+
// that are "stuck on removal", i.e.,
132134
// we have attempted to clean and delete the disruption, but that has not worked, and a human needs to intervene.
133-
func (n Sink) MetricStuckOnRemoval(tags []string) error {
134-
fmt.Println("NOOP: MetricStuckOnRemoval +1")
135-
136-
return nil
137-
}
138-
139-
// MetricStuckOnRemovalGauge is emitted once per minute counting the total number of disruptions that are
140-
// "stuck on removal", i.e., we have attempted to clean and delete the disruption, but that has not worked,
141-
// and a human needs to intervene.
142-
func (n Sink) MetricStuckOnRemovalGauge(gauge float64) error {
143-
n.log.Debugf("NOOP: MetricStuckOnRemovalGauge %f\n", gauge)
135+
func (n Sink) MetricStuckOnRemovalCurrent(gauge float64, tags []string) error {
136+
n.log.Debugf("NOOP: MetricStuckOnRemovalCurrent %f %s\n", gauge, tags)
144137

145138
return nil
146139
}

o11y/metrics/sink_mock.go

Lines changed: 16 additions & 61 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)