Skip to content

Commit f60eae7

Browse files
authored
Implement context-based logger propagation (#994)
* feat(o11y): standardize logging with Datadog tag conventions Centralized all logging tag keys to use constants from o11y/tags/tags.go, ensuring compliance with Datadog naming conventions (lowercase, underscores). Replaced hardcoded strings across 64 files with typed constants for better maintainability and consistency. Key improvements: - Organized tag constants by functional clusters (generic, events, timing, etc.) - Added 80+ missing tag constants for complete coverage - Removed duplicate constants and mock files - Enhanced observability infrastructure with structured logging Co-Authored-By: Claude <[email protected]> * fix: k8s version * circle-ci: add auto reruns for e2e-tests Jira: CHAOSPLT-1211
1 parent d723b12 commit f60eae7

File tree

81 files changed

+1966
-1251
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

81 files changed

+1966
-1251
lines changed

.circleci/config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ jobs:
211211
export PATH="/home/circleci/go/bin:${PATH}"
212212
make e2e-test GOBIN=/home/circleci/go/bin KUBECTL="minikube kubectl --" E2E_TEST_CLUSTER_NAME="minikube" E2E_TEST_KUBECTL_CONTEXT="minikube"
213213
no_output_timeout: 15m
214+
max_auto_reruns: 3
214215
- run:
215216
name: Save logs
216217
when: on_fail

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ PROTOC_VERSION = 3.17.3
5454
PROTOC_OS ?= osx
5555
PROTOC_ZIP = protoc-${PROTOC_VERSION}-${PROTOC_OS}-x86_64.zip
5656
# you might also want to change ~/lima.yaml k3s version
57-
KUBERNETES_MAJOR_VERSION ?= 1.26
57+
KUBERNETES_MAJOR_VERSION ?= 1.28
5858
KUBERNETES_VERSION ?= v$(KUBERNETES_MAJOR_VERSION).0
5959
KUBEBUILDER_VERSION ?= 3.1.0
6060
USE_VOLUMES ?= false
@@ -70,7 +70,7 @@ GOLANGCI_LINT_INSTALLED_VERSION = $(shell (golangci-lint --version || echo "") |
7070
CONTROLLER_GEN_VERSION = v0.14.0
7171
CONTROLLER_GEN_INSTALLED_VERSION = $(shell (controller-gen --version || echo "") | awk '{ print $$2 }')
7272

73-
MOCKERY_VERSION = 2.53.0
73+
MOCKERY_VERSION = 2.53.5
7474
MOCKERY_ARCH = $(GOARCH)
7575
ifeq (amd64,$(GOARCH))
7676
MOCKERY_ARCH = x86_64

api/v1beta1/disruption_cron_types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ import (
99
"fmt"
1010
"time"
1111

12-
"github.com/DataDog/chaos-controller/log"
13-
1412
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
14+
tagutil "github.com/DataDog/chaos-controller/o11y/tags"
1515
)
1616

1717
func init() {
@@ -119,8 +119,8 @@ func (d *DisruptionCron) IsReadyToRemoveFinalizer(finalizerDelay time.Duration)
119119
// getMetricsTags parses the disruptioncron to generate metrics tags
120120
func (d *DisruptionCron) getMetricsTags() []string {
121121
tags := []string{
122-
fmt.Sprintf("%s:%s", log.DisruptionCronNameKey, d.Name),
123-
fmt.Sprintf("%s:%s", log.DisruptionCronNamespaceKey, d.Namespace),
122+
tagutil.FormatTag(tagutil.DisruptionCronNameKey, d.Name),
123+
tagutil.FormatTag(tagutil.DisruptionCronNamespaceKey, d.Namespace),
124124
}
125125

126126
return tags

api/v1beta1/disruption_cron_webhook.go

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@ import (
1212
"strings"
1313
"time"
1414

15-
cLog "github.com/DataDog/chaos-controller/log"
16-
"github.com/DataDog/chaos-controller/o11y/metrics"
17-
"github.com/DataDog/chaos-controller/utils"
18-
1915
"github.com/robfig/cron"
2016
"go.uber.org/zap"
2117
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -26,6 +22,10 @@ import (
2622
ctrl "sigs.k8s.io/controller-runtime"
2723
"sigs.k8s.io/controller-runtime/pkg/webhook"
2824
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
25+
26+
"github.com/DataDog/chaos-controller/o11y/metrics"
27+
tagutil "github.com/DataDog/chaos-controller/o11y/tags"
28+
"github.com/DataDog/chaos-controller/utils"
2929
)
3030

3131
const (
@@ -48,8 +48,8 @@ func (d *DisruptionCron) SetupWebhookWithManager(setupWebhookConfig utils.SetupW
4848
disruptionCronWebhookRecorder = setupWebhookConfig.Recorder
4949
disruptionCronWebhookDeleteOnly = setupWebhookConfig.DeleteOnlyFlag
5050
disruptionCronWebhookLogger = setupWebhookConfig.Logger.With(
51-
"source", "admission-controller",
52-
"admission-controller", "disruption-cron-webhook",
51+
tagutil.SourceKey, "admission-controller",
52+
tagutil.AdmissionControllerKey, "disruption-cron-webhook",
5353
)
5454
disruptionCronMetricsSink = setupWebhookConfig.MetricsSink
5555

@@ -76,7 +76,11 @@ var _ webhook.Defaulter = &DisruptionCron{}
7676
// Default implements webhook.Defaulter so a webhook will be registered for the type
7777
func (d *DisruptionCron) Default() {
7878
if d.Spec.DelayedStartTolerance.Duration() == 0 {
79-
logger.Infow(fmt.Sprintf("setting default delayedStartTolerance of %s in disruptionCron", defaultCronDelayedStartTolerance), cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
79+
logger.Infow(fmt.Sprintf("setting default delayedStartTolerance of %s in disruptionCron", defaultCronDelayedStartTolerance),
80+
tagutil.DisruptionCronNameKey, d.Name,
81+
tagutil.DisruptionCronNamespaceKey, d.Namespace,
82+
)
83+
8084
d.Spec.DelayedStartTolerance = DisruptionDuration(defaultCronDelayedStartTolerance.String())
8185
}
8286
}
@@ -87,16 +91,19 @@ var _ webhook.Validator = &DisruptionCron{}
8791

8892
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
8993
func (d *DisruptionCron) ValidateCreate() (_ admission.Warnings, err error) {
90-
log := disruptionCronWebhookLogger.With(cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
94+
log := disruptionCronWebhookLogger.With(
95+
tagutil.DisruptionCronNameKey, d.Name,
96+
tagutil.DisruptionCronNamespaceKey, d.Namespace,
97+
)
9198

92-
log.Infow("validating created disruption cron", "spec", d.Spec)
99+
log.Infow("validating created disruption cron", tagutil.SpecKey, d.Spec)
93100

94101
metricTags := d.getMetricsTags()
95102

96103
defer func() {
97104
if err != nil {
98105
if mErr := disruptionCronMetricsSink.MetricValidationFailed(metricTags); mErr != nil {
99-
log.Errorw("error sending a metric", "error", mErr)
106+
log.Errorw("error sending a metric", tagutil.ErrorKey, mErr)
100107
}
101108
}
102109
}()
@@ -123,7 +130,7 @@ func (d *DisruptionCron) ValidateCreate() (_ admission.Warnings, err error) {
123130
}
124131

125132
if mErr := metricsSink.MetricValidationCreated(metricTags); mErr != nil {
126-
log.Errorw("error sending a metric", "error", mErr)
133+
log.Errorw("error sending a metric", tagutil.ErrorKey, mErr)
127134
}
128135

129136
// send informative event to disruption cron to broadcast
@@ -133,16 +140,18 @@ func (d *DisruptionCron) ValidateCreate() (_ admission.Warnings, err error) {
133140
}
134141

135142
func (d *DisruptionCron) ValidateUpdate(oldObject runtime.Object) (_ admission.Warnings, err error) {
136-
log := logger.With(cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
143+
log := logger.With(
144+
tagutil.DisruptionCronNameKey, d.Name, tagutil.DisruptionCronNamespaceKey, d.Namespace,
145+
)
137146

138-
log.Infow("validating updated disruption cron", "spec", d.Spec)
147+
log.Infow("validating updated disruption cron", tagutil.SpecKey, d.Spec)
139148

140149
metricTags := d.getMetricsTags()
141150

142151
defer func() {
143152
if err != nil {
144153
if mErr := disruptionCronMetricsSink.MetricValidationFailed(metricTags); mErr != nil {
145-
log.Errorw("error sending a metric", "error", mErr)
154+
log.Errorw("error sending a metric", tagutil.ErrorKey, mErr)
146155
}
147156
}
148157
}()
@@ -169,7 +178,7 @@ func (d *DisruptionCron) ValidateUpdate(oldObject runtime.Object) (_ admission.W
169178
}
170179

171180
if mErr := metricsSink.MetricValidationUpdated(metricTags); mErr != nil {
172-
log.Errorw("error sending a metric", "error", mErr)
181+
log.Errorw("error sending a metric", tagutil.ErrorKey, mErr)
173182
}
174183

175184
// send informative event to disruption cron to broadcast
@@ -179,15 +188,18 @@ func (d *DisruptionCron) ValidateUpdate(oldObject runtime.Object) (_ admission.W
179188
}
180189

181190
func (d *DisruptionCron) ValidateDelete() (warnings admission.Warnings, err error) {
182-
log := disruptionCronWebhookLogger.With(cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
191+
log := disruptionCronWebhookLogger.With(
192+
tagutil.DisruptionCronNameKey, d.Name,
193+
tagutil.DisruptionCronNamespaceKey, d.Namespace,
194+
)
183195

184-
log.Infow("validating deleted disruption cron", "spec", d.Spec)
196+
log.Infow("validating deleted disruption cron", tagutil.SpecKey, d.Spec)
185197

186198
// During the validation of the deletion the timestamp does not exist so we need to set it before emitting the event
187199
d.DeletionTimestamp = &metav1.Time{Time: time.Now()}
188200

189201
if mErr := metricsSink.MetricValidationDeleted(d.getMetricsTags()); mErr != nil {
190-
log.Errorw("error sending a metric", "error", mErr)
202+
log.Errorw("error sending a metric", tagutil.ErrorKey, mErr)
191203
}
192204

193205
// send informative event to disruption cron to broadcast
@@ -199,7 +211,7 @@ func (d *DisruptionCron) ValidateDelete() (warnings admission.Warnings, err erro
199211
func (d *DisruptionCron) emitEvent(eventReason EventReason) {
200212
disruptionCronJSON, err := json.Marshal(d)
201213
if err != nil {
202-
disruptionCronWebhookLogger.Errorw("failed to marshal disruption cron", "error", err)
214+
disruptionCronWebhookLogger.Errorw("failed to marshal disruption cron", tagutil.ErrorKey, err)
203215
return
204216
}
205217

api/v1beta1/disruption_pods.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,18 @@ import (
99
"context"
1010
"fmt"
1111

12-
chaostypes "github.com/DataDog/chaos-controller/types"
13-
"go.uber.org/zap"
1412
corev1 "k8s.io/api/core/v1"
1513
"k8s.io/apimachinery/pkg/labels"
1614
"sigs.k8s.io/controller-runtime/pkg/client"
15+
16+
cLog "github.com/DataDog/chaos-controller/log"
17+
"github.com/DataDog/chaos-controller/o11y/tags"
18+
chaostypes "github.com/DataDog/chaos-controller/types"
1719
)
1820

1921
// GetChaosPods returns chaos pods owned by the given instance and having the given labels
2022
// both instance and label set are optional but at least one must be provided
21-
func GetChaosPods(ctx context.Context, log *zap.SugaredLogger, chaosNamespace string, k8sClient client.Client, instance *Disruption, ls labels.Set) ([]corev1.Pod, error) {
23+
func GetChaosPods(ctx context.Context, chaosNamespace string, k8sClient client.Client, instance *Disruption, ls labels.Set) ([]corev1.Pod, error) {
2224
pods := &corev1.PodList{}
2325

2426
if k8sClient == nil {
@@ -56,9 +58,11 @@ func GetChaosPods(ctx context.Context, log *zap.SugaredLogger, chaosNamespace st
5658
podNames = append(podNames, pod.Name)
5759
}
5860

59-
if log != nil {
60-
log.Debugw("searching for chaos pods with label selector...", "labels", ls.String(), "foundPods", podNames)
61-
}
61+
// Get logger from context for debugging
62+
cLog.FromContext(ctx).Debugw("searching for chaos pods with label selector...",
63+
tags.LabelsKey, ls.String(),
64+
tags.FoundPodsKey, podNames,
65+
)
6266

6367
return pods.Items, nil
6468
}

0 commit comments

Comments
 (0)