Skip to content

Commit 3891549

Browse files
authored
CHAOS-326: Make logging changes (#572)
* Set log level to DEBUG * Only log starting injection when we are actually creating new pods * chaos pods not being ready immediately is very normal, should be DEBUG * Dont update targets if we did not change them * Pods can take minutes to become Ready or Terminated, dont retry every 5s * log tc commands at DEBUG by default * Dont give error on safe to retry finalizer removals * Log host resolution for user clarity * The linter was angry * Restore update logic * We fixed this feature, put the test back * Remove unused argument from assertion function * Remove comment saying test is broken
1 parent d405aa7 commit 3891549

File tree

6 files changed

+52
-49
lines changed

6 files changed

+52
-49
lines changed

api/v1beta1/disruption_webhook.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"time"
1414

1515
"github.com/DataDog/chaos-controller/ddmark"
16-
"github.com/DataDog/chaos-controller/types"
1716
"github.com/DataDog/chaos-controller/utils"
1817
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1918

@@ -43,7 +42,7 @@ var handlerEnabled bool
4342
var defaultDuration time.Duration
4443

4544
func (r *Disruption) SetupWebhookWithManager(setupWebhookConfig utils.SetupWebhookWithManagerConfig) error {
46-
if err := ddmark.InitLibrary(EmbeddedChaosAPI, types.DDMarkChaoslibPrefix); err != nil {
45+
if err := ddmark.InitLibrary(EmbeddedChaosAPI, chaostypes.DDMarkChaoslibPrefix); err != nil {
4746
return err
4847
}
4948

@@ -108,7 +107,7 @@ func (r *Disruption) ValidateCreate() error {
108107
return err
109108
}
110109

111-
multiErr := ddmark.ValidateStructMultierror(r.Spec, "validation_webhook", types.DDMarkChaoslibPrefix)
110+
multiErr := ddmark.ValidateStructMultierror(r.Spec, "validation_webhook", chaostypes.DDMarkChaoslibPrefix)
112111
if multiErr.ErrorOrNil() != nil {
113112
return multierror.Prefix(multiErr, "ddmark: ")
114113
}

controllers/disruption_controller.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,10 @@ func (r *DisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request)
149149
return ctrl.Result{}, err
150150
}
151151

152-
// if not cleaned yet, requeue and reconcile again in 5s-10s
152+
// if not cleaned yet, requeue and reconcile again in 15s-20s
153153
// the reason why we don't rely on the exponential backoff here is that it retries too fast at the beginning
154154
if !isCleaned {
155-
requeueAfter := time.Duration(rand.Intn(5)+5) * time.Second //nolint:gosec
155+
requeueAfter := time.Duration(rand.Intn(5)+15) * time.Second //nolint:gosec
156156

157157
r.log.Infow(fmt.Sprintf("disruption has not been fully cleaned yet, re-queuing in %v", requeueAfter))
158158

@@ -281,8 +281,8 @@ func (r *DisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request)
281281

282282
return ctrl.Result{}, fmt.Errorf("error updating disruption injection status: %w", err)
283283
} else if !injected {
284-
// requeue after 5-10 seconds, as default 1ms is too quick here
285-
requeueAfter := time.Duration(rand.Intn(5)+5) * time.Second //nolint:gosec
284+
// requeue after 15-20 seconds, as default 1ms is too quick here
285+
requeueAfter := time.Duration(rand.Intn(5)+15) * time.Second //nolint:gosec
286286
r.log.Infow("disruption is not fully injected yet, requeuing", "injectionStatus", instance.Status.InjectionStatus)
287287

288288
return ctrl.Result{
@@ -346,7 +346,7 @@ func (r *DisruptionReconciler) updateInjectionStatus(instance *chaosv1beta1.Disr
346346

347347
// consider the disruption as not fully injected if at least one not ready pod is found
348348
if !podReady {
349-
r.log.Infow("chaos pod is not ready yet", "chaosPod", chaosPod.Name)
349+
r.log.Debugw("chaos pod is not ready yet", "chaosPod", chaosPod.Name)
350350
}
351351
}
352352

@@ -383,10 +383,6 @@ func (r *DisruptionReconciler) updateInjectionStatus(instance *chaosv1beta1.Disr
383383

384384
// startInjection creates non-existing chaos pod for the given disruption
385385
func (r *DisruptionReconciler) startInjection(instance *chaosv1beta1.Disruption) error {
386-
if len(instance.Status.Targets) > 0 {
387-
r.log.Infow("starting targets injection", "targets", instance.Status.Targets)
388-
}
389-
390386
// chaosPodsMap is used to check if a target's chaos pods already exist or not
391387
chaosPodsMap := make(map[string]map[string]bool, len(instance.Status.Targets))
392388

@@ -408,6 +404,10 @@ func (r *DisruptionReconciler) startInjection(instance *chaosv1beta1.Disruption)
408404
}
409405
}
410406

407+
if len(instance.Status.Targets) > 0 && (len(instance.Status.Targets) != len(chaosPodsMap)) {
408+
r.log.Infow("starting targets injection", "targets", instance.Status.Targets)
409+
}
410+
411411
// iterate through target + existing disruption kind -- to ensure all chaos pods exist
412412
for _, target := range instance.Status.Targets {
413413
for _, disKind := range chaostypes.DisruptionKindNames {
@@ -623,6 +623,7 @@ func (r *DisruptionReconciler) handleOrphanedChaosPods(req ctrl.Request) error {
623623
// - the pod is pending
624624
// - the pod is succeeded (exit code == 0)
625625
// - the pod target is not healthy (not existing anymore for instance)
626+
//
626627
// if a finalizer can't be removed because none of the conditions above are fulfilled, the instance is flagged
627628
// as stuck on removal and the pod finalizer won't be removed unless someone does it manually
628629
// the pod target will be moved to ignored targets, so it is not picked up by the next reconcile loop
@@ -725,7 +726,11 @@ func (r *DisruptionReconciler) handleChaosPodTermination(instance *chaosv1beta1.
725726
controllerutil.RemoveFinalizer(&chaosPod, chaostypes.ChaosPodFinalizer)
726727

727728
if err := r.Client.Update(context.Background(), &chaosPod); err != nil {
728-
r.log.Errorw("error removing chaos pod finalizer", "error", err, "chaosPod", chaosPod.Name)
729+
if strings.Contains(err.Error(), "latest version and try again") {
730+
r.log.Debugw("cannot remove chaos pod finalizer, need to re-reconcile", "error", err)
731+
} else {
732+
r.log.Errorw("error removing chaos pod finalizer", "error", err, "chaosPod", chaosPod.Name)
733+
}
729734

730735
return
731736
}
@@ -807,7 +812,7 @@ func (r *DisruptionReconciler) selectTargets(instance *chaosv1beta1.Disruption)
807812
instance.Status.RemoveTargets(cTargetsCount - dTargetsCount)
808813
}
809814

810-
r.log.Infow("updating instance status with targets selected for injection")
815+
r.log.Debugw("updating instance status with targets selected for injection")
811816

812817
instance.Status.SelectedTargetsCount = len(instance.Status.Targets)
813818
instance.Status.IgnoredTargetsCount = totalAvailableTargetsCount - targetsCount

controllers/disruption_controller_test.go

Lines changed: 29 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
. "github.com/onsi/ginkgo"
2525
. "github.com/onsi/gomega"
26+
"k8s.io/apimachinery/pkg/types"
2627
"sigs.k8s.io/controller-runtime/pkg/client"
2728

2829
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
@@ -127,24 +128,24 @@ func expectChaosInjectors(instance *chaosv1beta1.Disruption, count int) error {
127128
return nil
128129
}
129130

130-
func expectDisruptionStatus(instance *chaosv1beta1.Disruption, desiredTargetsCount int, ignoredTargetsCount int, selectedTargetsCount int, injectedTargetsCount int) error {
131+
func expectDisruptionStatus(desiredTargetsCount int, ignoredTargetsCount int, selectedTargetsCount int, injectedTargetsCount int) error {
131132
updatedInstance := &chaosv1beta1.Disruption{}
132133

133134
if err := k8sClient.Get(context.Background(), instanceKey, updatedInstance); err != nil {
134135
return err
135136
}
136137

137138
if desiredTargetsCount != updatedInstance.Status.DesiredTargetsCount {
138-
return fmt.Errorf("incorred number of desired targets: expected %d, found %d", desiredTargetsCount, updatedInstance.Status.DesiredTargetsCount)
139+
return fmt.Errorf("incorrect number of desired targets: expected %d, found %d", desiredTargetsCount, updatedInstance.Status.DesiredTargetsCount)
139140
}
140141
if ignoredTargetsCount != updatedInstance.Status.IgnoredTargetsCount {
141-
return fmt.Errorf("incorred number of ignored targets: expected %d, found %d", ignoredTargetsCount, updatedInstance.Status.IgnoredTargetsCount)
142+
return fmt.Errorf("incorrect number of ignored targets: expected %d, found %d", ignoredTargetsCount, updatedInstance.Status.IgnoredTargetsCount)
142143
}
143144
if injectedTargetsCount != updatedInstance.Status.InjectedTargetsCount {
144-
return fmt.Errorf("incorred number of injected targets: expected %d, found %d", injectedTargetsCount, updatedInstance.Status.InjectedTargetsCount)
145+
return fmt.Errorf("incorrect number of injected targets: expected %d, found %d", injectedTargetsCount, updatedInstance.Status.InjectedTargetsCount)
145146
}
146147
if selectedTargetsCount != updatedInstance.Status.SelectedTargetsCount {
147-
return fmt.Errorf("incorred number of selected targets: expected %d, found %d", selectedTargetsCount, updatedInstance.Status.SelectedTargetsCount)
148+
return fmt.Errorf("incorrect number of selected targets: expected %d, found %d", selectedTargetsCount, updatedInstance.Status.SelectedTargetsCount)
148149
}
149150

150151
return nil
@@ -413,7 +414,7 @@ var _ = Describe("Disruption Controller", func() {
413414
Expect(expectChaosInjectors(disruption, 2)).To(BeNil())
414415

415416
By("Ensuring that the disruption status is displaying the right number of targets")
416-
Eventually(func() error { return expectDisruptionStatus(disruption, 2, 0, 2, 2) }, timeout).Should(Succeed())
417+
Eventually(func() error { return expectDisruptionStatus(2, 0, 2, 2) }, timeout).Should(Succeed())
417418

418419
By("Adding an extra target")
419420
Expect(k8sClient.Create(context.Background(), targetPodA2)).To(BeNil())
@@ -422,7 +423,7 @@ var _ = Describe("Disruption Controller", func() {
422423
Eventually(func() error { return expectChaosPod(disruption, 3) }, timeout).Should(Succeed())
423424

424425
By("Ensuring that the disruption status is displaying the right number of targets")
425-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 0, 3, 3) }, timeout).Should(Succeed())
426+
Eventually(func() error { return expectDisruptionStatus(3, 0, 3, 3) }, timeout).Should(Succeed())
426427

427428
By("Deleting the extra target")
428429
Expect(k8sClient.Delete(context.Background(), targetPodA2)).To(BeNil())
@@ -431,7 +432,7 @@ var _ = Describe("Disruption Controller", func() {
431432
Eventually(func() error { return expectChaosPod(disruption, 2) }, timeout).Should(Succeed())
432433

433434
By("Ensuring that the disruption status is displaying the right number of targets")
434-
Eventually(func() error { return expectDisruptionStatus(disruption, 2, 0, 2, 2) }, timeout).Should(Succeed())
435+
Eventually(func() error { return expectDisruptionStatus(2, 0, 2, 2) }, timeout).Should(Succeed())
435436
})
436437
})
437438

@@ -463,7 +464,7 @@ var _ = Describe("Disruption Controller", func() {
463464

464465
It("should scale up then down with the right number of targets count", func() {
465466
By("Ensuring that the disruption status is displaying the right number of targets")
466-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 0, 2, 2) }, timeout).Should(Succeed())
467+
Eventually(func() error { return expectDisruptionStatus(3, 0, 2, 2) }, timeout).Should(Succeed())
467468

468469
By("Adding an extra target")
469470
Expect(k8sClient.Create(context.Background(), targetPodA3)).To(BeNil())
@@ -472,7 +473,7 @@ var _ = Describe("Disruption Controller", func() {
472473
Expect(k8sClient.Create(context.Background(), targetPodA4)).To(BeNil())
473474

474475
By("Ensuring that the disruption status is displaying the right number of targets")
475-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 1, 3, 3) }, timeout).Should(Succeed())
476+
Eventually(func() error { return expectDisruptionStatus(3, 1, 3, 3) }, timeout).Should(Succeed())
476477

477478
By("Deleting the extra target")
478479
Expect(k8sClient.Delete(context.Background(), targetPodA3)).To(BeNil())
@@ -482,26 +483,22 @@ var _ = Describe("Disruption Controller", func() {
482483
})
483484
})
484485

485-
// NOTE: disabled until fixed
486-
// the feature is broken now that we moved all chaos pods into the same namespace
487-
// because we had to remove the owner reference on those pods, meaning that
488-
// the reconcile loop does not automatically trigger anymore on chaos pods events like a delete
489-
// Context("manually delete a chaos pod", func() {
490-
// It("should properly handle the chaos pod finalizer", func() {
491-
// By("Ensuring that the chaos pods have been created")
492-
// Eventually(func() error { return expectChaosPod(disruption, 5) }, timeout).Should(Succeed())
493-
494-
// By("Listing chaos pods to pick one to delete")
495-
// chaosPods, err := listChaosPods(disruption)
496-
// Expect(err).To(BeNil())
497-
// chaosPod := chaosPods.Items[0]
498-
// chaosPodKey := types.NamespacedName{Namespace: chaosPod.Namespace, Name: chaosPod.Name}
499-
500-
// By("Deleting one of the chaos pod")
501-
// Expect(k8sClient.Delete(context.Background(), &chaosPod)).To(BeNil())
502-
503-
// By("Waiting for the chaos pod finalizer to be removed")
504-
// Eventually(func() error { return k8sClient.Get(context.Background(), chaosPodKey, &chaosPod) }, timeout).Should(MatchError(fmt.Sprintf("Pod \"%s\" not found", chaosPod.Name)))
505-
// })
506-
// })
486+
Context("manually delete a chaos pod", func() {
487+
It("should properly handle the chaos pod finalizer", func() {
488+
By("Ensuring that the chaos pods have been created")
489+
Eventually(func() error { return expectChaosPod(disruption, 4) }, timeout).Should(Succeed())
490+
491+
By("Listing chaos pods to pick one to delete")
492+
chaosPods, err := listChaosPods(disruption)
493+
Expect(err).To(BeNil())
494+
chaosPod := chaosPods.Items[0]
495+
chaosPodKey := types.NamespacedName{Namespace: chaosPod.Namespace, Name: chaosPod.Name}
496+
497+
By("Deleting one of the chaos pod")
498+
Expect(k8sClient.Delete(context.Background(), &chaosPod)).To(BeNil())
499+
500+
By("Waiting for the chaos pod finalizer to be removed")
501+
Eventually(func() error { return k8sClient.Get(context.Background(), chaosPodKey, &chaosPod) }, timeout).Should(MatchError(fmt.Sprintf("Pod \"%s\" not found", chaosPod.Name)))
502+
})
503+
})
507504
})

injector/network_disruption.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func (i *networkDisruptionInjector) Inject() error {
165165
return fmt.Errorf("error applying tc operations: %w", err)
166166
}
167167

168-
i.config.Log.Info("operations applied successfully")
168+
i.config.Log.Debug("operations applied successfully")
169169
}
170170

171171
i.config.Log.Info("editing pod net_cls cgroup to apply a classid to target container packets")
@@ -847,6 +847,8 @@ func (i *networkDisruptionInjector) addFiltersForHosts(interfaces []string, host
847847
return fmt.Errorf("error resolving given host %s: %w", host.Host, err)
848848
}
849849

850+
i.config.Log.Infof("resolved %s as %s", host.Host, ips)
851+
850852
for _, ip := range ips {
851853
// handle flow direction
852854
var (

log/log.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414
func NewZapLogger() (*zap.SugaredLogger, error) {
1515
// configure logger
1616
loggerConfig := zap.NewProductionConfig()
17-
loggerConfig.Level.SetLevel(zapcore.InfoLevel)
17+
loggerConfig.Level.SetLevel(zapcore.DebugLevel)
1818
loggerConfig.EncoderConfig.MessageKey = "message"
1919
loggerConfig.EncoderConfig.EncodeTime = zapcore.EpochMillisTimeEncoder
2020

network/tc.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func (e defaultTcExecuter) Run(args ...string) (int, string, error) {
6161
cmd.Stderr = stderr
6262

6363
// run command
64-
e.log.Infof("running tc command: %v", cmd.String())
64+
e.log.Debugf("running tc command: %v", cmd.String())
6565

6666
// early exit if dry-run mode is enabled
6767
if e.dryRun {

0 commit comments

Comments
 (0)