Skip to content

Commit 99ca80b

Browse files
Merge branch 'main' into nathan/bugfix-dynamictargeting
2 parents 1dd4df4 + 3891549 commit 99ca80b

File tree

6 files changed

+52
-49
lines changed

6 files changed

+52
-49
lines changed

api/v1beta1/disruption_webhook.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"time"
1414

1515
"github.com/DataDog/chaos-controller/ddmark"
16-
"github.com/DataDog/chaos-controller/types"
1716
"github.com/DataDog/chaos-controller/utils"
1817
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1918

@@ -43,7 +42,7 @@ var handlerEnabled bool
4342
var defaultDuration time.Duration
4443

4544
func (r *Disruption) SetupWebhookWithManager(setupWebhookConfig utils.SetupWebhookWithManagerConfig) error {
46-
if err := ddmark.InitLibrary(EmbeddedChaosAPI, types.DDMarkChaoslibPrefix); err != nil {
45+
if err := ddmark.InitLibrary(EmbeddedChaosAPI, chaostypes.DDMarkChaoslibPrefix); err != nil {
4746
return err
4847
}
4948

@@ -108,7 +107,7 @@ func (r *Disruption) ValidateCreate() error {
108107
return err
109108
}
110109

111-
multiErr := ddmark.ValidateStructMultierror(r.Spec, "validation_webhook", types.DDMarkChaoslibPrefix)
110+
multiErr := ddmark.ValidateStructMultierror(r.Spec, "validation_webhook", chaostypes.DDMarkChaoslibPrefix)
112111
if multiErr.ErrorOrNil() != nil {
113112
return multierror.Prefix(multiErr, "ddmark: ")
114113
}

controllers/disruption_controller.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,10 @@ func (r *DisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request)
150150
return ctrl.Result{}, err
151151
}
152152

153-
// if not cleaned yet, requeue and reconcile again in 5s-10s
153+
// if not cleaned yet, requeue and reconcile again in 15s-20s
154154
// the reason why we don't rely on the exponential backoff here is that it retries too fast at the beginning
155155
if !isCleaned {
156-
requeueAfter := time.Duration(rand.Intn(5)+5) * time.Second //nolint:gosec
156+
requeueAfter := time.Duration(rand.Intn(5)+15) * time.Second //nolint:gosec
157157

158158
r.log.Infow(fmt.Sprintf("disruption has not been fully cleaned yet, re-queuing in %v", requeueAfter))
159159

@@ -282,8 +282,8 @@ func (r *DisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request)
282282

283283
return ctrl.Result{}, fmt.Errorf("error updating disruption injection status: %w", err)
284284
} else if !injected {
285-
// requeue after 5-10 seconds, as default 1ms is too quick here
286-
requeueAfter := time.Duration(rand.Intn(5)+5) * time.Second //nolint:gosec
285+
// requeue after 15-20 seconds, as default 1ms is too quick here
286+
requeueAfter := time.Duration(rand.Intn(5)+15) * time.Second //nolint:gosec
287287
r.log.Infow("disruption is not fully injected yet, requeuing", "injectionStatus", instance.Status.InjectionStatus)
288288

289289
return ctrl.Result{
@@ -347,7 +347,7 @@ func (r *DisruptionReconciler) updateInjectionStatus(instance *chaosv1beta1.Disr
347347

348348
// consider the disruption as not fully injected if at least one not ready pod is found
349349
if !podReady {
350-
r.log.Infow("chaos pod is not ready yet", "chaosPod", chaosPod.Name)
350+
r.log.Debugw("chaos pod is not ready yet", "chaosPod", chaosPod.Name)
351351
}
352352
}
353353

@@ -384,10 +384,6 @@ func (r *DisruptionReconciler) updateInjectionStatus(instance *chaosv1beta1.Disr
384384

385385
// startInjection creates non-existing chaos pod for the given disruption
386386
func (r *DisruptionReconciler) startInjection(instance *chaosv1beta1.Disruption) error {
387-
if len(instance.Status.Targets) > 0 {
388-
r.log.Infow("starting targets injection", "targets", instance.Status.Targets)
389-
}
390-
391387
// chaosPodsMap is used to check if a target's chaos pods already exist or not
392388
chaosPodsMap := make(map[string]map[string]bool, len(instance.Status.Targets))
393389

@@ -409,6 +405,10 @@ func (r *DisruptionReconciler) startInjection(instance *chaosv1beta1.Disruption)
409405
}
410406
}
411407

408+
if len(instance.Status.Targets) > 0 && (len(instance.Status.Targets) != len(chaosPodsMap)) {
409+
r.log.Infow("starting targets injection", "targets", instance.Status.Targets)
410+
}
411+
412412
// iterate through target + existing disruption kind -- to ensure all chaos pods exist
413413
for _, target := range instance.Status.Targets {
414414
for _, disKind := range chaostypes.DisruptionKindNames {
@@ -624,6 +624,7 @@ func (r *DisruptionReconciler) handleOrphanedChaosPods(req ctrl.Request) error {
624624
// - the pod is pending
625625
// - the pod is succeeded (exit code == 0)
626626
// - the pod target is not healthy (not existing anymore for instance)
627+
//
627628
// if a finalizer can't be removed because none of the conditions above are fulfilled, the instance is flagged
628629
// as stuck on removal and the pod finalizer won't be removed unless someone does it manually
629630
// the pod target will be moved to ignored targets, so it is not picked up by the next reconcile loop
@@ -726,7 +727,11 @@ func (r *DisruptionReconciler) handleChaosPodTermination(instance *chaosv1beta1.
726727
controllerutil.RemoveFinalizer(&chaosPod, chaostypes.ChaosPodFinalizer)
727728

728729
if err := r.Client.Update(context.Background(), &chaosPod); err != nil {
729-
r.log.Errorw("error removing chaos pod finalizer", "error", err, "chaosPod", chaosPod.Name)
730+
if strings.Contains(err.Error(), "latest version and try again") {
731+
r.log.Debugw("cannot remove chaos pod finalizer, need to re-reconcile", "error", err)
732+
} else {
733+
r.log.Errorw("error removing chaos pod finalizer", "error", err, "chaosPod", chaosPod.Name)
734+
}
730735

731736
return
732737
}
@@ -808,7 +813,7 @@ func (r *DisruptionReconciler) selectTargets(instance *chaosv1beta1.Disruption)
808813
instance.Status.RemoveTargets(cTargetsCount - dTargetsCount)
809814
}
810815

811-
r.log.Infow("updating instance status with targets selected for injection")
816+
r.log.Debugw("updating instance status with targets selected for injection")
812817

813818
instance.Status.SelectedTargetsCount = len(instance.Status.Targets)
814819
instance.Status.IgnoredTargetsCount = totalAvailableTargetsCount - targetsCount

controllers/disruption_controller_test.go

Lines changed: 29 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
. "github.com/onsi/ginkgo"
2525
. "github.com/onsi/gomega"
26+
"k8s.io/apimachinery/pkg/types"
2627
"sigs.k8s.io/controller-runtime/pkg/client"
2728

2829
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
@@ -127,24 +128,24 @@ func expectChaosInjectors(instance *chaosv1beta1.Disruption, count int) error {
127128
return nil
128129
}
129130

130-
func expectDisruptionStatus(instance *chaosv1beta1.Disruption, desiredTargetsCount int, ignoredTargetsCount int, selectedTargetsCount int, injectedTargetsCount int) error {
131+
func expectDisruptionStatus(desiredTargetsCount int, ignoredTargetsCount int, selectedTargetsCount int, injectedTargetsCount int) error {
131132
updatedInstance := &chaosv1beta1.Disruption{}
132133

133134
if err := k8sClient.Get(context.Background(), instanceKey, updatedInstance); err != nil {
134135
return err
135136
}
136137

137138
if desiredTargetsCount != updatedInstance.Status.DesiredTargetsCount {
138-
return fmt.Errorf("incorred number of desired targets: expected %d, found %d", desiredTargetsCount, updatedInstance.Status.DesiredTargetsCount)
139+
return fmt.Errorf("incorrect number of desired targets: expected %d, found %d", desiredTargetsCount, updatedInstance.Status.DesiredTargetsCount)
139140
}
140141
if ignoredTargetsCount != updatedInstance.Status.IgnoredTargetsCount {
141-
return fmt.Errorf("incorred number of ignored targets: expected %d, found %d", ignoredTargetsCount, updatedInstance.Status.IgnoredTargetsCount)
142+
return fmt.Errorf("incorrect number of ignored targets: expected %d, found %d", ignoredTargetsCount, updatedInstance.Status.IgnoredTargetsCount)
142143
}
143144
if injectedTargetsCount != updatedInstance.Status.InjectedTargetsCount {
144-
return fmt.Errorf("incorred number of injected targets: expected %d, found %d", injectedTargetsCount, updatedInstance.Status.InjectedTargetsCount)
145+
return fmt.Errorf("incorrect number of injected targets: expected %d, found %d", injectedTargetsCount, updatedInstance.Status.InjectedTargetsCount)
145146
}
146147
if selectedTargetsCount != updatedInstance.Status.SelectedTargetsCount {
147-
return fmt.Errorf("incorred number of selected targets: expected %d, found %d", selectedTargetsCount, updatedInstance.Status.SelectedTargetsCount)
148+
return fmt.Errorf("incorrect number of selected targets: expected %d, found %d", selectedTargetsCount, updatedInstance.Status.SelectedTargetsCount)
148149
}
149150

150151
return nil
@@ -413,7 +414,7 @@ var _ = Describe("Disruption Controller", func() {
413414
Expect(expectChaosInjectors(disruption, 2)).To(BeNil())
414415

415416
By("Ensuring that the disruption status is displaying the right number of targets")
416-
Eventually(func() error { return expectDisruptionStatus(disruption, 2, 0, 2, 2) }, timeout).Should(Succeed())
417+
Eventually(func() error { return expectDisruptionStatus(2, 0, 2, 2) }, timeout).Should(Succeed())
417418

418419
By("Adding an extra target")
419420
Expect(k8sClient.Create(context.Background(), targetPodA2)).To(BeNil())
@@ -422,7 +423,7 @@ var _ = Describe("Disruption Controller", func() {
422423
Eventually(func() error { return expectChaosPod(disruption, 3) }, timeout).Should(Succeed())
423424

424425
By("Ensuring that the disruption status is displaying the right number of targets")
425-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 0, 3, 3) }, timeout).Should(Succeed())
426+
Eventually(func() error { return expectDisruptionStatus(3, 0, 3, 3) }, timeout).Should(Succeed())
426427

427428
By("Deleting the extra target")
428429
Expect(k8sClient.Delete(context.Background(), targetPodA2)).To(BeNil())
@@ -431,7 +432,7 @@ var _ = Describe("Disruption Controller", func() {
431432
Eventually(func() error { return expectChaosPod(disruption, 2) }, timeout).Should(Succeed())
432433

433434
By("Ensuring that the disruption status is displaying the right number of targets")
434-
Eventually(func() error { return expectDisruptionStatus(disruption, 2, 0, 2, 2) }, timeout).Should(Succeed())
435+
Eventually(func() error { return expectDisruptionStatus(2, 0, 2, 2) }, timeout).Should(Succeed())
435436
})
436437
})
437438

@@ -463,7 +464,7 @@ var _ = Describe("Disruption Controller", func() {
463464

464465
It("should scale up then down with the right number of targets count", func() {
465466
By("Ensuring that the disruption status is displaying the right number of targets")
466-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 0, 2, 2) }, timeout).Should(Succeed())
467+
Eventually(func() error { return expectDisruptionStatus(3, 0, 2, 2) }, timeout).Should(Succeed())
467468

468469
By("Adding an extra target")
469470
Expect(k8sClient.Create(context.Background(), targetPodA3)).To(BeNil())
@@ -472,7 +473,7 @@ var _ = Describe("Disruption Controller", func() {
472473
Expect(k8sClient.Create(context.Background(), targetPodA4)).To(BeNil())
473474

474475
By("Ensuring that the disruption status is displaying the right number of targets")
475-
Eventually(func() error { return expectDisruptionStatus(disruption, 3, 1, 3, 3) }, timeout).Should(Succeed())
476+
Eventually(func() error { return expectDisruptionStatus(3, 1, 3, 3) }, timeout).Should(Succeed())
476477

477478
By("Deleting the extra target")
478479
Expect(k8sClient.Delete(context.Background(), targetPodA3)).To(BeNil())
@@ -482,26 +483,22 @@ var _ = Describe("Disruption Controller", func() {
482483
})
483484
})
484485

485-
// NOTE: disabled until fixed
486-
// the feature is broken now that we moved all chaos pods into the same namespace
487-
// because we had to remove the owner reference on those pods, meaning that
488-
// the reconcile loop does not automatically trigger anymore on chaos pods events like a delete
489-
// Context("manually delete a chaos pod", func() {
490-
// It("should properly handle the chaos pod finalizer", func() {
491-
// By("Ensuring that the chaos pods have been created")
492-
// Eventually(func() error { return expectChaosPod(disruption, 5) }, timeout).Should(Succeed())
493-
494-
// By("Listing chaos pods to pick one to delete")
495-
// chaosPods, err := listChaosPods(disruption)
496-
// Expect(err).To(BeNil())
497-
// chaosPod := chaosPods.Items[0]
498-
// chaosPodKey := types.NamespacedName{Namespace: chaosPod.Namespace, Name: chaosPod.Name}
499-
500-
// By("Deleting one of the chaos pod")
501-
// Expect(k8sClient.Delete(context.Background(), &chaosPod)).To(BeNil())
502-
503-
// By("Waiting for the chaos pod finalizer to be removed")
504-
// Eventually(func() error { return k8sClient.Get(context.Background(), chaosPodKey, &chaosPod) }, timeout).Should(MatchError(fmt.Sprintf("Pod \"%s\" not found", chaosPod.Name)))
505-
// })
506-
// })
486+
Context("manually delete a chaos pod", func() {
487+
It("should properly handle the chaos pod finalizer", func() {
488+
By("Ensuring that the chaos pods have been created")
489+
Eventually(func() error { return expectChaosPod(disruption, 4) }, timeout).Should(Succeed())
490+
491+
By("Listing chaos pods to pick one to delete")
492+
chaosPods, err := listChaosPods(disruption)
493+
Expect(err).To(BeNil())
494+
chaosPod := chaosPods.Items[0]
495+
chaosPodKey := types.NamespacedName{Namespace: chaosPod.Namespace, Name: chaosPod.Name}
496+
497+
By("Deleting one of the chaos pod")
498+
Expect(k8sClient.Delete(context.Background(), &chaosPod)).To(BeNil())
499+
500+
By("Waiting for the chaos pod finalizer to be removed")
501+
Eventually(func() error { return k8sClient.Get(context.Background(), chaosPodKey, &chaosPod) }, timeout).Should(MatchError(fmt.Sprintf("Pod \"%s\" not found", chaosPod.Name)))
502+
})
503+
})
507504
})

injector/network_disruption.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func (i *networkDisruptionInjector) Inject() error {
165165
return fmt.Errorf("error applying tc operations: %w", err)
166166
}
167167

168-
i.config.Log.Info("operations applied successfully")
168+
i.config.Log.Debug("operations applied successfully")
169169
}
170170

171171
i.config.Log.Info("editing pod net_cls cgroup to apply a classid to target container packets")
@@ -847,6 +847,8 @@ func (i *networkDisruptionInjector) addFiltersForHosts(interfaces []string, host
847847
return fmt.Errorf("error resolving given host %s: %w", host.Host, err)
848848
}
849849

850+
i.config.Log.Infof("resolved %s as %s", host.Host, ips)
851+
850852
for _, ip := range ips {
851853
// handle flow direction
852854
var (

log/log.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414
func NewZapLogger() (*zap.SugaredLogger, error) {
1515
// configure logger
1616
loggerConfig := zap.NewProductionConfig()
17-
loggerConfig.Level.SetLevel(zapcore.InfoLevel)
17+
loggerConfig.Level.SetLevel(zapcore.DebugLevel)
1818
loggerConfig.EncoderConfig.MessageKey = "message"
1919
loggerConfig.EncoderConfig.EncodeTime = zapcore.EpochMillisTimeEncoder
2020

network/tc.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func (e defaultTcExecuter) Run(args ...string) (int, string, error) {
6161
cmd.Stderr = stderr
6262

6363
// run command
64-
e.log.Infof("running tc command: %v", cmd.String())
64+
e.log.Debugf("running tc command: %v", cmd.String())
6565

6666
// early exit if dry-run mode is enabled
6767
if e.dryRun {

0 commit comments

Comments
 (0)