Skip to content

Commit d723b12

Browse files
authored
Add Pod Replacement disruption (#988)
1 parent db2c142 commit d723b12

17 files changed

+1165
-6
lines changed

api/v1beta1/disruption_types.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ type DisruptionSpec struct {
5858
// +nullable
5959
Pulse *DisruptionPulse `json:"pulse,omitempty"` // enable pulsing disruptions and specify the duration of the active state and the dormant state of the pulsing duration
6060
Duration DisruptionDuration `json:"duration,omitempty"` // time from disruption creation until chaos pods are deleted and no more are created
61+
// MaxRuns specifies the maximum number of times the disruption should be executed
62+
// After this many runs, the disruption will become idle (default: unlimited for continuous disruptions)
63+
// +kubebuilder:validation:Minimum=1
64+
MaxRuns *int `json:"maxRuns,omitempty"`
6165
// Level defines what the disruption will target, either a pod or a node
6266
// +kubebuilder:default=pod
6367
// +kubebuilder:validation:Enum=pod;node
@@ -80,6 +84,8 @@ type DisruptionSpec struct {
8084
// +nullable
8185
GRPC *GRPCDisruptionSpec `json:"grpc,omitempty"`
8286
// +nullable
87+
PodReplacement *PodReplacementSpec `json:"podReplacement,omitempty"`
88+
// +nullable
8389
Reporting *Reporting `json:"reporting,omitempty"`
8490
}
8591

@@ -292,6 +298,8 @@ type DisruptionStatus struct {
292298
// timestamp of when a disruption has been cleaned last.
293299
// +nullable
294300
CleanedAt *metav1.Time `json:"cleanedAt,omitempty"`
301+
// Number of times the disruption has been executed (chaos pods created)
302+
RunCount int `json:"runCount,omitempty"`
295303
}
296304

297305
type DisruptionFilter struct {
@@ -686,27 +694,38 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re
686694
}
687695

688696
// Rule: At least one disruption kind must be applied
689-
if s.CPUPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.ContainerFailure == nil && s.NodeFailure == nil && len(s.DNS) == 0 {
697+
if s.CPUPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil && len(s.DNS) == 0 {
690698
retErr = multierror.Append(retErr, errors.New("at least one disruption kind must be specified, please read the docs to see your options"))
691699
}
692700

693-
// Rule: ContainerFailure and NodeFailure disruptions are not compatible with other failure types
701+
// Rule: ContainerFailure, NodeFailure, and PodReplacement disruptions are not compatible with other failure types
694702
if s.ContainerFailure != nil {
695-
if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.NodeFailure != nil || len(s.DNS) > 0 {
703+
if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.NodeFailure != nil || s.PodReplacement != nil || len(s.DNS) > 0 {
696704
retErr = multierror.Append(retErr, errors.New("container failure disruptions are not compatible with other disruption kinds. The container failure will remove the impact of the other disruption types"))
697705
}
698706
}
699707

700708
if s.NodeFailure != nil {
701-
if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.ContainerFailure != nil || len(s.DNS) > 0 {
709+
if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.ContainerFailure != nil || s.PodReplacement != nil || len(s.DNS) > 0 {
702710
retErr = multierror.Append(retErr, errors.New("node failure disruptions are not compatible with other disruption kinds. The node failure will remove the impact of the other disruption types"))
703711
}
704712
}
705713

714+
if s.PodReplacement != nil {
715+
if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.ContainerFailure != nil || s.NodeFailure != nil || len(s.DNS) > 0 {
716+
retErr = multierror.Append(retErr, errors.New("pod replacement disruptions are not compatible with other disruption kinds. The pod replacement will remove the impact of the other disruption types"))
717+
}
718+
// Rule: pod replacement not possible if disruption is node-level
719+
if s.Level == chaostypes.DisruptionLevelNode {
720+
retErr = multierror.Append(retErr, errors.New("cannot execute a pod replacement because the level configuration is set to node"))
721+
}
722+
}
723+
706724
// Rule: on init compatibility
707725
if s.OnInit {
708726
if s.CPUPressure != nil ||
709727
s.NodeFailure != nil ||
728+
s.PodReplacement != nil ||
710729
s.ContainerFailure != nil ||
711730
s.DiskPressure != nil ||
712731
s.GRPC != nil ||
@@ -756,7 +775,7 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re
756775
// Rule: pulse compatibility
757776
if s.Pulse != nil {
758777
if s.Pulse.ActiveDuration.Duration() > 0 || s.Pulse.DormantDuration.Duration() > 0 {
759-
if s.NodeFailure != nil || s.ContainerFailure != nil {
778+
if s.NodeFailure != nil || s.PodReplacement != nil || s.ContainerFailure != nil {
760779
retErr = multierror.Append(retErr, errors.New("pulse is only compatible with network, cpu pressure, disk pressure, dns and grpc disruptions"))
761780
}
762781
}
@@ -811,6 +830,8 @@ func (s DisruptionSpec) DisruptionKindPicker(kind chaostypes.DisruptionKindName)
811830
disruptionKind = s.DNS
812831
case chaostypes.DisruptionKindGRPCDisruption:
813832
disruptionKind = s.GRPC
833+
case chaostypes.DisruptionKindPodReplacement:
834+
disruptionKind = s.PodReplacement
814835
case chaostypes.DisruptionKindDiskFailure:
815836
disruptionKind = s.DiskFailure
816837
}
@@ -891,6 +912,10 @@ func (s DisruptionSpec) DisruptionCount() int {
891912
count++
892913
}
893914

915+
if s.PodReplacement != nil {
916+
count++
917+
}
918+
894919
if s.DiskFailure != nil {
895920
count++
896921
}
@@ -1022,6 +1047,10 @@ func (s DisruptionSpec) Explain() []string {
10221047
explanation = append(explanation, s.NodeFailure.Explain()...)
10231048
}
10241049

1050+
if s.PodReplacement != nil {
1051+
explanation = append(explanation, s.PodReplacement.Explain()...)
1052+
}
1053+
10251054
if s.ContainerFailure != nil {
10261055
explanation = append(explanation, s.ContainerFailure.Explain()...)
10271056
}
@@ -1110,6 +1139,7 @@ func (status *DisruptionStatus) HasTarget(searchTarget string) bool {
11101139
var NonReinjectableDisruptions = map[chaostypes.DisruptionKindName]struct{}{
11111140
chaostypes.DisruptionKindGRPCDisruption: {},
11121141
chaostypes.DisruptionKindNodeFailure: {},
1142+
chaostypes.DisruptionKindPodReplacement: {},
11131143
}
11141144

11151145
func DisruptionIsNotReinjectable(kind chaostypes.DisruptionKindName) bool {
@@ -1122,6 +1152,7 @@ func DisruptionIsNotReinjectable(kind chaostypes.DisruptionKindName) bool {
11221152
// the chaos pod. So once the chaos pod is gone, there's nothing left for us to clean.
11231153
var NoSideEffectDisruptions = map[chaostypes.DisruptionKindName]struct{}{
11241154
chaostypes.DisruptionKindNodeFailure: {},
1155+
chaostypes.DisruptionKindPodReplacement: {},
11251156
chaostypes.DisruptionKindContainerFailure: {},
11261157
}
11271158

api/v1beta1/disruption_webhook.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,9 @@ func (r *Disruption) initialSafetyNets() ([]string, error) {
448448
responses = append(responses, "node failure disruptions are not allowed in this cluster, please use a disruption type or test elsewhere")
449449
}
450450

451+
// Pod replacement disruptions are pod-level and don't require node-level permissions
452+
// They are allowed as long as the basic disruption requirements are met
453+
451454
if !allowNodeLevel && r.Spec.Level == chaostypes.DisruptionLevelNode {
452455
logger.Debugw("the specified disruption is applied at the node level and will be rejected")
453456

api/v1beta1/pod_replacement.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2025 Datadog, Inc.
5+
6+
package v1beta1
7+
8+
import (
	"fmt"
	"strconv"
)
9+
10+
// PodReplacementSpec represents a pod replacement disruption
type PodReplacementSpec struct {
	// NOTE: the JSON tag below intentionally has no omitempty. The CRD default is
	// true, so an explicit false must survive serialization; otherwise it would be
	// dropped on marshal and could be re-defaulted to true (deleting PVCs the user
	// asked to keep). The +optional marker keeps the field optional in the schema.

	// DeleteStorage determines if PVCs associated with the target pod should be deleted
	// +kubebuilder:default=true
	// +optional
	DeleteStorage bool `json:"deleteStorage"`
	// ForceDelete forces deletion of stuck pods by setting grace period to 0
	ForceDelete bool `json:"forceDelete,omitempty"`
	// GracePeriodSeconds specifies the grace period for pod deletion in seconds
	// If not specified, uses the pod's default grace period
	GracePeriodSeconds *int64 `json:"gracePeriodSeconds,omitempty"`
}

// Validate validates args for the given disruption.
//
// It returns an error when GracePeriodSeconds is set to a negative value, and
// nil otherwise (including when called on a nil spec).
func (s *PodReplacementSpec) Validate() error {
	if s == nil {
		return nil
	}

	// A negative grace period cannot be honored; reject it at validation time
	// rather than letting it reach the injector.
	if s.GracePeriodSeconds != nil && *s.GracePeriodSeconds < 0 {
		return fmt.Errorf("spec.podReplacement.gracePeriodSeconds must be greater than or equal to 0, found %d", *s.GracePeriodSeconds)
	}

	return nil
}

// GenerateArgs generates injection or cleanup pod arguments for the given spec.
//
// The result always starts with "pod-replacement", "inject"; the optional flags
// --delete-storage, --force-delete and --grace-period-seconds <n> are appended,
// in that order, when the matching field is set.
func (s *PodReplacementSpec) GenerateArgs() []string {
	args := []string{
		"pod-replacement",
		"inject",
	}

	if s.DeleteStorage {
		args = append(args, "--delete-storage")
	}

	if s.ForceDelete {
		args = append(args, "--force-delete")
	}

	if s.GracePeriodSeconds != nil {
		args = append(args, "--grace-period-seconds", strconv.FormatInt(*s.GracePeriodSeconds, 10))
	}

	return args
}

// Explain returns a human-readable description of what the disruption does.
//
// The returned slice has two entries: an empty string followed by the
// explanation. The wording reflects DeleteStorage, so the text only claims
// storage is recreated when PVC deletion is actually requested.
func (s *PodReplacementSpec) Explain() []string {
	explanation := "spec.podReplacement will cordon the node hosting the target pod to prevent new pods from being scheduled, " +
		"then delete the target pod to force it to reschedule. "

	if s.DeleteStorage {
		explanation += "PersistentVolumeClaims associated with the pod will also be deleted to simulate complete storage loss. " +
			"This simulates a complete pod replacement scenario where both the pod and its storage are recreated. "
	} else {
		explanation += "PersistentVolumeClaims associated with the pod are left untouched, so only the pod itself is recreated. "
	}

	explanation += "Unlike node-level disruptions, this only affects the specifically targeted pod."

	return []string{"", explanation}
}

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 31 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,12 @@ spec:
271271
- pod
272272
- node
273273
type: string
274+
maxRuns:
275+
description: |-
276+
MaxRuns specifies the maximum number of times the disruption should be executed
277+
After this many runs, the disruption will become idle (default: unlimited for continuous disruptions)
278+
minimum: 1
279+
type: integer
274280
network:
275281
description: NetworkDisruptionSpec represents a network disruption injection
276282
nullable: true
@@ -492,6 +498,24 @@ spec:
492498
type: object
493499
onInit:
494500
type: boolean
501+
podReplacement:
502+
description: PodReplacementSpec represents a pod replacement disruption
503+
nullable: true
504+
properties:
505+
deleteStorage:
506+
default: true
507+
description: DeleteStorage determines if PVCs associated with the target pod should be deleted
508+
type: boolean
509+
forceDelete:
510+
description: ForceDelete forces deletion of stuck pods by setting grace period to 0
511+
type: boolean
512+
gracePeriodSeconds:
513+
description: |-
514+
GracePeriodSeconds specifies the grace period for pod deletion in seconds
515+
If not specified, uses the pod's default grace period
516+
format: int64
517+
type: integer
518+
type: object
495519
pulse:
496520
description: DisruptionPulse contains the active disruption duration and the dormant disruption duration
497521
nullable: true

chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,12 @@ spec:
272272
- pod
273273
- node
274274
type: string
275+
maxRuns:
276+
description: |-
277+
MaxRuns specifies the maximum number of times the disruption should be executed
278+
After this many runs, the disruption will become idle (default: unlimited for continuous disruptions)
279+
minimum: 1
280+
type: integer
275281
network:
276282
description: NetworkDisruptionSpec represents a network disruption injection
277283
nullable: true
@@ -493,6 +499,24 @@ spec:
493499
type: object
494500
onInit:
495501
type: boolean
502+
podReplacement:
503+
description: PodReplacementSpec represents a pod replacement disruption
504+
nullable: true
505+
properties:
506+
deleteStorage:
507+
default: true
508+
description: DeleteStorage determines if PVCs associated with the target pod should be deleted
509+
type: boolean
510+
forceDelete:
511+
description: ForceDelete forces deletion of stuck pods by setting grace period to 0
512+
type: boolean
513+
gracePeriodSeconds:
514+
description: |-
515+
GracePeriodSeconds specifies the grace period for pod deletion in seconds
516+
If not specified, uses the pod's default grace period
517+
format: int64
518+
type: integer
519+
type: object
496520
pulse:
497521
description: DisruptionPulse contains the active disruption duration and the dormant disruption duration
498522
nullable: true

chart/templates/generated/chaos.datadoghq.com_disruptions.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,12 @@ spec:
262262
- pod
263263
- node
264264
type: string
265+
maxRuns:
266+
description: |-
267+
MaxRuns specifies the maximum number of times the disruption should be executed
268+
After this many runs, the disruption will become idle (default: unlimited for continuous disruptions)
269+
minimum: 1
270+
type: integer
265271
network:
266272
description: NetworkDisruptionSpec represents a network disruption injection
267273
nullable: true
@@ -483,6 +489,24 @@ spec:
483489
type: object
484490
onInit:
485491
type: boolean
492+
podReplacement:
493+
description: PodReplacementSpec represents a pod replacement disruption
494+
nullable: true
495+
properties:
496+
deleteStorage:
497+
default: true
498+
description: DeleteStorage determines if PVCs associated with the target pod should be deleted
499+
type: boolean
500+
forceDelete:
501+
description: ForceDelete forces deletion of stuck pods by setting grace period to 0
502+
type: boolean
503+
gracePeriodSeconds:
504+
description: |-
505+
GracePeriodSeconds specifies the grace period for pod deletion in seconds
506+
If not specified, uses the pod's default grace period
507+
format: int64
508+
type: integer
509+
type: object
486510
pulse:
487511
description: DisruptionPulse contains the active disruption duration and the dormant disruption duration
488512
nullable: true
@@ -645,6 +669,9 @@ spec:
645669
type: boolean
646670
isStuckOnRemoval:
647671
type: boolean
672+
runCount:
673+
description: Number of times the disruption has been executed (chaos pods created)
674+
type: integer
648675
selectedTargetsCount:
649676
description: Actual targets selected by the disruption
650677
type: integer

0 commit comments

Comments
 (0)