67 changes: 67 additions & 0 deletions sre/roles/faults/meta/argument_specs.yaml
@@ -50,6 +50,7 @@ argument_specs:
- misconfigured-resource-quota
- misconfigured-service-port
- modify-environment-variables
- readiness-flap
- unsupported-image
required: true
type: str
@@ -291,6 +292,72 @@ argument_specs:
default: force
required: false
type: str
readiness_flap:
required: false
type: dict
options:
workload:
required: true
type: dict
options:
kind:
choices:
- Deployment
- StatefulSet
required: true
type: str
name:
required: true
type: str
namespace:
required: true
type: str
container:
required: false
type: dict
options:
name:
required: false
type: str
probe_config:
required: false
type: dict
options:
flap_interval_seconds:
required: false
default: 4
type: int
failure_threshold:
required: false
default: 1
type: int
success_threshold:
required: false
default: 1
type: int
period_seconds:
required: false
default: 1
type: int
chaos_mesh:
required: false
type: dict
options:
latency_ms:
required: false
default: 1000
type: int
drop_probability:
required: false
default: 0.7
type: float
restart_policy:
choices:
- force
- safe
default: force
required: false
type: str
unsupported_image:
required: false
type: dict
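For context, a minimal fault entry exercising just the required parts of this schema might look like the sketch below (the workload values are illustrative; probe_config and the other optional blocks fall back to the defaults declared above):

faults:
  - custom:
      name: readiness-flap
      readiness_flap:
        workload:
          kind: Deployment
          name: frontend
          namespace: default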
37 changes: 37 additions & 0 deletions sre/roles/faults/tasks/inject_custom_readiness_flap.yaml
@@ -0,0 +1,37 @@
---
- name: Inject readiness probe flapping fault
kubernetes.core.k8s_json_patch:
kubeconfig: "{{ faults_cluster.kubeconfig }}"
api_version: apps/v1
    kind: "{{ spec.readiness_flap.workload.kind }}"
    name: "{{ spec.readiness_flap.workload.name }}"
    namespace: "{{ spec.readiness_flap.workload.namespace }}"
    patch:
      - op: replace
        path: /spec/template/spec/containers/0/readinessProbe
        value:
          httpGet:
            path: "{{ spec.readiness_flap.probe_config.readiness_path | default('/ready') }}"
            port: "{{ spec.readiness_flap.probe_config.readiness_port | default(8080) }}"
          initialDelaySeconds: 0
          periodSeconds: "{{ spec.readiness_flap.probe_config.period_seconds | default(1) }}"
          failureThreshold: "{{ spec.readiness_flap.probe_config.failure_threshold | default(1) }}"
          successThreshold: "{{ spec.readiness_flap.probe_config.success_threshold | default(1) }}"
          timeoutSeconds: "{{ spec.readiness_flap.probe_config.timeout_seconds | default(1) }}"

- name: Confirm workload exists after readiness patch
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    kind: "{{ spec.readiness_flap.workload.kind }}"
    namespace: "{{ spec.readiness_flap.workload.namespace }}"
    name: "{{ spec.readiness_flap.workload.name }}"
register: faults_readiness_flap_rollout
retries: 10
delay: 3
until: faults_readiness_flap_rollout.resources | length > 0

- name: Display readiness-flap injection summary
ansible.builtin.debug:
msg:
- "Injected readiness probe flapping into Deployment '{{ spec.workload.name }}' in namespace '{{ spec.workload.namespace }}'."
- "Pods may now alternate between Ready and NotReady, simulating intermittent readiness probe failures."
37 changes: 37 additions & 0 deletions sre/roles/faults/tasks/remove_custom_readiness_flap.yaml
@@ -0,0 +1,37 @@
---
- name: Remove readiness probe flapping fault (restore stable probe)
kubernetes.core.k8s_json_patch:
kubeconfig: "{{ faults_cluster.kubeconfig }}"
api_version: apps/v1
    kind: "{{ spec.readiness_flap.workload.kind }}"
    name: "{{ spec.readiness_flap.workload.name }}"
    namespace: "{{ spec.readiness_flap.workload.namespace }}"
    patch:
      - op: replace
        path: /spec/template/spec/containers/0/readinessProbe
        value:
          httpGet:
            path: "{{ spec.readiness_flap.probe_config.readiness_path | default('/ready') }}"
            port: "{{ spec.readiness_flap.probe_config.readiness_port | default(8080) }}"
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
successThreshold: 1
timeoutSeconds: 2

- name: Confirm workload is present after probe restore
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    kind: "{{ spec.readiness_flap.workload.kind }}"
    namespace: "{{ spec.readiness_flap.workload.namespace }}"
    name: "{{ spec.readiness_flap.workload.name }}"
register: faults_readiness_flap_restore
retries: 10
delay: 3
until: faults_readiness_flap_restore.resources | length > 0

- name: Display readiness-flap removal summary
ansible.builtin.debug:
msg:
- "Restored stable readiness probe for Deployment '{{ spec.workload.name }}'."
- "Pods should now return to consistent Ready state."
63 changes: 63 additions & 0 deletions sre/roles/incidents/files/ground_truths/incident_200.yaml
@@ -0,0 +1,63 @@
---
metadata:
version: "v1"
fault:
- entity:
name: frontend
group_id: frontend
kind: Deployment
condition: Readiness probe misbehaves intermittently, causing pod flapping
category: Misconfiguration
    fault_mechanism: readiness-flap
alerts:
- id: FailedPodsDetected
group_id: frontend
metadata:
description: Multiple frontend pods repeatedly transition between Ready and NotReady.
- id: RequestErrorRate
group_id: frontend
metadata:
description: User-facing 5xx error rate increases during readiness probe failures.
- id: RequestLatency
group_id: frontend
metadata:
description: Latency spikes observed in frontend service due to pod churn.
- id: RequestLatency
group_id: cart-service-1
metadata:
description: Increased latency in cart-service due to upstream frontend-service timeouts.

groups:
- id: frontend
kind: Deployment
namespace: default
root_cause: true
- id: frontend-service
kind: Service
namespace: default
root_cause: false
- id: cart-service-1
kind: Deployment
namespace: default
root_cause: false
aliases:
- - frontend
- frontend-service
- - cart-service-1
propagations:
- source: frontend
target: frontend-service
condition: Readiness probe failing
effect: Service temporarily loses endpoints
- source: frontend-service
target: cart-service-1
condition: Upstream request timeouts
effect: Cascading latency and retry storms
recommended_actions:
- solution:
id: S1
actions:
- Inspect readiness probe configuration (`periodSeconds`, `failureThreshold`, `timeoutSeconds`)
- Adjust probe frequency and grace period to avoid transient failures
- Increase service endpoint stabilization timeout to prevent load balancer churn
- Validate container startup and dependency readiness
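As a concrete target for solution S1, the values the removal task restores are one reasonable stable configuration (the path and port shown are the role's fallbacks, not confirmed for this workload):

readinessProbe:
  httpGet:
    path: /ready
    port: 8080
  initialDelaySeconds: 5
  periodSeconds: 10
  failureThreshold: 3
  successThreshold: 1
  timeoutSeconds: 2

Relative to the injected probe, the longer period and higher failure threshold mean a pod must fail three consecutive 10-second checks before it is pulled from the endpoints, which absorbs transient blips.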
40 changes: 40 additions & 0 deletions sre/roles/incidents/files/specs/incident_200.yaml
@@ -0,0 +1,40 @@
---
metadata:
version: "v1"
scenario:
metadata:
complexity: Medium
id: 200
name: Readiness Probe Flapping
platform: kubernetes
spec:
environment:
applications:
otel_demo:
enabled: true
configuration:
traffic_generator:
enabled: true
tools:
category: sre
selected:
- chaos-mesh
- prometheus
- grafana
faults:
- custom:
name: readiness-flap
readiness_flap:
workload:
kind: Deployment
name: frontend
namespace: default
probe_config:
flap_interval_seconds: 8
failure_threshold: 1
success_threshold: 1
period_seconds: 2
chaos_mesh:
latency_ms: 1000
drop_probability: 0.7
restart_policy: force
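The diff does not show how the chaos_mesh block is consumed. If latency_ms drives a Chaos Mesh NetworkChaos delay experiment, a hand-written equivalent might look like the sketch below; the app: frontend label selector is an assumption about the otel-demo pod labels, and drop_probability: 0.7 would map to a separate loss experiment of the same shape:

apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: readiness-flap-delay
  namespace: default
spec:
  action: delay
  mode: all
  selector:
    namespaces:
      - default
    labelSelectors:
      app: frontend  # assumed label; not confirmed by this diff
  delay:
    latency: "1000ms"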