diff --git a/sre/roles/faults/meta/argument_specs.yaml b/sre/roles/faults/meta/argument_specs.yaml
index 4e9b11f39..6b2d8157c 100644
--- a/sre/roles/faults/meta/argument_specs.yaml
+++ b/sre/roles/faults/meta/argument_specs.yaml
@@ -50,6 +50,7 @@ argument_specs:
           - misconfigured-resource-quota
           - misconfigured-service-port
           - modify-environment-variables
+          - readiness-flap
           - unsupported-image
         required: true
         type: str
@@ -291,6 +292,72 @@ argument_specs:
             default: force
             required: false
             type: str
+      readiness_flap:
+        required: false
+        type: dict
+        options:
+          workload:
+            required: true
+            type: dict
+            options:
+              kind:
+                choices:
+                  - Deployment
+                  - StatefulSet
+                required: true
+                type: str
+              name:
+                required: true
+                type: str
+              namespace:
+                required: true
+                type: str
+          container:
+            required: false
+            type: dict
+            options:
+              name:
+                required: false
+                type: str
+          probe_config:
+            required: false
+            type: dict
+            options:
+              flap_interval_seconds:
+                required: false
+                default: 4
+                type: int
+              failure_threshold:
+                required: false
+                default: 1
+                type: int
+              success_threshold:
+                required: false
+                default: 1
+                type: int
+              period_seconds:
+                required: false
+                default: 1
+                type: int
+          chaos_mesh:
+            required: false
+            type: dict
+            options:
+              latency_ms:
+                required: false
+                default: 1000
+                type: int
+              drop_probability:
+                required: false
+                default: 0.7
+                type: float
+          restart_policy:
+            choices:
+              - force
+              - safe
+            default: force
+            required: false
+            type: str
       unsupported_image:
         required: false
         type: dict
diff --git a/sre/roles/faults/tasks/inject_custom_readiness_flap.yaml b/sre/roles/faults/tasks/inject_custom_readiness_flap.yaml
new file mode 100644
index 000000000..64a0dca1b
--- /dev/null
+++ b/sre/roles/faults/tasks/inject_custom_readiness_flap.yaml
@@ -0,0 +1,37 @@
+---
+- name: Inject readiness probe flapping fault
+  kubernetes.core.k8s_json_patch:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    api_version: apps/v1
+    kind: Deployment
+    name: "{{ spec.workload.name }}"
+    namespace: "{{ spec.workload.namespace }}"
+    patch:
+      - op: replace
+        path: /spec/template/spec/containers/0/readinessProbe
+        value:
+          httpGet:
+            path: "{{ spec.readiness_flap.readiness_path | default('/ready') }}"
+            port: "{{ spec.readiness_flap.readiness_port | default(8080) }}"
+          initialDelaySeconds: 0
+          periodSeconds: "{{ spec.readiness_flap.period_seconds | default(1) }}"
+          failureThreshold: "{{ spec.readiness_flap.failure_threshold | default(1) }}"
+          successThreshold: "{{ spec.readiness_flap.success_threshold | default(1) }}"
+          timeoutSeconds: "{{ spec.readiness_flap.timeout_seconds | default(1) }}"
+
+- name: Wait for Deployment rollout to apply readiness patch
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    kind: Deployment
+    namespace: "{{ spec.workload.namespace }}"
+    name: "{{ spec.workload.name }}"
+  register: faults_readiness_flap_rollout
+  retries: 10
+  delay: 3
+  until: faults_readiness_flap_rollout.resources | length > 0
+
+- name: Display readiness-flap injection summary
+  ansible.builtin.debug:
+    msg:
+      - "Injected readiness probe flapping into Deployment '{{ spec.workload.name }}' in namespace '{{ spec.workload.namespace }}'."
+      - "Pods may now alternate between Ready and NotReady, simulating intermittent readiness probe failures."
diff --git a/sre/roles/faults/tasks/remove_custom_readiness_flap.yaml b/sre/roles/faults/tasks/remove_custom_readiness_flap.yaml
new file mode 100644
index 000000000..e15be3ef6
--- /dev/null
+++ b/sre/roles/faults/tasks/remove_custom_readiness_flap.yaml
@@ -0,0 +1,37 @@
+---
+- name: Remove readiness probe flapping fault (restore stable probe)
+  kubernetes.core.k8s_json_patch:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    api_version: apps/v1
+    kind: Deployment
+    name: "{{ spec.workload.name }}"
+    namespace: "{{ spec.workload.namespace }}"
+    patch:
+      - op: replace
+        path: /spec/template/spec/containers/0/readinessProbe
+        value:
+          httpGet:
+            path: "{{ spec.readiness_flap.readiness_path | default('/ready') }}"
+            port: "{{ spec.readiness_flap.readiness_port | default(8080) }}"
+          initialDelaySeconds: 5
+          periodSeconds: 10
+          failureThreshold: 3
+          successThreshold: 1
+          timeoutSeconds: 2
+
+- name: Confirm readiness probe restored
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    kind: Deployment
+    namespace: "{{ spec.workload.namespace }}"
+    name: "{{ spec.workload.name }}"
+  register: faults_readiness_flap_restore
+  retries: 10
+  delay: 3
+  until: faults_readiness_flap_restore.resources | length > 0
+
+- name: Display readiness-flap removal summary
+  ansible.builtin.debug:
+    msg:
+      - "Restored stable readiness probe for Deployment '{{ spec.workload.name }}'."
+      - "Pods should now return to consistent Ready state."
diff --git a/sre/roles/incidents/files/ground_truths/incident_200.yaml b/sre/roles/incidents/files/ground_truths/incident_200.yaml
new file mode 100644
index 000000000..b09934bb3
--- /dev/null
+++ b/sre/roles/incidents/files/ground_truths/incident_200.yaml
@@ -0,0 +1,63 @@
+---
+metadata:
+  version: "v1"
+fault:
+  - entity:
+      name: frontend
+      group_id: frontend
+      kind: Deployment
+    condition: Readiness probe misbehaves intermittently, causing pod flapping
+    category: Misconfiguration
+    fault_mechanism: readiness-probe-flap
+alerts:
+  - id: FailedPodsDetected
+    group_id: frontend
+    metadata:
+      description: Multiple frontend pods repeatedly transition between Ready and NotReady.
+  - id: RequestErrorRate
+    group_id: frontend
+    metadata:
+      description: User-facing 5xx error rate increases during readiness probe failures.
+  - id: RequestLatency
+    group_id: frontend
+    metadata:
+      description: Latency spikes observed in frontend service due to pod churn.
+  - id: RequestLatency
+    group_id: cart-service-1
+    metadata:
+      description: Increased latency in cart-service due to upstream frontend-service timeouts.
+
+groups:
+  - id: frontend
+    kind: Deployment
+    namespace: default
+    root_cause: true
+  - id: frontend-service
+    kind: Service
+    namespace: default
+    root_cause: false
+  - id: cart-service-1
+    kind: Deployment
+    namespace: default
+    root_cause: false
+aliases:
+  - - frontend
+    - frontend-service
+  - - cart-service-1
+propagations:
+  - source: frontend
+    target: frontend-service
+    condition: Readiness probe failing
+    effect: Service temporarily loses endpoints
+  - source: frontend-service
+    target: cart-service-1
+    condition: Upstream request timeouts
+    effect: Cascading latency and retry storms
+recommended_actions:
+  - solution:
+      id: S1
+      actions:
+        - Inspect readiness probe configuration (`periodSeconds`, `failureThreshold`, `timeoutSeconds`)
+        - Adjust probe frequency and grace period to avoid transient failures
+        - Increase service endpoint stabilization timeout to prevent load balancer churn
+        - Validate container startup and dependency readiness
diff --git a/sre/roles/incidents/files/specs/incident_200.yaml b/sre/roles/incidents/files/specs/incident_200.yaml
new file mode 100644
index 000000000..052960cd8
--- /dev/null
+++ b/sre/roles/incidents/files/specs/incident_200.yaml
@@ -0,0 +1,40 @@
+---
+metadata:
+  version: "v1"
+scenario:
+  metadata:
+    complexity: Medium
+    id: 200
+    name: Readiness Probe Flapping
+    platform: kubernetes
+  spec:
+    environment:
+      applications:
+        otel_demo:
+          enabled: true
+          configuration:
+            traffic_generator:
+              enabled: true
+      tools:
+        category: sre
+        selected:
+          - chaos-mesh
+          - prometheus
+          - grafana
+    faults:
+      - custom:
+          name: readiness-flap
+          readiness_flap:
+            workload:
+              kind: Deployment
+              name: frontend
+              namespace: default
+            probe_config:
+              flap_interval_seconds: 8
+              failure_threshold: 1
+              success_threshold: 1
+              period_seconds: 2
+            chaos_mesh:
+              latency_ms: 1000
+              drop_probability: 0.7
+            restart_policy: force