Skip to content
24 changes: 24 additions & 0 deletions sre/roles/faults/meta/argument_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ argument_specs:
- misconfigured-service-port
- modify-environment-variables
- unsupported-image
- exhaust-node-resources
required: true
type: str
invalid_command:
Expand Down Expand Up @@ -325,6 +326,29 @@ argument_specs:
default: force
required: false
type: str
# Arguments for the `exhaust-node-resources` custom fault. They identify the
# workload, and the container inside it, whose resource limits are removed.
exhaust_node_resources:
  required: false
  type: dict
  options:
    workload:
      required: true
      type: dict
      options:
        kind:
          choices:
            - Deployment
            - StatefulSet
          required: true
          type: str
        name:
          required: true
          type: str
        namespace:
          required: true
          type: str
        container:
          # Declared as a dict with a `name` key because the injection tasks
          # read `spec.workload.container.name`; a plain string here would
          # pass validation and then fail at injection time.
          required: true
          type: dict
          options:
            name:
              required: true
              type: str
otel_demo:
required: false
type: dict
Expand Down
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/inject_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Hands off to the exhaust-node-resources injection tasks, passing the fault's
# argument dict as `spec`. Both conditions must hold for the tasks to run.
- name: Import exhaust node resources injection tasks
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  ansible.builtin.import_tasks:
    file: inject_custom_exhaust_node_resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
# Injects the `exhaust-node-resources` fault by creating a Chaos Mesh Schedule
# named `remove-limits-<workload name>` in the workload's namespace.
#
# Inputs read by these tasks:
#   spec.workload.name / .namespace / .container.name
#   faults_cluster.kubeconfig

- name: Import variable setting tasks
  ansible.builtin.import_tasks:
    file: set_faults_workload_container_index.yaml
  vars:
    faults_workload: "{{ spec.workload }}"

# NOTE(review): the Chaos Mesh PodChaos documentation lists `pod-failure`,
# `pod-kill` and `container-kill` as actions; `patch` does not appear to be
# one of them. Confirm the installed CRD accepts this spec.
- name: Inject resource-limit-removal via Chaos Mesh schedule
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    state: present
    resource_definition:
      apiVersion: chaos-mesh.org/v1alpha1
      kind: Schedule
      metadata:
        name: "remove-limits-{{ spec.workload.name }}"
        namespace: "{{ spec.workload.namespace }}"
        # The removal tasks select Schedules by this label, so it must be
        # present on the object or the fault is never cleaned up.
        labels:
          app.kubernetes.io/name: "{{ spec.workload.name }}"
      spec:
        schedule: "@once"
        historyLimit: 1
        concurrencyPolicy: Forbid
        type: PodChaos
        podChaos:
          action: patch
          mode: one
          selector:
            namespaces:
              - "{{ spec.workload.namespace }}"
            labelSelectors:
              app.kubernetes.io/name: "{{ spec.workload.name }}"
          patch:
            containers:
              - name: "{{ spec.workload.container.name }}"
                resources:
                  limits: {}

- name: Confirm Chaos Mesh schedule created
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: Schedule
    namespace: "{{ spec.workload.namespace }}"
    name: "remove-limits-{{ spec.workload.name }}"
  register: faults_schedule_info
  # Fail loudly if the Schedule is missing instead of only recording the
  # lookup result.
  failed_when: faults_schedule_info.resources | default([]) | length == 0
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/remove_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Hands off to the exhaust-node-resources removal tasks, passing the fault's
# argument dict as `spec`. Both conditions must hold for the tasks to run.
- name: Import exhaust node resources removal tasks
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  ansible.builtin.import_tasks:
    file: remove_custom_exhaust_node_resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# Removes the Chaos Mesh Schedule created for the `exhaust-node-resources`
# fault and verifies that it is gone.
#
# Inputs read by these tasks:
#   spec.workload.name / .namespace
#   faults_cluster.kubeconfig

- name: Retrieve Chaos Mesh Schedule objects for target workload
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: Schedule
    namespace: "{{ spec.workload.namespace }}"
    # Look the Schedule up by the deterministic name the injection task gives
    # it (`remove-limits-<workload name>`) rather than by label.
    name: "remove-limits-{{ spec.workload.name }}"
  register: faults_schedule_list

# Looping over an empty result is a no-op, so no extra `when` guard is needed.
- name: Remove Chaos Mesh Schedule CRs related to this workload
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    state: absent
    api_version: chaos-mesh.org/v1alpha1
    kind: Schedule
    name: "{{ item.metadata.name }}"
    namespace: "{{ spec.workload.namespace }}"
    # Block until the object is really gone so the confirmation below does
    # not race with an in-flight deletion.
    wait: true
  loop: "{{ faults_schedule_list.resources }}"
  loop_control:
    label: "{{ item.metadata.name }}"

- name: Confirm Chaos Mesh Schedule removal
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: Schedule
    namespace: "{{ spec.workload.namespace }}"
    # Scope the check to this fault's Schedule; unrelated Schedules in the
    # namespace must not affect the result.
    name: "remove-limits-{{ spec.workload.name }}"
  register: faults_schedule_post_delete
  failed_when: faults_schedule_post_delete.resources | default([]) | length > 0
49 changes: 49 additions & 0 deletions sre/roles/incidents/files/ground_truths/incident_41.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
# Ground truth for incident 41: a Chaos Mesh Schedule strips the CPU/memory
# limits from the cart workload, and the cart deployment shows higher latency.
metadata:
  version: "v1"
fault:
  - entity:
      name: remove-limits-cart
      # The entity is the Schedule, so it refers to the Schedule group below
      # rather than the cart Deployment group.
      group_id: remove-limits-cart
      kind: Schedule
    name: exhaust-node-resources
    condition: Node resource exhaustion via removed CPU/memory limits
    category: Chaos
    fault_mechanism: chaos

alerts:
  - id: RequestLatency
    group_id: cart
    metadata:
      description: Latency of cart deployment increases due to CPU resource contention

groups:
  - id: remove-limits-cart
    kind: Schedule
    filter:
      # Literal object name. A Jinja expression built from the entity name
      # would expand to `remove-limits-remove-limits-cart` and match nothing.
      - remove-limits-cart
    # NOTE(review): confirm the namespace the Schedule is actually created in.
    namespace: chaos-mesh
    root_cause: true
  - id: cart
    kind: Deployment
    filter:
      - cart.*
    namespace: otel-demo
    root_cause: true

aliases:
  - - cart

propagations:
  - source: remove-limits-cart
    target: cart
    condition: Chaos Mesh Schedule removes CPU/memory limits from cart deployment
    effect: Pods can consume more resources, potentially causing node exhaustion and higher latency

recommended_actions:
  - solution:
      id: tune_resources
      actions:
        - Increase CPU/memory requests for cart deployment
        - Use autoscaling or reduce workload concurrency
        - Reapply CPU/memory limits as needed
44 changes: 44 additions & 0 deletions sre/roles/incidents/files/specs/incident_41.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
---
# Scenario definition for incident 41 ("Exhaust Node Resources").
metadata:
version: "v1"
scenario:
metadata:
complexity: Medium
id: 41
name: Exhaust Node Resources
platform: kubernetes
spec:
# Environment: the otel_demo application is enabled and the tools listed
# under `selected` are requested.
environment:
applications:
otel_demo:
enabled: true
tools:
category: sre
selected:
- prometheus
- grafana
- jaeger
- chaos-mesh

# A single fault: a Chaos Mesh Schedule named `remove-limits-cart`.
faults:
- chaos_mesh:
schedule:
name: remove-limits-cart
spec:
schedule: "@once"
historyLimit: 1
concurrencyPolicy: Forbid
type: PodChaos
podChaos:
# NOTE(review): the Chaos Mesh PodChaos documentation lists `pod-failure`,
# `pod-kill` and `container-kill` as actions; confirm `patch` is accepted.
action: patch
mode: one
# Targets pods labelled app.kubernetes.io/name=cart in the otel_demo
# release namespace.
selector:
namespaces:
- "{{ applications_helm_releases.otel_demo.namespace }}"
labelSelectors:
app.kubernetes.io/name: "cart"
# The patch sets the `cart` container's resource limits to an empty mapping.
patch:
containers:
- name: cart
resources:
limits: {}
Loading