Skip to content
24 changes: 24 additions & 0 deletions sre/roles/faults/meta/argument_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ argument_specs:
- misconfigured-service-port
- modify-environment-variables
- unsupported-image
- exhaust-node-resources
required: true
type: str
invalid_command:
Expand Down Expand Up @@ -325,6 +326,29 @@ argument_specs:
default: force
required: false
type: str
# Options for the `exhaust-node-resources` custom fault. The injection and
# removal task files receive this mapping as `spec` and read
# `spec.workload.name` / `spec.workload.namespace` from it.
exhaust_node_resources:
  required: false
  type: dict
  options:
    # Workload targeted by the stress injection.
    workload:
      required: true
      type: dict
      options:
        kind:
          choices:
            - Deployment
            - StatefulSet
          required: true
          type: str
        name:
          required: true
          type: str
        namespace:
          required: true
          type: str
        # Incident specs supply `container` as a mapping with a `name` key,
        # so it is declared as a dict rather than a plain string.
        # NOTE(review): nesting under `workload` is assumed because the task
        # files pass `spec.workload` on as `faults_workload` — confirm.
        container:
          required: true
          type: dict
          options:
            name:
              required: true
              type: str
otel_demo:
required: false
type: dict
Expand Down
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/inject_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the exhaust-node-resources injection tasks. The fault's options
# are exposed to the imported tasks as `spec`; both conditions must hold for
# the imported tasks to run.
- name: Import exhaust node resources injection tasks
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  ansible.builtin.import_tasks:
    file: inject_custom_exhaust_node_resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
# Hand the targeted workload to the shared container-index tasks as
# `faults_workload`.
- name: Import variable setting tasks
  vars:
    faults_workload: "{{ spec.workload }}"
  ansible.builtin.import_tasks:
    file: set_faults_workload_container_index.yaml

# Create a Chaos Mesh StressChaos resource named `stress-chaos-<workload>` in
# the workload's namespace. It selects one pod (mode: one) carrying the label
# `app.kubernetes.io/name=<workload>` and applies CPU and memory stressors.
- name: Inject CPU and Memory stress using Chaos Mesh
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    state: present
    resource_definition:
      apiVersion: chaos-mesh.org/v1alpha1
      kind: StressChaos
      metadata:
        name: "stress-chaos-{{ spec.workload.name }}"
        namespace: "{{ spec.workload.namespace }}"
        # Label the chaos object with the workload name so it can be found
        # again through a label selector when the fault is removed.
        labels:
          app.kubernetes.io/name: "{{ spec.workload.name }}"
      spec:
        mode: one
        selector:
          namespaces:
            - "{{ spec.workload.namespace }}"
          labelSelectors:
            app.kubernetes.io/name: "{{ spec.workload.name }}"
        stressors:
          cpu:
            workers: 4
          memory:
            workers: 1
            size: "256MB"
        duration: "60s"
        # NOTE(review): `scheduler` may not be part of the StressChaos spec in
        # the installed Chaos Mesh version — confirm against the deployed CRD.
        scheduler:
          cron: "@once"

# Read back the StressChaos resource by name and fail the play if it does not
# exist, so a silently rejected injection is not mistaken for success.
- name: Confirm stress chaos created
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: StressChaos
    namespace: "{{ spec.workload.namespace }}"
    name: "stress-chaos-{{ spec.workload.name }}"
  register: faults_stress_chaos_info
  failed_when: faults_stress_chaos_info.resources | length == 0

# Restart the targeted workload unless the fault explicitly sets a
# `restart_policy` other than 'force' (an absent policy defaults to 'force').
- name: Import workload restart tasks
  when:
    - (spec.workload.restart_policy | default('force')) == 'force'
  vars:
    faults_workload: "{{ spec.workload }}"
  ansible.builtin.import_tasks:
    file: force_workload_restart.yaml
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/remove_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the exhaust-node-resources removal tasks. The fault's options
# are exposed to the imported tasks as `spec`; both conditions must hold for
# the imported tasks to run.
- name: Import exhaust node resources removal tasks
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  ansible.builtin.import_tasks:
    file: remove_custom_exhaust_node_resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# Remove the StressChaos resource created for the target workload.
#
# The resource is looked up by its deterministic name
# (`stress-chaos-<workload name>`) rather than by label, so removal does not
# depend on the chaos object carrying any particular labels.
- name: Retrieve StressChaos objects for target workload
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: StressChaos
    namespace: "{{ spec.workload.namespace }}"
    name: "stress-chaos-{{ spec.workload.name }}"
  register: faults_stress_chaos_list

# Looping over an empty result list is already a no-op, so no extra `when`
# guard is needed.
- name: Remove StressChaos CRs related to this workload
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    state: absent
    api_version: chaos-mesh.org/v1alpha1
    kind: StressChaos
    name: "{{ item.metadata.name }}"
    namespace: "{{ spec.workload.namespace }}"
  loop: "{{ faults_stress_chaos_list.resources }}"
  loop_control:
    label: "{{ item.metadata.name }}"

# Poll until the named resource is no longer returned; the task fails if it
# still exists after the final retry.
- name: Confirm StressChaos removal
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: chaos-mesh.org/v1alpha1
    kind: StressChaos
    namespace: "{{ spec.workload.namespace }}"
    name: "stress-chaos-{{ spec.workload.name }}"
  register: faults_stress_chaos_post_delete
  until: faults_stress_chaos_post_delete.resources | length == 0
  retries: 12
  delay: 5
37 changes: 37 additions & 0 deletions sre/roles/incidents/files/ground_truths/incident_41.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
---
# Ground truth for incident 41 (the exhaust-node-resources fault on the cart
# Deployment).
# Injected fault and the entity it targets.
fault:
- entity:
name: cart
group_id: cart
kind: Deployment
type: custom
name: exhaust-node-resources
condition: Node resource exhaustion
category: Change
fault_mechanism: exhaust-node-resources

# Alerts recorded for this incident; `group_id` matches the `cart` group
# declared under `groups`.
alerts:
- id: RequestLatency
group_id: cart
metadata:
description: Latency of cart deployment increases due to CPU resource contention

# Entity groups referenced by `group_id` elsewhere in this file.
groups:
- id: cart
kind: Deployment
namespace: otel-demo
filter:
- cart.*
root_cause: true

aliases:
- - cart

# No propagations are recorded for this incident.
propagations: []

recommended_actions:
- solution:
id: tune_resources
actions:
- Increase CPU/memory requests for cart deployment
- Use autoscaling or reduce workload concurrency
32 changes: 32 additions & 0 deletions sre/roles/incidents/files/specs/incident_41.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# Scenario specification for incident 41 ("Exhaust Node Resources").
metadata:
version: "v1"
scenario:
metadata:
complexity: Medium
id: 41
name: Exhaust Node Resources
platform: kubernetes
spec:
environment:
applications:
otel_demo:
enabled: true
tools:
category: sre
selected:
- prometheus
- grafana
- jaeger
- chaos-mesh

# Faults for this scenario: one custom fault, `exhaust-node-resources`,
# targeting the cart Deployment.
faults:
- custom:
name: exhaust-node-resources
exhaust_node_resources:
workload:
kind: Deployment
name: cart
# The namespace is templated from the otel_demo Helm release settings.
namespace: "{{ applications_helm_releases.otel_demo.namespace }}"
# `container` is given as a mapping with a `name` key.
container:
name: cart
Loading