Patch: add the `exhaust-node-resources` custom fault and incident 41.

Rebuilt from a whitespace-collapsed copy of the patch. The indentation of the
context lines in the two argument_specs.yaml hunks is inferred, so apply with
`git apply --ignore-whitespace` or regenerate against the real file. The
`index` blob ids are carried over unchanged and do not describe the edited
content.

Changes made while rebuilding:
  * removal deletes the Schedule by its deterministic name; the injected
    object has no labels, so the former label lookup always came back empty
  * both confirmation lookups are now followed by an assertion
  * `workload.container` is declared as a dict with a `name`, matching the
    `spec.workload.container.name` reference in the injection task
  * the ground-truth group filter uses the literal Schedule name

diff --git a/sre/roles/faults/meta/argument_specs.yaml b/sre/roles/faults/meta/argument_specs.yaml
index 4e9b11f39..8e38c7cec 100644
--- a/sre/roles/faults/meta/argument_specs.yaml
+++ b/sre/roles/faults/meta/argument_specs.yaml
@@ -51,6 +51,7 @@ argument_specs:
                   - misconfigured-service-port
                   - modify-environment-variables
                   - unsupported-image
+                  - exhaust-node-resources
                 required: true
                 type: str
               invalid_command:
@@ -325,6 +326,33 @@ argument_specs:
                     default: force
                     required: false
                     type: str
+              exhaust_node_resources:
+                required: false
+                type: dict
+                options:
+                  workload:
+                    required: true
+                    type: dict
+                    options:
+                      kind:
+                        choices:
+                          - Deployment
+                          - StatefulSet
+                        required: true
+                        type: str
+                      name:
+                        required: true
+                        type: str
+                      namespace:
+                        required: true
+                        type: str
+                      container:
+                        required: true
+                        type: dict
+                        options:
+                          name:
+                            required: true
+                            type: str
           otel_demo:
             required: false
             type: dict
diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml
index efa772ca4..762c213de 100644
--- a/sre/roles/faults/tasks/inject_custom.yaml
+++ b/sre/roles/faults/tasks/inject_custom.yaml
@@ -88,3 +88,12 @@
   when:
     - fault.custom.name == 'unsupported-image'
     - fault.custom.unsupported_image is defined
+
+- name: Import exhaust node resources injection tasks
+  ansible.builtin.import_tasks:
+    file: inject_custom_exhaust_node_resources.yaml
+  vars:
+    spec: "{{ fault.custom.exhaust_node_resources }}"
+  when:
+    - fault.custom.name == 'exhaust-node-resources'
+    - fault.custom.exhaust_node_resources is defined
diff --git a/sre/roles/faults/tasks/inject_custom_exhaust_node_resources.yaml b/sre/roles/faults/tasks/inject_custom_exhaust_node_resources.yaml
new file mode 100644
index 000000000..f9711b106
--- /dev/null
+++ b/sre/roles/faults/tasks/inject_custom_exhaust_node_resources.yaml
@@ -0,0 +1,53 @@
+---
+- name: Import variable setting tasks
+  ansible.builtin.import_tasks:
+    file: set_faults_workload_container_index.yaml
+  vars:
+    faults_workload: "{{ spec.workload }}"
+
+# NOTE(review): Chaos Mesh documents pod-failure, pod-kill and container-kill
+# as the PodChaos actions and a cron expression for Schedule.spec.schedule;
+# confirm that `action: patch` and "@once" are accepted by the installed CRDs.
+- name: Inject resource-limit-removal via Chaos Mesh schedule
+  kubernetes.core.k8s:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    state: present
+    resource_definition:
+      apiVersion: chaos-mesh.org/v1alpha1
+      kind: Schedule
+      metadata:
+        name: "remove-limits-{{ spec.workload.name }}"
+        namespace: "{{ spec.workload.namespace }}"
+      spec:
+        schedule: "@once"
+        historyLimit: 1
+        concurrencyPolicy: Forbid
+        type: PodChaos
+        podChaos:
+          action: patch
+          mode: one
+          selector:
+            namespaces:
+              - "{{ spec.workload.namespace }}"
+            labelSelectors:
+              app.kubernetes.io/name: "{{ spec.workload.name }}"
+          patch:
+            containers:
+              - name: "{{ spec.workload.container.name }}"
+                resources:
+                  limits: {}
+
+- name: Confirm Chaos Mesh schedule created
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    api_version: chaos-mesh.org/v1alpha1
+    kind: Schedule
+    namespace: "{{ spec.workload.namespace }}"
+    name: "remove-limits-{{ spec.workload.name }}"
+  register: faults_schedule_info
+
+- name: Fail when the Chaos Mesh schedule is missing
+  ansible.builtin.assert:
+    that:
+      - faults_schedule_info.resources | length == 1
+    fail_msg: "Schedule remove-limits-{{ spec.workload.name }} was not created"
diff --git a/sre/roles/faults/tasks/remove_custom.yaml b/sre/roles/faults/tasks/remove_custom.yaml
index e868a86a7..21cb8c90a 100644
--- a/sre/roles/faults/tasks/remove_custom.yaml
+++ b/sre/roles/faults/tasks/remove_custom.yaml
@@ -88,3 +88,12 @@
   when:
     - fault.custom.name == 'unsupported-image'
     - fault.custom.unsupported_image is defined
+
+- name: Import exhaust node resources removal tasks
+  ansible.builtin.import_tasks:
+    file: remove_custom_exhaust_node_resources.yaml
+  vars:
+    spec: "{{ fault.custom.exhaust_node_resources }}"
+  when:
+    - fault.custom.name == 'exhaust-node-resources'
+    - fault.custom.exhaust_node_resources is defined
diff --git a/sre/roles/faults/tasks/remove_custom_exhaust_node_resources.yaml b/sre/roles/faults/tasks/remove_custom_exhaust_node_resources.yaml
new file mode 100644
index 000000000..9f6455bb8
--- /dev/null
+++ b/sre/roles/faults/tasks/remove_custom_exhaust_node_resources.yaml
@@ -0,0 +1,27 @@
+---
+# The injected Schedule carries no labels, so it is addressed by the same
+# deterministic name the injection tasks use instead of a label lookup.
+- name: Remove Chaos Mesh Schedule CR for target workload
+  kubernetes.core.k8s:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    state: absent
+    api_version: chaos-mesh.org/v1alpha1
+    kind: Schedule
+    name: "remove-limits-{{ spec.workload.name }}"
+    namespace: "{{ spec.workload.namespace }}"
+    wait: true
+
+- name: Confirm Chaos Mesh Schedule removal
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    api_version: chaos-mesh.org/v1alpha1
+    kind: Schedule
+    name: "remove-limits-{{ spec.workload.name }}"
+    namespace: "{{ spec.workload.namespace }}"
+  register: faults_schedule_post_delete
+
+- name: Fail when the Chaos Mesh Schedule still exists
+  ansible.builtin.assert:
+    that:
+      - faults_schedule_post_delete.resources | length == 0
+    fail_msg: "Schedule remove-limits-{{ spec.workload.name }} still exists"
diff --git a/sre/roles/incidents/files/ground_truths/incident_41.yaml b/sre/roles/incidents/files/ground_truths/incident_41.yaml
new file mode 100644
index 000000000..af48ca90b
--- /dev/null
+++ b/sre/roles/incidents/files/ground_truths/incident_41.yaml
@@ -0,0 +1,49 @@
+---
+metadata:
+  version: "v1"
+fault:
+  - entity:
+      name: remove-limits-cart
+      group_id: cart
+      kind: Schedule
+    name: exhaust-node-resources
+    condition: Node resource exhaustion via removed CPU/memory limits
+    category: Chaos
+    fault_mechanism: chaos
+
+alerts:
+  - id: RequestLatency
+    group_id: cart
+    metadata:
+      description: Latency of cart deployment increases due to CPU resource contention
+
+groups:
+  - id: remove-limits-cart
+    kind: Schedule
+    filter:
+      - remove-limits-cart
+    namespace: chaos-mesh
+    root_cause: true
+  - id: cart
+    kind: Deployment
+    filter:
+      - cart.*
+    namespace: otel-demo
+    root_cause: true
+
+aliases:
+  - - cart
+
+propagations:
+  - source: remove-limits-cart
+    target: cart
+    condition: Chaos Mesh Schedule removes CPU/memory limits from cart deployment
+    effect: Pods can consume more resources, potentially causing node exhaustion and higher latency
+
+recommended_actions:
+  - solution:
+      id: tune_resources
+      actions:
+        - Increase CPU/memory requests for cart deployment
+        - Use autoscaling or reduce workload concurrency
+        - Reapply CPU/memory limits as needed
diff --git a/sre/roles/incidents/files/specs/incident_41.yaml b/sre/roles/incidents/files/specs/incident_41.yaml
new file mode 100644
index 000000000..851024ebd
--- /dev/null
+++ b/sre/roles/incidents/files/specs/incident_41.yaml
@@ -0,0 +1,44 @@
+---
+metadata:
+  version: "v1"
+scenario:
+  metadata:
+    complexity: Medium
+    id: 41
+    name: Exhaust Node Resources
+    platform: kubernetes
+  spec:
+    environment:
+      applications:
+        otel_demo:
+          enabled: true
+      tools:
+        category: sre
+        selected:
+          - prometheus
+          - grafana
+          - jaeger
+          - chaos-mesh
+
+    faults:
+      - chaos_mesh:
+          schedule:
+            name: remove-limits-cart
+            spec:
+              schedule: "@once"
+              historyLimit: 1
+              concurrencyPolicy: Forbid
+              type: PodChaos
+              podChaos:
+                action: patch
+                mode: one
+                selector:
+                  namespaces:
+                    - "{{ applications_helm_releases.otel_demo.namespace }}"
+                  labelSelectors:
+                    app.kubernetes.io/name: "cart"
+                patch:
+                  containers:
+                    - name: cart
+                      resources:
+                        limits: {}