diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml
index efa772ca4..dc271cc47 100644
--- a/sre/roles/faults/tasks/inject_custom.yaml
+++ b/sre/roles/faults/tasks/inject_custom.yaml
@@ -1,4 +1,13 @@
 ---
+- name: Low Resource Limits
+  ansible.builtin.import_tasks:
+    file: inject_low_resource_limits.yaml
+  vars:
+    spec: "{{ fault.custom.low_resource_limits }}"
+  when:
+    - fault.custom.name == 'low-resource-limits'
+    - fault.custom.low_resource_limits is defined
+
 - name: Import invalid command injection tasks
   ansible.builtin.import_tasks:
     file: inject_custom_invalid_command.yaml
diff --git a/sre/roles/faults/tasks/inject_low_resource_limits.yaml b/sre/roles/faults/tasks/inject_low_resource_limits.yaml
new file mode 100644
index 000000000..879bd45b1
--- /dev/null
+++ b/sre/roles/faults/tasks/inject_low_resource_limits.yaml
@@ -0,0 +1,68 @@
+---
+# Injects restrictive CPU/memory limits into the workloads listed in the
+# fault spec, then triggers a rollout restart so the new limits take effect.
+- name: Retrieve workload information for target services
+  kubernetes.core.k8s_info:
+    api_version: apps/v1
+    kind: "{{ workload.kind }}"
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    name: "{{ workload.name }}"
+    namespace: "{{ spec.namespace.name }}"
+  register: faults_workloads_info
+  loop: "{{ spec.workloads }}"
+  loop_control:
+    label: "{{ workload.kind | lower }}/{{ workload.name }}"
+    loop_var: workload
+
+- name: Patch workloads with restrictive resource limits
+  kubernetes.core.k8s:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    state: patched
+    api_version: "{{ result.resources[0].apiVersion }}"
+    kind: "{{ result.resources[0].kind }}"
+    name: "{{ result.resources[0].metadata.name }}"
+    namespace: "{{ result.resources[0].metadata.namespace }}"
+    definition:
+      spec:
+        template:
+          spec:
+            containers:
+              - name: "{{ result.resources[0].spec.template.spec.containers[0].name }}"
+                resources:
+                  limits:
+                    memory: "256Mi"
+                    cpu: "200m"
+                  requests:
+                    memory: "128Mi"
+                    cpu: "100m"
+  loop: "{{ faults_workloads_info.results }}"
+  loop_control:
+    label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}"
+    loop_var: result
+  when:
+    - faults_workloads_info is defined
+    - result.resources | length == 1
+
+- name: Restart workloads to apply new resource limits
+  kubernetes.core.k8s:
+    kubeconfig: "{{ faults_cluster.kubeconfig }}"
+    state: patched
+    api_version: "{{ result.resources[0].apiVersion }}"
+    kind: "{{ result.resources[0].kind }}"
+    name: "{{ result.resources[0].metadata.name }}"
+    namespace: "{{ result.resources[0].metadata.namespace }}"
+    definition:
+      spec:
+        template:
+          metadata:
+            annotations:
+              # Same mechanism as 'kubectl rollout restart'; requires gathered
+              # facts for ansible_date_time — TODO confirm facts are gathered.
+              kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}"
+  loop: "{{ faults_workloads_info.results }}"
+  loop_control:
+    label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}"
+    loop_var: result
+  when:
+    - faults_workloads_info is defined
+    - result.resources | length == 1
diff --git a/sre/roles/incidents/files/ground_truths/incident_32.yaml b/sre/roles/incidents/files/ground_truths/incident_32.yaml
new file mode 100644
index 000000000..6ab54e35e
--- /dev/null
+++ b/sre/roles/incidents/files/ground_truths/incident_32.yaml
@@ -0,0 +1,26 @@
+---
+fault: []
+alerts:
+  - id: CPUSpend
+    group_id: otel-demo-namespace-1
+    metadata:
+      description: CPU spend increased by 20 percent
+  - id: MemorySpend
+    group_id: otel-demo-namespace-1
+    metadata:
+      description: Memory spend has increased by 20 percent
+groups:
+  - id: otel-demo-namespace-1
+    kind: Namespace
+    name: otel-demo
+    namespace: otel-demo
+    root_cause: true
+aliases:
+  - - otel-demo-namespace-1
+propagations: []
+recommended_actions:
+  - solution:
+      id: "no_action"
+      actions:
+        - no changes are needed in the application
+        - update opencost alert to prevent false alerts
diff --git a/sre/roles/incidents/files/specs/incident_32.yaml b/sre/roles/incidents/files/specs/incident_32.yaml
new file mode 100644
index 000000000..67930390b
--- /dev/null
+++ b/sre/roles/incidents/files/specs/incident_32.yaml
@@ -0,0 +1,24 @@
+---
+metadata:
+  complexity: Low
+  id: 32
+  name: Low Resource Limits
+  platform: kubernetes
+spec:
+  environment:
+    applications:
+      otel_demo:
+        enabled: true
+    tools:
+      category: sre
+      selected:
+        - kubernetes-topology-monitor
+  faults:
+    - custom:
+        name: low-resource-limits
+        low_resource_limits:
+          namespace:
+            name: "{{ applications_helm_releases.otel_demo.namespace }}"
+          workloads:
+            - kind: Deployment
+              name: frontend