From 9e8e0c72abae45cc7cc3b4acc0a119fe87b268ae Mon Sep 17 00:00:00 2001 From: VincentCCandela Date: Mon, 20 Oct 2025 20:59:22 -0400 Subject: [PATCH 1/5] limits on memory and cpu fault --- sre/.ansible-lint-ignore | 2 - sre/a.out | 0 sre/ansible.cfg | 3 - sre/dev/local_cluster/kind-config.yaml.1 | 2185 +++++++++++++++++ sre/playbooks/roles | 1 + sre/roles/faults/tasks/inject_custom.yaml | 9 + .../tasks/inject_low_mem_cpu_constraints.yaml | 68 + .../files/ground_truths/incident_32.yaml | 26 + .../incidents/files/specs/incident_32.yaml | 23 + 9 files changed, 2312 insertions(+), 5 deletions(-) delete mode 100644 sre/.ansible-lint-ignore create mode 100644 sre/a.out delete mode 100644 sre/ansible.cfg create mode 100644 sre/dev/local_cluster/kind-config.yaml.1 create mode 120000 sre/playbooks/roles create mode 100644 sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml create mode 100644 sre/roles/incidents/files/ground_truths/incident_32.yaml create mode 100644 sre/roles/incidents/files/specs/incident_32.yaml diff --git a/sre/.ansible-lint-ignore b/sre/.ansible-lint-ignore deleted file mode 100644 index 716e787a8..000000000 --- a/sre/.ansible-lint-ignore +++ /dev/null @@ -1,2 +0,0 @@ -playbooks/generate_leaderboard_bundle_status.yaml yaml[line-length] -roles/applications/files/rules/otel-demo.yaml yaml[line-length] diff --git a/sre/a.out b/sre/a.out new file mode 100644 index 000000000..e69de29bb diff --git a/sre/ansible.cfg b/sre/ansible.cfg deleted file mode 100644 index 7e3ac396c..000000000 --- a/sre/ansible.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[defaults] -display_skipped_hosts = false -roles_path = roles diff --git a/sre/dev/local_cluster/kind-config.yaml.1 b/sre/dev/local_cluster/kind-config.yaml.1 new file mode 100644 index 000000000..9e4b5ef30 --- /dev/null +++ b/sre/dev/local_cluster/kind-config.yaml.1 @@ -0,0 +1,2185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ITBench-Scenarios/sre/dev/local_cluster/kind-config.yaml at main · itbench-hub/ITBench-Scenarios · GitHub + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ Skip to content + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + +
+ + + + + +
+ + + + + + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+ +
+
+ +
+ +
+

Footer

+ + + + +
+
+ + + + + © 2025 GitHub, Inc. + +
+ + +
+
+ + + + + + + + + + + + + + + + + + + + +
+
+
+ + + diff --git a/sre/playbooks/roles b/sre/playbooks/roles new file mode 120000 index 000000000..d8c4472ca --- /dev/null +++ b/sre/playbooks/roles @@ -0,0 +1 @@ +../roles \ No newline at end of file diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml index efa772ca4..4c1899445 100644 --- a/sre/roles/faults/tasks/inject_custom.yaml +++ b/sre/roles/faults/tasks/inject_custom.yaml @@ -1,4 +1,13 @@ --- +- name: Low Mem CPU Constraints + ansible.builtin.import_tasks: + file: inject_low_mem_cpu_constraints.yaml + vars: + spec: "{{ fault.custom.misconfigured_service_port }}" + when: + - fault.custom.name == 'low-mem-cpu-constraints' + - fault.custom.low_mem_cpu_constraints is defined + - name: Import invalid command injection tasks ansible.builtin.import_tasks: file: inject_custom_invalid_command.yaml diff --git a/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml b/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml new file mode 100644 index 000000000..d786dea84 --- /dev/null +++ b/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml @@ -0,0 +1,68 @@ +--- +- name: Create ResourceQuota with hard memory limits + kubernetes.core.k8s: + kubeconfig: "{{ faults_cluster.kubeconfig }}" + resource_definition: + apiVersion: v1 + kind: ResourceQuota + metadata: + name: "{{ spec.namespace.name }}-memory" + namespace: "{{ spec.namespace.name }}" + spec: + hard: + requests.memory: 1Gi + limits.memory: 2Gi + requests.cpu: "1" + limits.cpu: "2" + pods: "10" + state: present + +- name: Retrieve workload replica information + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: "{{ workload.kind }}" + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: "{{ workload.name }}" + namespace: "{{ spec.namespace.name }}" + register: faults_workloads_info + loop: "{{ spec.workloads }}" + loop_control: + label: "{{ workload.kind | lower }}/{{ workload.name }}" + loop_var: workload + when: + - workload.name != 'otel-collector' + +- name: Scale down workloads to 0 replicas + kubernetes.core.k8s_scale: + api_version: "{{ result.resources[0].apiVersion }}" + kind: "{{ result.resources[0].kind }}" + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: "{{ result.resources[0].metadata.name }}" + namespace: "{{ result.resources[0].metadata.namespace }}" + replicas: 0 + wait: true + wait_timeout: 60 + loop: "{{ faults_workloads_info.results }}" + loop_control: + label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}" + loop_var: result + when: + - faults_workloads_info is defined + - result.resources | length == 1 + +- name: Scale up workloads to original replica count + kubernetes.core.k8s_scale: + api_version: "{{ result.resources[0].apiVersion }}" + kind: "{{ result.resources[0].kind }}" + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: "{{ result.resources[0].metadata.name }}" + namespace: "{{ result.resources[0].metadata.namespace }}" + replicas: "{{ result.resources[0].spec.replicas }}" + wait: false + loop: "{{ faults_workloads_info.results }}" + loop_control: + label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}" + loop_var: result + when: + - faults_workloads_info is defined + - result.resources | length == 1 diff --git a/sre/roles/incidents/files/ground_truths/incident_32.yaml b/sre/roles/incidents/files/ground_truths/incident_32.yaml new file mode 100644 index 000000000..6ab54e35e --- /dev/null +++ b/sre/roles/incidents/files/ground_truths/incident_32.yaml @@ -0,0 +1,26 @@ +--- +fault: [] +alerts: + - id: CPUSpend + group_id: otel-demo-namespace-1 + metadata: + description: CPU spend increased by 20 percent + - id: MemorySpend + group_id: otel-demo-namespace-1 + metadata: + description: Memory spend has increased by 20 percent +groups: + - id: otel-demo-namespace-1 + kind: Namespace + name: otel-demo + namespace: otel-demo + root_cause: true +aliases: + - - otel-demo-namespace-1 +propagations: [] +recommended_actions: + - solution: + id: "no_action" + actions: + - no changes is needed in application + - update opencost alert to prevent false alerts diff --git a/sre/roles/incidents/files/specs/incident_32.yaml b/sre/roles/incidents/files/specs/incident_32.yaml new file mode 100644 index 000000000..bec82cead --- /dev/null +++ b/sre/roles/incidents/files/specs/incident_32.yaml @@ -0,0 +1,23 @@ +--- +metadata: + complexity: Low + id: 32 + name: Low Mem CPU Constraints + platform: kubernetes +spec: + environment: + applications: + otel_demo: + enabled: true + tools: + category: sre + selected: + - kubernetes-topology-monitor + faults: + - custom: + name: misconfigured-resource-quota + misconfigured_network_policy: + workload: + kind: Deployment + name: frontend + namespace: "{{ applications_helm_releases.otel_demo.namespace }}" From 8cfde8221e0b72382b49ea40b13add52f217feb3 Mon Sep 17 00:00:00 2001 From: VincentCCandela Date: Fri, 24 Oct 2025 13:36:45 -0400 Subject: [PATCH 2/5] basic fixes --- sre/playbooks/roles | 1 - 1 file changed, 1 deletion(-) delete mode 120000 sre/playbooks/roles diff --git a/sre/playbooks/roles b/sre/playbooks/roles deleted file mode 120000 index d8c4472ca..000000000 --- a/sre/playbooks/roles +++ /dev/null @@ -1 +0,0 @@ -../roles \ No newline at end of file From e8f041dfca980c4bb64552a4ccb2061453eda25a Mon Sep 17 00:00:00 2001 From: VincentCCandela Date: Fri, 24 Oct 2025 13:38:20 -0400 Subject: [PATCH 3/5] basic fixes --- sre/.ansible-lint-ignore | 2 + sre/a.out | 0 sre/ansible.cfg | 3 + sre/dev/local_cluster/kind-config.yaml.1 | 2185 ----------------- sre/roles/faults/tasks/inject_custom.yaml | 2 +- .../incidents/files/specs/incident_32.yaml | 2 +- 6 files changed, 7 insertions(+), 2187 deletions(-) create mode 100644 sre/.ansible-lint-ignore delete mode 100644 sre/a.out create mode 100644 sre/ansible.cfg delete mode 100644 sre/dev/local_cluster/kind-config.yaml.1 diff --git a/sre/.ansible-lint-ignore b/sre/.ansible-lint-ignore new file mode 100644 index 000000000..716e787a8 --- /dev/null +++ b/sre/.ansible-lint-ignore @@ -0,0 +1,2 @@ +playbooks/generate_leaderboard_bundle_status.yaml yaml[line-length] +roles/applications/files/rules/otel-demo.yaml yaml[line-length] diff --git a/sre/a.out b/sre/a.out deleted file mode 100644 index e69de29bb..000000000 diff --git a/sre/ansible.cfg b/sre/ansible.cfg new file mode 100644 index 000000000..7e3ac396c --- /dev/null +++ b/sre/ansible.cfg @@ -0,0 +1,3 @@ +[defaults] +display_skipped_hosts = false +roles_path = roles diff --git a/sre/dev/local_cluster/kind-config.yaml.1 b/sre/dev/local_cluster/kind-config.yaml.1 deleted file mode 100644 index 9e4b5ef30..000000000 --- a/sre/dev/local_cluster/kind-config.yaml.1 +++ /dev/null @@ -1,2185 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ITBench-Scenarios/sre/dev/local_cluster/kind-config.yaml at main · itbench-hub/ITBench-Scenarios · GitHub - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - -
- Skip to content - - - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - -
- -
- - - - - - - - -
- - - - - -
- - - - - - - - - -
-
-
- - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - -
-
- - - - -
- -
- -
-
- -
- -
-

Footer

- - - - -
-
- - - - - © 2025 GitHub, Inc. - -
- - -
-
- - - - - - - - - - - - - - - - - - - - -
-
-
- - - diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml index 4c1899445..ab2e203b7 100644 --- a/sre/roles/faults/tasks/inject_custom.yaml +++ b/sre/roles/faults/tasks/inject_custom.yaml @@ -7,7 +7,7 @@ when: - fault.custom.name == 'low-mem-cpu-constraints' - fault.custom.low_mem_cpu_constraints is defined - + - name: Import invalid command injection tasks ansible.builtin.import_tasks: file: inject_custom_invalid_command.yaml diff --git a/sre/roles/incidents/files/specs/incident_32.yaml b/sre/roles/incidents/files/specs/incident_32.yaml index bec82cead..8ca95c585 100644 --- a/sre/roles/incidents/files/specs/incident_32.yaml +++ b/sre/roles/incidents/files/specs/incident_32.yaml @@ -14,7 +14,7 @@ spec: selected: - kubernetes-topology-monitor faults: - - custom: + - custom: name: misconfigured-resource-quota misconfigured_network_policy: workload: From 40d8334d244eaf68ad6b77923e106f5e599ca456 Mon Sep 17 00:00:00 2001 From: VincentCCandela Date: Sun, 26 Oct 2025 23:58:02 -0400 Subject: [PATCH 4/5] pod-level resource limits instead of namespace level --- .../tasks/inject_low_mem_cpu_constraints.yaml | 62 +++++++++---------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml b/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml index d786dea84..879bd45b1 100644 --- a/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml +++ b/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml @@ -1,23 +1,5 @@ --- -- name: Create ResourceQuota with hard memory limits - kubernetes.core.k8s: - kubeconfig: "{{ faults_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ResourceQuota - metadata: - name: "{{ spec.namespace.name }}-memory" - namespace: "{{ spec.namespace.name }}" - spec: - hard: - requests.memory: 1Gi - limits.memory: 2Gi - requests.cpu: "1" - limits.cpu: "2" - pods: "10" - state: present - -- name: Retrieve workload replica information +- name: Retrieve workload information for target services kubernetes.core.k8s_info: api_version: apps/v1 kind: "{{ workload.kind }}" @@ -30,18 +12,29 @@ label: "{{ workload.kind | lower }}/{{ workload.name }}" loop_var: workload when: - - workload.name != 'otel-collector' + - workload.name in ['valkey', 'postgresql'] # Only target specific services -- name: Scale down workloads to 0 replicas - kubernetes.core.k8s_scale: +- name: Patch workloads with restrictive resource limits + kubernetes.core.k8s: + kubeconfig: "{{ faults_cluster.kubeconfig }}" + state: patched api_version: "{{ result.resources[0].apiVersion }}" kind: "{{ result.resources[0].kind }}" - kubeconfig: "{{ faults_cluster.kubeconfig }}" name: "{{ result.resources[0].metadata.name }}" namespace: "{{ result.resources[0].metadata.namespace }}" - replicas: 0 - wait: true - wait_timeout: 60 + definition: + spec: + template: + spec: + containers: + - name: "{{ result.resources[0].spec.template.spec.containers[0].name }}" + resources: + limits: + memory: "256Mi" + cpu: "200m" + requests: + memory: "128Mi" + cpu: "100m" loop: "{{ faults_workloads_info.results }}" loop_control: label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}" @@ -50,19 +43,24 @@ - faults_workloads_info is defined - result.resources | length == 1 -- name: Scale up workloads to original replica count - kubernetes.core.k8s_scale: +- name: Restart workloads to apply new resource limits + kubernetes.core.k8s: + kubeconfig: "{{ faults_cluster.kubeconfig }}" + state: patched api_version: "{{ result.resources[0].apiVersion }}" kind: "{{ result.resources[0].kind }}" - kubeconfig: "{{ faults_cluster.kubeconfig }}" name: "{{ result.resources[0].metadata.name }}" namespace: "{{ result.resources[0].metadata.namespace }}" - replicas: "{{ result.resources[0].spec.replicas }}" - wait: false + definition: + spec: + template: + metadata: + annotations: + kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" loop: "{{ faults_workloads_info.results }}" loop_control: label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}" loop_var: result when: - faults_workloads_info is defined - - result.resources | length == 1 + - result.resources | length == 1 \ No newline at end of file From 53105fcdb6ee4f4177faaa3c1d9cd32a81ae8f6a Mon Sep 17 00:00:00 2001 From: VincentCCandela Date: Mon, 27 Oct 2025 00:24:07 -0400 Subject: [PATCH 5/5] minor filename changes --- sre/roles/faults/tasks/inject_custom.yaml | 8 ++++---- ...u_constraints.yaml => inject_low_resource_limits.yaml} | 0 sre/roles/incidents/files/specs/incident_32.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename sre/roles/faults/tasks/{inject_low_mem_cpu_constraints.yaml => inject_low_resource_limits.yaml} (100%) diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml index ab2e203b7..dc271cc47 100644 --- a/sre/roles/faults/tasks/inject_custom.yaml +++ b/sre/roles/faults/tasks/inject_custom.yaml @@ -1,12 +1,12 @@ --- -- name: Low Mem CPU Constraints +- name: Low Resource Limits ansible.builtin.import_tasks: - file: inject_low_mem_cpu_constraints.yaml + file: inject_low_resource_limits.yaml vars: spec: "{{ fault.custom.misconfigured_service_port }}" when: - - fault.custom.name == 'low-mem-cpu-constraints' - - fault.custom.low_mem_cpu_constraints is defined + - fault.custom.name == 'low-resource-limits' + - fault.custom.inject_low_resource_limits is defined - name: Import invalid command injection tasks ansible.builtin.import_tasks: diff --git a/sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml b/sre/roles/faults/tasks/inject_low_resource_limits.yaml similarity index 100% rename from sre/roles/faults/tasks/inject_low_mem_cpu_constraints.yaml rename to sre/roles/faults/tasks/inject_low_resource_limits.yaml diff --git a/sre/roles/incidents/files/specs/incident_32.yaml b/sre/roles/incidents/files/specs/incident_32.yaml index 8ca95c585..67930390b 100644 --- a/sre/roles/incidents/files/specs/incident_32.yaml +++ b/sre/roles/incidents/files/specs/incident_32.yaml @@ -2,7 +2,7 @@ metadata: complexity: Low id: 32 - name: Low Mem CPU Constraints + name: Low Resource Limits platform: kubernetes spec: environment: