From bf97b7e936754a99a48521c6c83c70edeb158c60 Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Tue, 28 Oct 2025 10:35:05 -0400 Subject: [PATCH 1/6] feat: expired tls certificate in service mesh Signed-off-by: Tyler Nguyen --- sre/roles/faults/meta/argument_specs.yaml | 21 ++++++ sre/roles/faults/tasks/inject_custom.yaml | 9 +++ ...nject_custom_expired_tls_certificates.yaml | 75 +++++++++++++++++++ sre/roles/faults/tasks/remove_custom.yaml | 9 +++ ...emove_custom_expired_tls_certificates.yaml | 50 +++++++++++++ .../files/ground_truths/incident_301.yaml | 33 ++++++++ .../incidents/files/specs/incident_301.yaml | 31 ++++++++ 7 files changed, 228 insertions(+) create mode 100644 sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml create mode 100644 sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml create mode 100644 sre/roles/incidents/files/ground_truths/incident_301.yaml create mode 100644 sre/roles/incidents/files/specs/incident_301.yaml diff --git a/sre/roles/faults/meta/argument_specs.yaml b/sre/roles/faults/meta/argument_specs.yaml index 4e9b11f39..938c141ff 100644 --- a/sre/roles/faults/meta/argument_specs.yaml +++ b/sre/roles/faults/meta/argument_specs.yaml @@ -51,6 +51,7 @@ argument_specs: - misconfigured-service-port - modify-environment-variables - unsupported-image + - expired-tls-certificates required: true type: str invalid_command: @@ -325,6 +326,26 @@ argument_specs: default: force required: false type: str + expired_tls_certificates: + required: false + type: dict + options: + istio_namespace: + default: istio-system + required: false + type: str + cert_ttl: + default: 1m + required: false + type: str + max_cert_ttl: + default: 2m + required: false + type: str + grace_period: + default: 30s + required: false + type: str otel_demo: required: false type: dict diff --git a/sre/roles/faults/tasks/inject_custom.yaml b/sre/roles/faults/tasks/inject_custom.yaml index efa772ca4..35e952f57 100644 --- a/sre/roles/faults/tasks/inject_custom.yaml +++ b/sre/roles/faults/tasks/inject_custom.yaml @@ -88,3 +88,12 @@ when: - fault.custom.name == 'unsupported-image' - fault.custom.unsupported_image is defined + +- name: Import expired TLS certificates injection tasks + ansible.builtin.import_tasks: + file: inject_custom_expired_tls_certificates.yaml + vars: + spec: "{{ fault.custom.expired_tls_certificates }}" + when: + - fault.custom.name == 'expired-tls-certificates' + - fault.custom.expired_tls_certificates is defined diff --git a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml new file mode 100644 index 000000000..e5a5517f7 --- /dev/null +++ b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml @@ -0,0 +1,75 @@ +--- + +- name: Retrieve current istiod deployment + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: istiod + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + register: istiod_deployment_info + +- name: Set original environment variables fact + ansible.builtin.set_fact: + original_istiod_env: "{{ istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | default([]) }}" + +- name: Construct new environment variables + ansible.builtin.set_fact: + new_istiod_env: >- + {{ + original_istiod_env | + rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD']) | + list + [ + { + "name": "DEFAULT_WORKLOAD_CERT_TTL", + "value": spec.cert_ttl | default('1m') + }, + { + "name": "MAX_WORKLOAD_CERT_TTL", + "value": spec.max_cert_ttl | default('2m') + }, + { + "name": "WORKLOAD_CERT_ROTATION_GRACE_PERIOD", + "value": spec.grace_period | default('30s') + } + ] + }} + +- name: Inject fault (expired TLS certificates) + kubernetes.core.k8s_json_patch: + api_version: apps/v1 + kind: Deployment + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: istiod + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + patch: + - op: replace + path: /spec/template/spec/containers/0/env + value: "{{ new_istiod_env }}" + +- name: Wait for istiod pods to restart with new configuration + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + kubeconfig: "{{ faults_cluster.kubeconfig }}" + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + label_selectors: + - app=istiod + wait: true + wait_condition: + type: Ready + status: "True" + wait_timeout: 300 + +- name: Display fault injection confirmation + ansible.builtin.debug: + msg: | + TLS certificate expiration fault injected successfully! + + Configuration applied: + - Default certificate TTL: {{ spec.cert_ttl | default('1m') }} + - Maximum certificate TTL: {{ spec.max_cert_ttl | default('2m') }} + - Certificate rotation grace period: {{ spec.grace_period | default('30s') }} + + This will cause certificates to expire quickly, disrupting service mesh communication. + Monitor your applications for TLS handshake failures and certificate validation errors. \ No newline at end of file diff --git a/sre/roles/faults/tasks/remove_custom.yaml b/sre/roles/faults/tasks/remove_custom.yaml index e868a86a7..581a6187b 100644 --- a/sre/roles/faults/tasks/remove_custom.yaml +++ b/sre/roles/faults/tasks/remove_custom.yaml @@ -88,3 +88,12 @@ when: - fault.custom.name == 'unsupported-image' - fault.custom.unsupported_image is defined + +- name: Import expired TLS certificates removal tasks + ansible.builtin.import_tasks: + file: remove_custom_expired_tls_certificates.yaml + vars: + spec: "{{ fault.custom.expired_tls_certificates }}" + when: + - fault.custom.name == 'expired-tls-certificates' + - fault.custom.expired_tls_certificates is defined diff --git a/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml new file mode 100644 index 000000000..6368d37dc --- /dev/null +++ b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml @@ -0,0 +1,50 @@ +--- +# This task removes the expired TLS certificates fault by restoring +# the original istiod deployment configuration + +- name: Retrieve current istiod deployment + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: istiod + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + register: istiod_deployment_info + +- name: Remove fault (restore original TLS certificate configuration) + kubernetes.core.k8s_json_patch: + api_version: apps/v1 + kind: Deployment + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: istiod + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + patch: + - op: replace + path: /spec/template/spec/containers/0/env + value: | + {{ (istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD']) | list) | to_json }} + +- name: Wait for istiod pods to restart with restored configuration + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + kubeconfig: "{{ faults_cluster.kubeconfig }}" + namespace: "{{ spec.istio_namespace | default('istio-system') }}" + label_selectors: + - app=istiod + wait: true + wait_condition: + type: Ready + status: "True" + wait_timeout: 300 + +- name: Display fault removal confirmation + ansible.builtin.debug: + msg: | + TLS certificate expiration fault removed successfully! + + Original certificate TTL settings have been restored. + Istio will now use default certificate lifetimes for new certificates. + + Note: Existing certificates with short TTLs may still be in use until they expire + and are automatically rotated. This process may take a few minutes. diff --git a/sre/roles/incidents/files/ground_truths/incident_301.yaml b/sre/roles/incidents/files/ground_truths/incident_301.yaml new file mode 100644 index 000000000..c70b858b0 --- /dev/null +++ b/sre/roles/incidents/files/ground_truths/incident_301.yaml @@ -0,0 +1,33 @@ +fault: + - category: network + condition: "Expired TLS Certificates" + entity: + kind: "kubernetes" + name: "istiod" + fault_mechanism: "Modifying Istio control plane environment variables" + +groups: + - kind: kubernetes + id: "istio-control-plane" + namespace: "istio-system" + root_cause: true + filter: + - "app=istiod" + +alerts: + - id: "RequestErrorRate" + group_id: "istio-control-plane" + metadata: + description: "Increased error rates due to TLS handshake failures" + +recommended_actions: + - solution: + id: "restore-istio-config" + actions: + - "Remove the fault by restoring the original istiod deployment configuration." + - "Monitor service mesh communication to ensure TLS handshakes are successful." + +aliases: + - ["istio-control-plane"] + +propagations: [] \ No newline at end of file diff --git a/sre/roles/incidents/files/specs/incident_301.yaml b/sre/roles/incidents/files/specs/incident_301.yaml new file mode 100644 index 000000000..bcf6ec096 --- /dev/null +++ b/sre/roles/incidents/files/specs/incident_301.yaml @@ -0,0 +1,31 @@ +--- +metadata: + complexity: High + id: 301 + name: Istio Service Mesh TLS Certificate Expiration + platform: kubernetes +spec: + environment: + applications: + otel_demo: + enabled: true + configuration: + load_generator: + spawn_rate: 5 + users: 100 + tools: + category: sre + selected: + - istio + - prometheus + - jaeger + - kubernetes-topology-monitor + faults: + - custom: + name: expired-tls-certificates + expired_tls_certificates: + istio_namespace: istio-system + cert_ttl: 30s + max_cert_ttl: 1m + grace_period: 10s +# ground truth \ No newline at end of file From 6bb68d7447c0550aedd66ea14fa1fee45c5468f4 Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Wed, 29 Oct 2025 01:07:39 -0400 Subject: [PATCH 2/6] Incident 301: ansible-lint playbook fixes --- ...nject_custom_expired_tls_certificates.yaml | 22 +++++++++---------- ...emove_custom_expired_tls_certificates.yaml | 20 +++++++++-------- .../incidents/files/specs/incident_301.yaml | 2 +- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml index e5a5517f7..d48a3126a 100644 --- a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml +++ b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml @@ -1,23 +1,22 @@ --- - - name: Retrieve current istiod deployment kubernetes.core.k8s_info: api_version: apps/v1 kind: Deployment kubeconfig: "{{ faults_cluster.kubeconfig }}" name: istiod - namespace: "{{ spec.istio_namespace | default('istio-system') }}" - register: istiod_deployment_info + namespace: "{{ spec.istio_namespace | default('istio-control') }}" + register: faults_istiod_deployment_info - name: Set original environment variables fact ansible.builtin.set_fact: - original_istiod_env: "{{ istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | default([]) }}" + faults_original_istiod_env: "{{ faults_istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | default([]) }}" - name: Construct new environment variables ansible.builtin.set_fact: - new_istiod_env: >- + faults_new_istiod_env: >- {{ - original_istiod_env | + faults_original_istiod_env | rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD']) | list + [ { @@ -41,20 +40,21 @@ kind: Deployment kubeconfig: "{{ faults_cluster.kubeconfig }}" name: istiod - namespace: "{{ spec.istio_namespace | default('istio-system') }}" + namespace: "{{ spec.istio_namespace | default('istio-control') }}" patch: - op: replace path: /spec/template/spec/containers/0/env - value: "{{ new_istiod_env }}" + value: "{{ faults_new_istiod_env }}" - name: Wait for istiod pods to restart with new configuration kubernetes.core.k8s_info: api_version: v1 kind: Pod kubeconfig: "{{ faults_cluster.kubeconfig }}" - namespace: "{{ spec.istio_namespace | default('istio-system') }}" + namespace: "{{ spec.istio_namespace | default('istio-control') }}" label_selectors: - app=istiod + - istio=pilot wait: true wait_condition: type: Ready @@ -63,7 +63,7 @@ - name: Display fault injection confirmation ansible.builtin.debug: - msg: | + msg: > TLS certificate expiration fault injected successfully! Configuration applied: @@ -72,4 +72,4 @@ - Certificate rotation grace period: {{ spec.grace_period | default('30s') }} This will cause certificates to expire quickly, disrupting service mesh communication. - Monitor your applications for TLS handshake failures and certificate validation errors. \ No newline at end of file + Monitor your applications for TLS handshake failures and certificate validation errors. diff --git a/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml index 6368d37dc..ce245a35a 100644 --- a/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml +++ b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml @@ -8,8 +8,8 @@ kind: Deployment kubeconfig: "{{ faults_cluster.kubeconfig }}" name: istiod - namespace: "{{ spec.istio_namespace | default('istio-system') }}" - register: istiod_deployment_info + namespace: "{{ spec.istio_namespace | default('istio-control') }}" + register: faults_istiod_deployment_info - name: Remove fault (restore original TLS certificate configuration) kubernetes.core.k8s_json_patch: @@ -17,21 +17,23 @@ kind: Deployment kubeconfig: "{{ faults_cluster.kubeconfig }}" name: istiod - namespace: "{{ spec.istio_namespace | default('istio-system') }}" + namespace: "{{ spec.istio_namespace | default('istio-control') }}" patch: - op: replace path: /spec/template/spec/containers/0/env value: | - {{ (istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD']) | list) | to_json }} + {{ (faults_istiod_deployment_info.resources[0].spec.template.spec.containers[0].env | + rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD']) | + list) | to_json }} - name: Wait for istiod pods to restart with restored configuration kubernetes.core.k8s_info: api_version: v1 kind: Pod kubeconfig: "{{ faults_cluster.kubeconfig }}" - namespace: "{{ spec.istio_namespace | default('istio-system') }}" + namespace: "{{ spec.istio_namespace | default('istio-control') }}" label_selectors: - - app=istiod + - istio=pilot wait: true wait_condition: type: Ready @@ -40,11 +42,11 @@ - name: Display fault removal confirmation ansible.builtin.debug: - msg: | + msg: > TLS certificate expiration fault removed successfully! - + Original certificate TTL settings have been restored. Istio will now use default certificate lifetimes for new certificates. - + Note: Existing certificates with short TTLs may still be in use until they expire and are automatically rotated. This process may take a few minutes. diff --git a/sre/roles/incidents/files/specs/incident_301.yaml b/sre/roles/incidents/files/specs/incident_301.yaml index bcf6ec096..e73798870 100644 --- a/sre/roles/incidents/files/specs/incident_301.yaml +++ b/sre/roles/incidents/files/specs/incident_301.yaml @@ -24,7 +24,7 @@ spec: - custom: name: expired-tls-certificates expired_tls_certificates: - istio_namespace: istio-system + istio_namespace: istio-control cert_ttl: 30s max_cert_ttl: 1m grace_period: 10s From 6a56e24f79977761bae2854ab7e0e75f95223f72 Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Thu, 30 Oct 2025 10:59:04 -0400 Subject: [PATCH 3/6] istio pod configuration debug --- sre/roles/tools/tasks/install_istio.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sre/roles/tools/tasks/install_istio.yaml b/sre/roles/tools/tasks/install_istio.yaml index 18df2d03c..efe314a19 100644 --- a/sre/roles/tools/tasks/install_istio.yaml +++ b/sre/roles/tools/tasks/install_istio.yaml @@ -62,3 +62,17 @@ type: "{{ 'NodePort' if tools_cluster.provider == 'kind' else 'LoadBalancer' }}" nodePorts: "{{ {'http': 30080, 'https': 30443} if tools_cluster.provider == 'kind' else omit }}" wait: true + +- name: Label otel-demo namespace for sidecar injection + kubernetes.core.k8s: + + kubeconfig: "{{ tools_cluster.kubeconfig }}" + state: present + resource_definition: + apiVersion: v1 + kind: Namespace + metadata: + name: otel-demo + labels: + istio-injection: enabled + it-bench/monitoring: "true" \ No newline at end of file From 6631324b65c0d688de38ece406ae47c9c7858d31 Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Tue, 4 Nov 2025 02:50:03 -0500 Subject: [PATCH 4/6] fix: final ansible --- .../tasks/inject_custom_expired_tls_certificates.yaml | 6 +++--- .../ground_truths/{incident_301.yaml => incident_42.yaml} | 0 .../files/specs/{incident_301.yaml => incident_42.yaml} | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename sre/roles/incidents/files/ground_truths/{incident_301.yaml => incident_42.yaml} (100%) rename sre/roles/incidents/files/specs/{incident_301.yaml => incident_42.yaml} (98%) diff --git a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml index d48a3126a..921570acb 100644 --- a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml +++ b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml @@ -63,13 +63,13 @@ - name: Display fault injection confirmation ansible.builtin.debug: - msg: > + msg: >- TLS certificate expiration fault injected successfully! - + Configuration applied: - Default certificate TTL: {{ spec.cert_ttl | default('1m') }} - Maximum certificate TTL: {{ spec.max_cert_ttl | default('2m') }} - Certificate rotation grace period: {{ spec.grace_period | default('30s') }} - + This will cause certificates to expire quickly, disrupting service mesh communication. Monitor your applications for TLS handshake failures and certificate validation errors. diff --git a/sre/roles/incidents/files/ground_truths/incident_301.yaml b/sre/roles/incidents/files/ground_truths/incident_42.yaml similarity index 100% rename from sre/roles/incidents/files/ground_truths/incident_301.yaml rename to sre/roles/incidents/files/ground_truths/incident_42.yaml diff --git a/sre/roles/incidents/files/specs/incident_301.yaml b/sre/roles/incidents/files/specs/incident_42.yaml similarity index 98% rename from sre/roles/incidents/files/specs/incident_301.yaml rename to sre/roles/incidents/files/specs/incident_42.yaml index e73798870..e64423ce9 100644 --- a/sre/roles/incidents/files/specs/incident_301.yaml +++ b/sre/roles/incidents/files/specs/incident_42.yaml @@ -1,7 +1,7 @@ --- metadata: complexity: High - id: 301 + id: 42 name: Istio Service Mesh TLS Certificate Expiration platform: kubernetes spec: From ef4ea09457ddbefb3df8c193787a4ff0c8a6da5a Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Tue, 4 Nov 2025 02:59:54 -0500 Subject: [PATCH 5/6] new line space debug --- .../faults/tasks/inject_custom_expired_tls_certificates.yaml | 2 -- sre/roles/incidents/files/ground_truths/incident_42.yaml | 2 +- sre/roles/incidents/files/specs/incident_42.yaml | 1 - sre/roles/tools/tasks/install_istio.yaml | 2 +- 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml index 921570acb..6302f454d 100644 --- a/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml +++ b/sre/roles/faults/tasks/inject_custom_expired_tls_certificates.yaml @@ -65,11 +65,9 @@ ansible.builtin.debug: msg: >- TLS certificate expiration fault injected successfully! - Configuration applied: - Default certificate TTL: {{ spec.cert_ttl | default('1m') }} - Maximum certificate TTL: {{ spec.max_cert_ttl | default('2m') }} - Certificate rotation grace period: {{ spec.grace_period | default('30s') }} - This will cause certificates to expire quickly, disrupting service mesh communication. Monitor your applications for TLS handshake failures and certificate validation errors. diff --git a/sre/roles/incidents/files/ground_truths/incident_42.yaml b/sre/roles/incidents/files/ground_truths/incident_42.yaml index c70b858b0..43f032e16 100644 --- a/sre/roles/incidents/files/ground_truths/incident_42.yaml +++ b/sre/roles/incidents/files/ground_truths/incident_42.yaml @@ -30,4 +30,4 @@ recommended_actions: aliases: - ["istio-control-plane"] -propagations: [] \ No newline at end of file +propagations: [] diff --git a/sre/roles/incidents/files/specs/incident_42.yaml b/sre/roles/incidents/files/specs/incident_42.yaml index e64423ce9..65f270c04 100644 --- a/sre/roles/incidents/files/specs/incident_42.yaml +++ b/sre/roles/incidents/files/specs/incident_42.yaml @@ -28,4 +28,3 @@ spec: cert_ttl: 30s max_cert_ttl: 1m grace_period: 10s -# ground truth \ No newline at end of file diff --git a/sre/roles/tools/tasks/install_istio.yaml b/sre/roles/tools/tasks/install_istio.yaml index efe314a19..8d7f1218b 100644 --- a/sre/roles/tools/tasks/install_istio.yaml +++ b/sre/roles/tools/tasks/install_istio.yaml @@ -75,4 +75,4 @@ name: otel-demo labels: istio-injection: enabled - it-bench/monitoring: "true" \ No newline at end of file + it-bench/monitoring: "true" From 3835a74d1f9ca927539897ba4d6933de66269cf4 Mon Sep 17 00:00:00 2001 From: Tyler Nguyen Date: Fri, 7 Nov 2025 13:38:10 -0500 Subject: [PATCH 6/6] Fix incident 301 issues and update incident_42 specs --- sre/roles/faults/tasks/inject.yaml | 18 ++++++ sre/roles/faults/tasks/remove.yaml | 18 ++++++ ...emove_custom_expired_tls_certificates.yaml | 10 ++++ .../files/ground_truths/incident_42.yaml | 6 ++ .../incidents/files/specs/incident_42.yaml | 59 ++++++++++--------- 5 files changed, 83 insertions(+), 28 deletions(-) diff --git a/sre/roles/faults/tasks/inject.yaml b/sre/roles/faults/tasks/inject.yaml index 2cd27934d..00681116e 100644 --- a/sre/roles/faults/tasks/inject.yaml +++ b/sre/roles/faults/tasks/inject.yaml @@ -24,3 +24,21 @@ file: inject_valkey.yaml when: - fault.valkey is defined + +# - name: Import misconfigured service mesh fault injection tasks (sidecar injection off) +# ansible.builtin.include_tasks: +# file: inject_custom_sidecar_injection_off.yaml +# vars: +# spec: "{{ fault.misconfigured_service_mesh.sidecar_injection_off }}" +# when: +# - fault.type == 'misconfigured_service_mesh' +# - fault.misconfigured_service_mesh.misconfiguration_type == 'sidecar_injection_off' + +# - name: Import misconfigured service mesh fault injection tasks (authorization policy deny) +# ansible.builtin.include_tasks: +# file: inject_custom_authorization_policy_deny.yaml +# vars: +# spec: "{{ fault.misconfigured_service_mesh.authorization_policy_deny }}" +# when: +# - fault.type == 'misconfigured_service_mesh' +# - fault.misconfigured_service_mesh.misconfiguration_type == 'authorization_policy_deny' diff --git a/sre/roles/faults/tasks/remove.yaml b/sre/roles/faults/tasks/remove.yaml index 48dd112a3..c4e992c7f 100644 --- a/sre/roles/faults/tasks/remove.yaml +++ b/sre/roles/faults/tasks/remove.yaml @@ -24,3 +24,21 @@ file: remove_valkey.yaml when: - fault.valkey is defined + +# - name: Import misconfigured service mesh fault removal tasks (sidecar injection off) +# ansible.builtin.include_tasks: +# file: remove_custom_sidecar_injection_off.yaml +# vars: +# spec: "{{ fault.misconfigured_service_mesh.sidecar_injection_off }}" +# when: +# - fault.type == 'misconfigured_service_mesh' +# - fault.misconfigured_service_mesh.misconfiguration_type == 'sidecar_injection_off' + +# - name: Import misconfigured service mesh fault removal tasks (authorization policy deny) +# ansible.builtin.include_tasks: +# file: remove_custom_authorization_policy_deny.yaml +# vars: +# spec: "{{ fault.misconfigured_service_mesh.authorization_policy_deny }}" +# when: +# - fault.type == 'misconfigured_service_mesh' +# - fault.misconfigured_service_mesh.misconfiguration_type == 'authorization_policy_deny' diff --git a/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml index ce245a35a..2e4262b85 100644 --- a/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml +++ b/sre/roles/faults/tasks/remove_custom_expired_tls_certificates.yaml @@ -11,6 +11,16 @@ namespace: "{{ spec.istio_namespace | default('istio-control') }}" register: faults_istiod_deployment_info +- name: Ensure istiod deployment exists + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + kubeconfig: "{{ faults_cluster.kubeconfig }}" + name: istiod + namespace: "{{ spec.istio_namespace | default('istio-control') }}" + register: istiod_deployment_info + failed_when: istiod_deployment_info.resources | length == 0 + - name: Remove fault (restore original TLS certificate configuration) kubernetes.core.k8s_json_patch: api_version: apps/v1 diff --git a/sre/roles/incidents/files/ground_truths/incident_42.yaml b/sre/roles/incidents/files/ground_truths/incident_42.yaml index 43f032e16..aba03c1a0 100644 --- a/sre/roles/incidents/files/ground_truths/incident_42.yaml +++ b/sre/roles/incidents/files/ground_truths/incident_42.yaml @@ -6,6 +6,12 @@ fault: name: "istiod" fault_mechanism: "Modifying Istio control plane environment variables" +metadata: + version: v1 +scenario: + metadata: {} + spec: {} + groups: - kind: kubernetes id: "istio-control-plane" diff --git a/sre/roles/incidents/files/specs/incident_42.yaml b/sre/roles/incidents/files/specs/incident_42.yaml index 65f270c04..c0a24fca6 100644 --- a/sre/roles/incidents/files/specs/incident_42.yaml +++ b/sre/roles/incidents/files/specs/incident_42.yaml @@ -1,30 +1,33 @@ --- metadata: - complexity: High - id: 42 - name: Istio Service Mesh TLS Certificate Expiration - platform: kubernetes -spec: - environment: - applications: - otel_demo: - enabled: true - configuration: - load_generator: - spawn_rate: 5 - users: 100 - tools: - category: sre - selected: - - istio - - prometheus - - jaeger - - kubernetes-topology-monitor - faults: - - custom: - name: expired-tls-certificates - expired_tls_certificates: - istio_namespace: istio-control - cert_ttl: 30s - max_cert_ttl: 1m - grace_period: 10s + version: v1 +scenario: + metadata: + complexity: High + id: 42 + name: Istio Service Mesh TLS Certificate Expiration + platform: kubernetes + spec: + environment: + applications: + otel_demo: + enabled: true + configuration: + load_generator: + spawn_rate: 5 + users: 100 + tools: + category: sre + selected: + - istio + - prometheus + - jaeger + - kubernetes-topology-monitor + faults: + - custom: + name: expired-tls-certificates + expired_tls_certificates: + istio_namespace: istio-control + cert_ttl: 30s + max_cert_ttl: 1m + grace_period: 10s