Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions sre/roles/faults/meta/argument_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ argument_specs:
- misconfigured-service-port
- modify-environment-variables
- unsupported-image
- expired-tls-certificates
required: true
type: str
invalid_command:
Expand Down Expand Up @@ -325,6 +326,26 @@ argument_specs:
default: force
required: false
type: str
# Options for the `expired-tls-certificates` custom fault. The TTL options are
# written to istiod as the DEFAULT_WORKLOAD_CERT_TTL, MAX_WORKLOAD_CERT_TTL and
# WORKLOAD_CERT_ROTATION_GRACE_PERIOD environment variables by the inject tasks.
expired_tls_certificates:
  required: false
  type: dict
  options:
    istio_namespace:
      # Kept in sync with the `default('istio-control')` fallback used by the
      # inject/remove tasks so the documented default is the effective one.
      default: istio-control
      required: false
      type: str
    cert_ttl:
      default: 1m
      required: false
      type: str
    max_cert_ttl:
      default: 2m
      required: false
      type: str
    grace_period:
      default: 30s
      required: false
      type: str
otel_demo:
required: false
type: dict
Expand Down
18 changes: 18 additions & 0 deletions sre/roles/faults/tasks/inject.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,21 @@
file: inject_valkey.yaml
when:
- fault.valkey is defined

# - name: Import misconfigured service mesh fault injection tasks (sidecar injection off)
# ansible.builtin.include_tasks:
# file: inject_custom_sidecar_injection_off.yaml
# vars:
# spec: "{{ fault.misconfigured_service_mesh.sidecar_injection_off }}"
# when:
# - fault.type == 'misconfigured_service_mesh'
# - fault.misconfigured_service_mesh.misconfiguration_type == 'sidecar_injection_off'

# - name: Import misconfigured service mesh fault injection tasks (authorization policy deny)
# ansible.builtin.include_tasks:
# file: inject_custom_authorization_policy_deny.yaml
# vars:
# spec: "{{ fault.misconfigured_service_mesh.authorization_policy_deny }}"
# when:
# - fault.type == 'misconfigured_service_mesh'
# - fault.misconfigured_service_mesh.misconfiguration_type == 'authorization_policy_deny'
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/inject_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the expired-TLS-certificates injector; the fault's option
# mapping is handed to the imported tasks as `spec`.
- name: Import expired TLS certificates injection tasks
  ansible.builtin.import_tasks:
    file: inject_custom_expired_tls_certificates.yaml
  vars:
    spec: "{{ fault.custom.expired_tls_certificates }}"
  when: >-
    fault.custom.name == 'expired-tls-certificates'
    and fault.custom.expired_tls_certificates is defined
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
---
# Look up the istiod Deployment whose container environment will be patched.
- name: Retrieve current istiod deployment
  kubernetes.core.k8s_info:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
  register: faults_istiod_deployment_info
  # Fail here when nothing matched, rather than letting the following task
  # fail on `resources[0]` with an unhelpful index error.
  failed_when: faults_istiod_deployment_info.resources | length == 0

# Capture the env list of istiod's first container; a container without an
# `env` key yields an empty list.
- name: Set original environment variables fact
  vars:
    faults_istiod_first_container: "{{ faults_istiod_deployment_info.resources[0].spec.template.spec.containers[0] }}"
  ansible.builtin.set_fact:
    faults_original_istiod_env: "{{ faults_istiod_first_container.env | default([]) }}"

# Build the replacement env list: drop any existing certificate-TTL entries,
# then append the three entries carrying the requested (or fallback) values.
- name: Construct new environment variables
  vars:
    faults_istiod_ttl_env_names:
      - DEFAULT_WORKLOAD_CERT_TTL
      - MAX_WORKLOAD_CERT_TTL
      - WORKLOAD_CERT_ROTATION_GRACE_PERIOD
  ansible.builtin.set_fact:
    faults_new_istiod_env: >-
      {{
        (faults_original_istiod_env | rejectattr('name', 'in', faults_istiod_ttl_env_names) | list)
        + [
          {'name': 'DEFAULT_WORKLOAD_CERT_TTL', 'value': spec.cert_ttl | default('1m')},
          {'name': 'MAX_WORKLOAD_CERT_TTL', 'value': spec.max_cert_ttl | default('2m')},
          {'name': 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD', 'value': spec.grace_period | default('30s')}
        ]
      }}

# Write the rebuilt env list onto istiod's first container.
- name: Inject fault (expired TLS certificates)
  kubernetes.core.k8s_json_patch:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
    patch:
      # `add` replaces the member when it already exists and creates it when
      # it does not; `replace` would fail for a container that has no `env`
      # key, a case the env-gathering task explicitly allows for.
      - op: add
        path: /spec/template/spec/containers/0/env
        value: "{{ faults_new_istiod_env }}"

# Block until the patched Deployment has finished rolling out. Waiting on Pods
# selected by label can return immediately, because the Pods from before the
# patch are still Ready when this task starts.
# NOTE(review): relies on kubernetes.core's built-in Deployment readiness wait;
# confirm against the collection version in use.
- name: Wait for istiod pods to restart with new configuration
  kubernetes.core.k8s_info:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
    wait: true
    wait_timeout: 300

# Informational only: echoes the TTL settings that were applied, using the
# same spec values and fallbacks as the env-construction task.
- name: Display fault injection confirmation
  ansible.builtin.debug:
    msg: >-
      TLS certificate expiration fault injected successfully!
      Configuration applied:
      - Default certificate TTL: {{ spec.cert_ttl | default('1m') }}
      - Maximum certificate TTL: {{ spec.max_cert_ttl | default('2m') }}
      - Certificate rotation grace period: {{ spec.grace_period | default('30s') }}
      This will cause certificates to expire quickly, disrupting service mesh communication.
      Monitor your applications for TLS handshake failures and certificate validation errors.
18 changes: 18 additions & 0 deletions sre/roles/faults/tasks/remove.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,21 @@
file: remove_valkey.yaml
when:
- fault.valkey is defined

# - name: Import misconfigured service mesh fault removal tasks (sidecar injection off)
# ansible.builtin.include_tasks:
# file: remove_custom_sidecar_injection_off.yaml
# vars:
# spec: "{{ fault.misconfigured_service_mesh.sidecar_injection_off }}"
# when:
# - fault.type == 'misconfigured_service_mesh'
# - fault.misconfigured_service_mesh.misconfiguration_type == 'sidecar_injection_off'

# - name: Import misconfigured service mesh fault removal tasks (authorization policy deny)
# ansible.builtin.include_tasks:
# file: remove_custom_authorization_policy_deny.yaml
# vars:
# spec: "{{ fault.misconfigured_service_mesh.authorization_policy_deny }}"
# when:
# - fault.type == 'misconfigured_service_mesh'
# - fault.misconfigured_service_mesh.misconfiguration_type == 'authorization_policy_deny'
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/remove_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the expired-TLS-certificates remover; the fault's option
# mapping is handed to the imported tasks as `spec`.
- name: Import expired TLS certificates removal tasks
  ansible.builtin.import_tasks:
    file: remove_custom_expired_tls_certificates.yaml
  vars:
    spec: "{{ fault.custom.expired_tls_certificates }}"
  when: >-
    fault.custom.name == 'expired-tls-certificates'
    and fault.custom.expired_tls_certificates is defined
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---
# This task removes the expired TLS certificates fault by restoring
# the original istiod deployment configuration

# Single lookup of the istiod Deployment. The result feeds the removal patch,
# and the task fails right away when the Deployment is absent. This replaces
# two identical queries, one of which registered a variable nothing read.
- name: Retrieve current istiod deployment
  kubernetes.core.k8s_info:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
  register: faults_istiod_deployment_info
  failed_when: faults_istiod_deployment_info.resources | length == 0

# Strip the three certificate-TTL variables from istiod's first container so
# istiod falls back to its own defaults; all other env entries are kept.
- name: Remove fault (restore original TLS certificate configuration)
  kubernetes.core.k8s_json_patch:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
    patch:
      - op: replace
        path: /spec/template/spec/containers/0/env
        # Pass the list itself, as the inject task does. Serialising with
        # `to_json` inside a `|` block produced a JSON string with a trailing
        # newline, which only became a list again if Ansible implicitly
        # re-parsed it.
        value: >-
          {{
            faults_istiod_deployment_info.resources[0].spec.template.spec.containers[0].env
            | rejectattr('name', 'in', ['DEFAULT_WORKLOAD_CERT_TTL', 'MAX_WORKLOAD_CERT_TTL', 'WORKLOAD_CERT_ROTATION_GRACE_PERIOD'])
            | list
          }}

# Block until the restored Deployment has finished rolling out. Waiting on Pods
# selected by label can return immediately, because the Pods from before the
# patch are still Ready when this task starts.
# NOTE(review): relies on kubernetes.core's built-in Deployment readiness wait;
# confirm against the collection version in use.
- name: Wait for istiod pods to restart with restored configuration
  kubernetes.core.k8s_info:
    api_version: apps/v1
    kind: Deployment
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: istiod
    namespace: "{{ spec.istio_namespace | default('istio-control') }}"
    wait: true
    wait_timeout: 300

# Informational only: tells the operator the TTL overrides were removed and
# that already-issued short-lived certificates may linger until rotated.
- name: Display fault removal confirmation
  ansible.builtin.debug:
    msg: >
      TLS certificate expiration fault removed successfully!

      Original certificate TTL settings have been restored.
      Istio will now use default certificate lifetimes for new certificates.

      Note: Existing certificates with short TTLs may still be in use until they expire
      and are automatically rotated. This process may take a few minutes.
39 changes: 39 additions & 0 deletions sre/roles/incidents/files/ground_truths/incident_42.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
fault:
- category: network
condition: "Expired TLS Certificates"
entity:
kind: "kubernetes"
name: "istiod"
fault_mechanism: "Modifying Istio control plane environment variables"

metadata:
version: v1
scenario:
metadata: {}
spec: {}

groups:
  - kind: kubernetes
    id: "istio-control-plane"
    # Namespace of the istiod workload the fault patches; this must agree with
    # the `istio_namespace` the incident 42 spec passes to the fault
    # (`istio-control`), not `istio-system`.
    namespace: "istio-control"
    root_cause: true
    filter:
      - "app=istiod"

alerts:
- id: "RequestErrorRate"
group_id: "istio-control-plane"
metadata:
description: "Increased error rates due to TLS handshake failures"

recommended_actions:
- solution:
id: "restore-istio-config"
actions:
- "Remove the fault by restoring the original istiod deployment configuration."
- "Monitor service mesh communication to ensure TLS handshakes are successful."

aliases:
- ["istio-control-plane"]

propagations: []
33 changes: 33 additions & 0 deletions sre/roles/incidents/files/specs/incident_42.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
metadata:
version: v1
scenario:
metadata:
complexity: High
id: 42
name: Istio Service Mesh TLS Certificate Expiration
platform: kubernetes
spec:
environment:
applications:
otel_demo:
enabled: true
configuration:
load_generator:
spawn_rate: 5
users: 100
tools:
category: sre
selected:
- istio
- prometheus
- jaeger
- kubernetes-topology-monitor
faults:
- custom:
name: expired-tls-certificates
expired_tls_certificates:
istio_namespace: istio-control
cert_ttl: 30s
max_cert_ttl: 1m
grace_period: 10s
14 changes: 14 additions & 0 deletions sre/roles/tools/tasks/install_istio.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,17 @@
type: "{{ 'NodePort' if tools_cluster.provider == 'kind' else 'LoadBalancer' }}"
nodePorts: "{{ {'http': 30080, 'https': 30443} if tools_cluster.provider == 'kind' else omit }}"
wait: true

# Ensure the otel-demo Namespace exists and carries the labels that enable
# Istio sidecar injection and mark it for monitoring.
- name: Label otel-demo namespace for sidecar injection
  kubernetes.core.k8s:
    state: present
    kubeconfig: "{{ tools_cluster.kubeconfig }}"
    resource_definition:
      kind: Namespace
      apiVersion: v1
      metadata:
        name: otel-demo
        labels:
          it-bench/monitoring: "true"
          istio-injection: enabled
Loading