diff --git a/sre/Makefile.runner b/sre/Makefile.runner index ce3b02b84..e510dfb96 100644 --- a/sre/Makefile.runner +++ b/sre/Makefile.runner @@ -33,7 +33,7 @@ launch_sync_workflow: ## Launches the scenario sync workflow .PHONY: launch_start_workflow launch_start_workflow: ## Launches the workflow equivalent of start_incident on AWX ansible-playbook -i inventory.yaml playbooks/manage_awx.yaml --tags "launch_workflows" \ - --extra-vars "run_phase=init" + --extra-vars "run_phase=start" .PHONY: launch_stop_workflow launch_stop_workflow: ## Launches the workflow equivalent of stop_incident on AWX diff --git a/sre/dev/local_cluster/Makefile b/sre/dev/local_cluster/Makefile index 13f091ad6..3cee50244 100644 --- a/sre/dev/local_cluster/Makefile +++ b/sre/dev/local_cluster/Makefile @@ -45,4 +45,4 @@ delete_cluster: ## DEPRECATED: Deletes a Kind cluster @echo "This command will be removed in a future version." @echo "Executing 'make destory_cluster'..." @echo "" - $(MAKE) destory_cluster + $(MAKE) destroy_cluster diff --git a/sre/dev/remote_cluster/roles/kops/tasks/create_stack_async.yaml b/sre/dev/remote_cluster/roles/kops/tasks/create_stack_async.yaml index e151aa47c..baeabc69b 100644 --- a/sre/dev/remote_cluster/roles/kops/tasks/create_stack_async.yaml +++ b/sre/dev/remote_cluster/roles/kops/tasks/create_stack_async.yaml @@ -121,7 +121,7 @@ - --cloud - aws - --topology - - "{{ 'private' if kops_stack.runners.aws.elastic_ip_allocation_id is defined else 'public' }}" + - "{{ 'private' if kops_elastic_ip_available else 'public' }}" - --network-id - "{{ kops_vpc_info.vpc.id }}" - --subnets diff --git a/sre/dev/remote_cluster/roles/kops/tasks/validate_stack.yaml b/sre/dev/remote_cluster/roles/kops/tasks/validate_stack.yaml index 06b3adc75..9ed55c043 100644 --- a/sre/dev/remote_cluster/roles/kops/tasks/validate_stack.yaml +++ b/sre/dev/remote_cluster/roles/kops/tasks/validate_stack.yaml @@ -7,10 +7,23 @@ success_msg: Valid number of kOps clusters configured. - name: Validate Elastic IP allocation ID - ansible.builtin.assert: - that: - - kops_stack.runners.aws.elastic_ip_allocation_id | length > 0 - fail_msg: Invalid number of kOps clusters set. Must be greater than 0. - success_msg: Valid number of kOps clusters configured. - when: - - kops_stack.runners.aws.elastic_ip_allocation_id is defined + block: + - name: Check Elastic IP allocation ID + ansible.builtin.assert: + that: + - kops_stack.runners.aws.elastic_ip_allocation_id is defined + - kops_stack.runners.aws.elastic_ip_allocation_id | length > 0 + fail_msg: Invalid or missing Elastic IP allocation ID. + success_msg: Valid Elastic IP allocation ID configured. 
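+    # The kops_elastic_ip_available fact set by this block is consumed by
+    # create_stack_async.yaml to choose the cluster topology: private when an
+    # Elastic IP allocation ID is configured, public otherwise.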
+ + - name: Set validation flag for valid Elastic IP + ansible.builtin.set_fact: + kops_elastic_ip_available: true + rescue: + - name: Warning about invalid Elastic IP + ansible.builtin.debug: + msg: "WARNING: {{ ansible_failed_result.msg | default('Invalid or missing Elastic IP allocation ID') }}" + + - name: Set validation flag for invalid Elastic IP + ansible.builtin.set_fact: + kops_elastic_ip_available: false diff --git a/sre/docs/data_snapshots.excalidraw b/sre/docs/data_snapshots.excalidraw new file mode 100644 index 000000000..ae2a04bb0 --- /dev/null +++ b/sre/docs/data_snapshots.excalidraw @@ -0,0 +1,870 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "mQZf89Bn-HBuGFEOKlkL2", + "type": "rectangle", + "x": 1238.3335571289062, + "y": -1049.5238069806778, + "width": 448.66662597656244, + "height": 350.66666412353516, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 905003567, + "version": 606, + "versionNonce": 1487021759, + "isDeleted": false, + "boundElements": [ + { + "id": "7VKMRtI8XZCTR5422aT5c", + "type": "arrow" + } + ], + "updated": 1762961826146, + "link": null, + "locked": false + }, + { + "id": "axPDhYFh8IYXorLtKDFbs", + "type": "text", + "x": 1361.3335266113281, + "y": -1034.8571581159317, + "width": 164.1798553466797, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 1921455681, + "version": 702, + "versionNonce": 1047053055, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": " Data Snapshot ", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": " Data Snapshot ", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "tx5Vdv1S_YZX4J29BZ7HY", + "type": "text", + "x": 1258.6668701171875, + "y": -974.8571428571427, + "width": 95.41989135742188, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": null, + "seed": 1960340865, + "version": 631, + "versionNonce": 259394335, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "alerts.tsv", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "alerts.tsv", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "yFtacwfYUsBvHQbAM7MlU", + "type": "text", + "x": 1265.3335266113281, + "y": -790.190471104213, + "width": 80.73989868164062, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": null, + "seed": 92367791, + "version": 769, + 
"versionNonce": 1004883775, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "metrics/", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "metrics/", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "rncLvd00ZDb9WEdQMUUUw", + "type": "text", + "x": 1310.0001831054688, + "y": -760.190486363002, + "width": 223.77976989746094, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 557124577, + "version": 684, + "versionNonce": 2115440479, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "pod_*.tsv (or parquet)", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "pod_*.tsv (or parquet)", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "mE9VWcNVNfiBjd9eaLZSf", + "type": "text", + "x": 1309.9336700439453, + "y": -732.0237993512833, + "width": 254.97972106933594, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": null, + "seed": 1105688097, + "version": 700, + "versionNonce": 1885064063, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "service_*.tsv (or parquet)", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "service_*.tsv (or parquet)", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "vOdUFo2PcK3Jh-jHFmh1O", + "type": "text", + "x": 1259.6668701171875, + "y": -930.8571428571427, + "width": 149.95982360839844, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": null, + "seed": 1117530017, + "version": 525, + "versionNonce": 250872735, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "k8s_events.tsv", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "k8s_events.tsv", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "j684aIAWdNAeeERSWnvTs", + "type": "text", + "x": 1263.260269165039, + "y": -893.3571428571427, + "width": 157.1398162841797, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": null, + "seed": 1546961903, + "version": 534, + "versionNonce": 2047883199, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "k8s_objects.tsv", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + 
"originalText": "k8s_objects.tsv", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "c8rohz7h6Tl1DoWi99-0L", + "type": "text", + "x": 1256.6668701171875, + "y": -852.8571428571427, + "width": 107.59986877441406, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": null, + "seed": 90441697, + "version": 546, + "versionNonce": 2010179551, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": " logs.tsv\n traces.tsv", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": " logs.tsv\n traces.tsv", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "bAJQWl8ToUHoPawpJ6mpw", + "type": "rectangle", + "x": 385.1666564941406, + "y": -290.3333320617676, + "width": 393.19045584542414, + "height": 232.38094983782094, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": { + "type": 3 + }, + "seed": 916359599, + "version": 217, + "versionNonce": 714906289, + "isDeleted": false, + "boundElements": [], + "updated": 1762961793302, + "link": null, + "locked": false + }, + { + "id": "9MAvFiN6yz3s-z7kdOPFO", + "type": "text", + "x": 418.6666564941406, + "y": -253.6666717529297, + "width": 298.8130798339843, + "height": 175, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1716280687, + "version": 281, + "versionNonce": 820222719, + "isDeleted": false, + "boundElements": [ + { + "id": "2agNaK3e_hv98gzLx5DjL", + "type": "arrow" + } + ], + "updated": 1762961817059, + "link": null, + "locked": false, + "text": "Question: Diagnose the\nproblem in the IT environment\n\nGiven:\n1. Data snapshot\n2. Topology (Text\ndescription)", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Question: Diagnose the problem in the IT environment\n\nGiven:\n1. Data snapshot\n2. 
Topology (Text description)", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "V0XH3BPHxA6zsUaUXqlC3", + "type": "text", + "x": 1719.6668701171875, + "y": -898.8571428571427, + "width": 382.879638671875, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 2130967809, + "version": 553, + "versionNonce": 910595071, + "isDeleted": false, + "boundElements": [], + "updated": 1762961826146, + "link": null, + "locked": false, + "text": "parquet (as opposed to TsV) preferred\nas the content involves JSON objects", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "parquet (as opposed to TsV) preferred\nas the content involves JSON objects", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "b1R-6LaxnvmarynKpGJsZ", + "type": "rectangle", + "x": 391.1666564941406, + "y": 181.66666793823242, + "width": 360.33331298828125, + "height": 82.66666412353516, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aI", + "roundness": { + "type": 3 + }, + "seed": 1243823457, + "version": 262, + "versionNonce": 800312159, + "isDeleted": false, + "boundElements": [ + { + "id": "hNtXalTlyc9sA2-qMdFTO", + "type": "arrow" + } + ], + "updated": 1762960178399, + "link": null, + "locked": false + }, + { + "id": "zBa8sL-vWMBGb2nv8xowa", + "type": "text", + "x": 424.6666564941406, + "y": 218.3333282470703, + "width": 298.8130798339843, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aJ", + "roundness": null, + "seed": 477555009, + "version": 288, + "versionNonce": 2108685359, + "isDeleted": false, + "boundElements": [], + "updated": 1762958496011, + "link": null, + "locked": false, + "text": "Prompt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Prompt", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "944wqqK4NZ7aGjbzExCk9", + "type": "rectangle", + "x": 918.3333740234375, + "y": 122.33335494995117, + "width": 2161.1429792131694, + "height": 1557.4286095755442, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": { + "type": 3 + }, + "seed": 241426319, + "version": 703, + "versionNonce": 599845887, + "isDeleted": false, + "boundElements": [ + { + "id": "hNtXalTlyc9sA2-qMdFTO", + "type": "arrow" + } + ], + "updated": 1762961242380, + "link": null, + "locked": false + }, + { + "id": "N1AJFDrw-YvrC9hLmp0xg", + "type": "text", + "x": 977.3333129882812, + "y": 157, + "width": 1317.4788818359377, + "height": 1000, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + 
"index": "aR", + "roundness": null, + "seed": 1160807791, + "version": 815, + "versionNonce": 101492063, + "isDeleted": false, + "boundElements": [], + "updated": 1762961312553, + "link": null, + "locked": false, + "text": "You are SRE Diagnosis Agent.\nYou are a smart and capable tool using agent. \nYou are an expert at diagnosing problem in IT environments.\nYou have extensive experience with kubernetes and SRE tools.\nYour primary goal is to diagnose a IT\nincident.\nYou must identify all the entity that caused or was impacted by the incident and determine if\nit was a contributing factor or not. Also document the propagation chain\n\nErrors generally propagate in backward direction to the alert.\nE.g., in a service graph where calls which calls\n, if service is unavailable, calls from to will also fail showing\nup as high error rate on . But, traffic to will drop, causing an alert on\n. Thus, errors propagate backward, while traffic drops propagate forward. In\nthis case, is the root cause. However, in many cases, you will have to go\ndeeper to understand why is failing.\\n There are some exceptions to the\nabove rule.\n\nFor example, circuit breakers and connection exhaustions (e.g., in\ndatabase, web servers) etc. can cause calls to fail. For example, if is a\nnginx web server with connection limit of 20 concurrent call, any calls >20 from\n to will fail. In this case, the root cause is unclear. All we can say is\nthat we have a connection exhaustion problem because of the configured limit on\n. However, you will need to figure out if is sending extra legitimate\nload to . If yes, then scaling is one option. However, could be\nitself faulty and sending more load because of a bug. In that case, should\nbe restarted. These can only be known by investigating traces and events more\ndeeply on and . You can stop diagnosis once you've found all the root\ncauses of the faults. Each fault propogation chain has exactly one root\ncause.\n\nAn example procedure to diagnose the problem:\n\n 1) Refer to the traces related to the affected entity or entities at (traces.*)\n 2) Check the status of all pods in the affected namespace by referring to the file at (k8s_objects.*)\n 3) Check all of the events in the relevant namespaces (k8s_events.*)\n 4) Refer to the metrics related to the affected entity or entities at (metrics directory)\n 5) Refer to logs related to all entities at \n 6) Use the information gathered to form a diagnosis. Structure the diagnosis in the following format...\n", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "You are SRE Diagnosis Agent.\nYou are a smart and capable tool using agent. \nYou are an expert at diagnosing problem in IT environments.\nYou have extensive experience with kubernetes and SRE tools.\nYour primary goal is to diagnose a IT\nincident.\nYou must identify all the entity that caused or was impacted by the incident and determine if\nit was a contributing factor or not. Also document the propagation chain\n\nErrors generally propagate in backward direction to the alert.\nE.g., in a service graph where calls which calls\n, if service is unavailable, calls from to will also fail showing\nup as high error rate on . But, traffic to will drop, causing an alert on\n. Thus, errors propagate backward, while traffic drops propagate forward. In\nthis case, is the root cause. 
However, in many cases, you will have to go\ndeeper to understand why is failing.\\n There are some exceptions to the\nabove rule.\n\nFor example, circuit breakers and connection exhaustions (e.g., in\ndatabase, web servers) etc. can cause calls to fail. For example, if is a\nnginx web server with connection limit of 20 concurrent call, any calls >20 from\n to will fail. In this case, the root cause is unclear. All we can say is\nthat we have a connection exhaustion problem because of the configured limit on\n. However, you will need to figure out if is sending extra legitimate\nload to . If yes, then scaling is one option. However, could be\nitself faulty and sending more load because of a bug. In that case, should\nbe restarted. These can only be known by investigating traces and events more\ndeeply on and . You can stop diagnosis once you've found all the root\ncauses of the faults. Each fault propogation chain has exactly one root\ncause.\n\nAn example procedure to diagnose the problem:\n\n 1) Refer to the traces related to the affected entity or entities at (traces.*)\n 2) Check the status of all pods in the affected namespace by referring to the file at (k8s_objects.*)\n 3) Check all of the events in the relevant namespaces (k8s_events.*)\n 4) Refer to the metrics related to the affected entity or entities at (metrics directory)\n 5) Refer to logs related to all entities at \n 6) Use the information gathered to form a diagnosis. Structure the diagnosis in the following format...\n", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "hNtXalTlyc9sA2-qMdFTO", + "type": "arrow", + "x": 916.67254016166, + "y": 232.82168531265177, + "width": 160.115282069081, + "height": 2.4797365803505897, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aS", + "roundness": { + "type": 2 + }, + "seed": 166416849, + "version": 320, + "versionNonce": 1239487519, + "isDeleted": false, + "boundElements": null, + "updated": 1762961242380, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -160.115282069081, + -2.4797365803505897 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "944wqqK4NZ7aGjbzExCk9", + "focus": 0.8190187181119306, + "gap": 2.047680082774491 + }, + "endBinding": { + "elementId": "b1R-6LaxnvmarynKpGJsZ", + "focus": 0.1009941615261671, + "gap": 5.214275541758866 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "7VKMRtI8XZCTR5422aT5c", + "type": "arrow", + "x": 1235.090527040118, + "y": -929.3822733844219, + "width": 645.9476902423122, + "height": 785.2394162415634, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aT", + "roundness": { + "type": 2 + }, + "seed": 1072975249, + "version": 595, + "versionNonce": 743241439, + "isDeleted": false, + "boundElements": null, + "updated": 1762961826146, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -645.9476902423122, + 785.2394162415634 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "mQZf89Bn-HBuGFEOKlkL2", + "focus": 0.7406509909532756, + "gap": 4.12392870232793 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": 
"arrow", + "elbowed": false + }, + { + "id": "rzE0udd8_Di1AHyfaFNWD", + "type": "rectangle", + "x": 2434.8095238095234, + "y": -1398.0476177760534, + "width": 740.0951974051342, + "height": 1439.2380926949638, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aU", + "roundness": { + "type": 3 + }, + "seed": 1532530655, + "version": 933, + "versionNonce": 153972977, + "isDeleted": false, + "boundElements": [ + { + "id": "2agNaK3e_hv98gzLx5DjL", + "type": "arrow" + } + ], + "updated": 1762961756527, + "link": null, + "locked": false + }, + { + "id": "231NJTHTPhRWDA-OSJWVi", + "type": "text", + "x": 2460.5714082263767, + "y": -1379.8571428571427, + "width": 586.6594848632812, + "height": 1375, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "ag", + "roundness": null, + "seed": 1131742001, + "version": 265, + "versionNonce": 554969809, + "isDeleted": false, + "boundElements": [ + { + "id": "2agNaK3e_hv98gzLx5DjL", + "type": "arrow" + } + ], + "updated": 1762961756527, + "link": null, + "locked": false, + "text": "{\n \"nodes\": [\n {\"id\": \"accounting\"},\n {\"id\": \"ad\"},\n {\"id\": \"cache\"},\n {\"id\": \"cart\"},\n {\"id\": \"checkout\"},\n {\"id\": \"currency\"},\n {\"id\": \"email\"},\n {\"id\": \"flagd\"},\n {\"id\": \"flagd-ui\"},\n {\"id\": \"fraud-detection\"},\n {\"id\": \"frontend\"},\n {\"id\": \"frontend-proxy\"},\n {\"id\": \"image-provider\"},\n {\"id\": \"load-generator\"},\n {\"id\": \"payment\"},\n {\"id\": \"product-catalog\"},\n {\"id\": \"queue\"},\n {\"id\": \"quote\"},\n {\"id\": \"recommendation\"},\n {\"id\": \"shipping\"}\n ],\n \"edges\": [\n {\"source\": \"ad\", \"target\": \"flagd\"},\n {\"source\": \"checkout\", \"target\": \"currency\"},\n {\"source\": \"checkout\", \"target\": \"cart\"},\n {\"source\": \"checkout\", \"target\": \"queue\"},\n {\"source\": \"cart\", \"target\": \"cache\"},\n {\"source\": \"cart\", \"target\": \"flagd\"},\n {\"source\": \"checkout\", \"target\": \"payment\"},\n {\"source\": \"checkout\", \"target\": \"email\"},\n {\"source\": \"checkout\", \"target\": \"product-catalog\"},\n {\"source\": \"checkout\", \"target\": \"shipping\"},\n {\"source\": \"fraud-detection\", \"target\": \"flagd\"},\n {\"source\": \"frontend\", \"target\": \"ad\"},\n {\"source\": \"frontend\", \"target\": \"currency\"},\n {\"source\": \"frontend\", \"target\": \"cart\"},\n {\"source\": \"frontend\", \"target\": \"checkout\"},\n {\"source\": \"frontend\", \"target\": \"shipping\"},\n {\"source\": \"frontend\", \"target\": \"recommendation\"},\n {\"source\": \"frontend\", \"target\": \"product-catalog\"},\n {\"source\": \"frontend-proxy\", \"target\": \"flagd\"},\n {\"source\": \"frontend-proxy\", \"target\": \"frontend\"},\n {\"source\": \"frontend-proxy\", \"target\": \"flagd-ui\"},\n {\"source\": \"frontend-proxy\", \"target\": \"image-provider\"},\n {\"source\": \"payment\", \"target\": \"flagd\"},\n {\"source\": \"queue\", \"target\": \"accounting\"},\n {\"source\": \"queue\", \"target\": \"fraud-detection\"},\n {\"source\": \"recommendation\", \"target\": \"flagd\"},\n {\"source\": \"recommendation\", \"target\": \"product-catalog\"},\n {\"source\": \"shipping\", \"target\": \"quote\"},\n 
{\"source\": \"load-generator\", \"target\": \"frontend-proxy\"}\n ]\n}", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "{\n \"nodes\": [\n {\"id\": \"accounting\"},\n {\"id\": \"ad\"},\n {\"id\": \"cache\"},\n {\"id\": \"cart\"},\n {\"id\": \"checkout\"},\n {\"id\": \"currency\"},\n {\"id\": \"email\"},\n {\"id\": \"flagd\"},\n {\"id\": \"flagd-ui\"},\n {\"id\": \"fraud-detection\"},\n {\"id\": \"frontend\"},\n {\"id\": \"frontend-proxy\"},\n {\"id\": \"image-provider\"},\n {\"id\": \"load-generator\"},\n {\"id\": \"payment\"},\n {\"id\": \"product-catalog\"},\n {\"id\": \"queue\"},\n {\"id\": \"quote\"},\n {\"id\": \"recommendation\"},\n {\"id\": \"shipping\"}\n ],\n \"edges\": [\n {\"source\": \"ad\", \"target\": \"flagd\"},\n {\"source\": \"checkout\", \"target\": \"currency\"},\n {\"source\": \"checkout\", \"target\": \"cart\"},\n {\"source\": \"checkout\", \"target\": \"queue\"},\n {\"source\": \"cart\", \"target\": \"cache\"},\n {\"source\": \"cart\", \"target\": \"flagd\"},\n {\"source\": \"checkout\", \"target\": \"payment\"},\n {\"source\": \"checkout\", \"target\": \"email\"},\n {\"source\": \"checkout\", \"target\": \"product-catalog\"},\n {\"source\": \"checkout\", \"target\": \"shipping\"},\n {\"source\": \"fraud-detection\", \"target\": \"flagd\"},\n {\"source\": \"frontend\", \"target\": \"ad\"},\n {\"source\": \"frontend\", \"target\": \"currency\"},\n {\"source\": \"frontend\", \"target\": \"cart\"},\n {\"source\": \"frontend\", \"target\": \"checkout\"},\n {\"source\": \"frontend\", \"target\": \"shipping\"},\n {\"source\": \"frontend\", \"target\": \"recommendation\"},\n {\"source\": \"frontend\", \"target\": \"product-catalog\"},\n {\"source\": \"frontend-proxy\", \"target\": \"flagd\"},\n {\"source\": \"frontend-proxy\", \"target\": \"frontend\"},\n {\"source\": \"frontend-proxy\", \"target\": \"flagd-ui\"},\n {\"source\": \"frontend-proxy\", \"target\": \"image-provider\"},\n {\"source\": \"payment\", \"target\": \"flagd\"},\n {\"source\": \"queue\", \"target\": \"accounting\"},\n {\"source\": \"queue\", \"target\": \"fraud-detection\"},\n {\"source\": \"recommendation\", \"target\": \"flagd\"},\n {\"source\": \"recommendation\", \"target\": \"product-catalog\"},\n {\"source\": \"shipping\", \"target\": \"quote\"},\n {\"source\": \"load-generator\", \"target\": \"frontend-proxy\"}\n ]\n}", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "2agNaK3e_hv98gzLx5DjL", + "type": "arrow", + "x": 2439.142836797804, + "y": -550.7991951591416, + "width": 1808.571428571427, + "height": 466.6563380162888, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "ah", + "roundness": { + "type": 2 + }, + "seed": 1009447807, + "version": 474, + "versionNonce": 1734403935, + "isDeleted": false, + "boundElements": null, + "updated": 1762961817592, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -1808.571428571427, + 466.6563380162888 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "231NJTHTPhRWDA-OSJWVi", + "focus": -0.07906651663348825, + "gap": 21.428571428572468 + }, + "endBinding": { + "elementId": "9MAvFiN6yz3s-z7kdOPFO", + "focus": 2.0051297814409734, + "gap": 5.476185389923103 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + 
"id": "URHpL5agcksRIhQw09J74", + "type": "text", + "x": 969.1428367978058, + "y": 1158.7142857142899, + "width": 2081.998046875, + "height": 475, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aj", + "roundness": null, + "seed": 685998271, + "version": 29, + "versionNonce": 248913823, + "isDeleted": false, + "boundElements": null, + "updated": 1762961254820, + "link": null, + "locked": false, + "text": "{{\n \"entities\": [\n {{\n \"id\": \"entity id that caused or was impacted in the incident.\",\n \"contributing_factor\": \"Whether the entity was a contributing factor or not (true or false).\"\n }}\n ],\n \"propagations\":\n [\n {{\n \"source\": \"entity id of the source entity at the current step of the propagation chain.\n If it is the first element of the chain, it should be the contributing factor entity id. Otherwise, it should match the target entity id of the previous step.\",\n \"target\": \"next entity id affected\",\n \"condition\":\"Reason causing the faulty / erroneous behaviour between the source and target entities\",\n \"effect\":\"Faulty behaviour between the two source and target entities observed as a result.\"\n }}\n ]\n}}\n**You must assemble partial chains and identify all correct independent contributing factors. You may have to rethink the contributing factors and cascading effects. Pay attention to the investigative context.**", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "{{\n \"entities\": [\n {{\n \"id\": \"entity id that caused or was impacted in the incident.\",\n \"contributing_factor\": \"Whether the entity was a contributing factor or not (true or false).\"\n }}\n ],\n \"propagations\":\n [\n {{\n \"source\": \"entity id of the source entity at the current step of the propagation chain.\n If it is the first element of the chain, it should be the contributing factor entity id. Otherwise, it should match the target entity id of the previous step.\",\n \"target\": \"next entity id affected\",\n \"condition\":\"Reason causing the faulty / erroneous behaviour between the source and target entities\",\n \"effect\":\"Faulty behaviour between the two source and target entities observed as a result.\"\n }}\n ]\n}}\n**You must assemble partial chains and identify all correct independent contributing factors. You may have to rethink the contributing factors and cascading effects. 
Pay attention to the investigative context.**", + "autoResize": true, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index a17214ad2..7552eb0ac 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -58,6 +58,8 @@ kubernetes: "{{ incidents_tools.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" traces: jaeger: "{{ incidents_tools.jaeger | default(tools.jaeger) }}" + unified: + clickhouse: "{{ incidents_tools.clickhouse | default(tools.clickhouse) }}" recorders_storage: "{{ storage }}" - name: Import leaderboard role diff --git a/sre/roles/applications/tasks/install_otel_demo.yaml b/sre/roles/applications/tasks/install_otel_demo.yaml index 0ee9827b7..68d4e8421 100644 --- a/sre/roles/applications/tasks/install_otel_demo.yaml +++ b/sre/roles/applications/tasks/install_otel_demo.yaml @@ -55,7 +55,7 @@ kubernetes.core.helm: chart_ref: opentelemetry-demo chart_repo_url: https://open-telemetry.github.io/opentelemetry-helm-charts - chart_version: 0.38.1 + chart_version: 0.38.6 kubeconfig: "{{ applications_cluster.kubeconfig }}" release_name: "{{ helm_releases.otel_demo.name }}" release_namespace: "{{ helm_releases.otel_demo.namespace }}" @@ -72,11 +72,11 @@ replicas: "{{ configuration.accounting.replicas }}" resources: requests: - cpu: 15m - memory: 80Mi - limits: cpu: 30m - memory: 150Mi + memory: 160Mi + limits: + cpu: 60m + memory: 300Mi ad: podAnnotations: openshift.io/required-scc: restricted-v2 @@ -137,11 +137,11 @@ openshift.io/required-scc: restricted-v2 resources: requests: - cpu: 2m - memory: 30Mi - limits: cpu: 10m memory: 60Mi + limits: + cpu: 100m + memory: 256Mi fraud-detection: podAnnotations: openshift.io/required-scc: restricted-v2 @@ -267,11 +267,11 @@ replicas: "{{ configuration.recommendation.replicas }}" resources: requests: - cpu: 15m - memory: 40Mi - limits: cpu: 30m memory: 80Mi + limits: + cpu: 100m + memory: 160Mi shipping: podAnnotations: openshift.io/required-scc: restricted-v2 @@ -311,7 +311,7 @@ clickhouse: username: "{{ tools_clickhouse_username }}" password: "{{ tools_clickhouse_password }}" - endpoint: "{{ tools_clickhouse_endpoint }}" + endpoint: "{{ tools_clickhouse_endpoint }}:8123" logs_table_name: otel_demo_logs traces_table_name: otel_demo_traces opensearch: @@ -422,3 +422,8 @@ label: horizontalpodautoscaler/{{ item | regex_replace('_', '-') }} when: - configuration[item].autoscaling | default(false) + +- name: Pausing for 300 seconds to allow the application to stabilize + ansible.builtin.pause: + seconds: 300 + diff --git a/sre/roles/awx/tasks/configure_jobs.yaml b/sre/roles/awx/tasks/configure_jobs.yaml index 84a237f64..a2fde47d3 100644 --- a/sre/roles/awx/tasks/configure_jobs.yaml +++ b/sre/roles/awx/tasks/configure_jobs.yaml @@ -134,6 +134,14 @@ controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret controller_username: admin execution_environment: ITBench-Custom-EE + extra_vars: + tools: + prometheus: true + jaeger: true + kubernetes_topology_monitor: true + clickhouse: false + incidents_tools: + clickhouse: false inventory: ITBench job_tags: install_recorders job_type: run @@ -175,6 +183,9 @@ prometheus: false jaeger: false kubernetes_topology_monitor: true + clickhouse: false + incidents_tools: + clickhouse: false inventory:
ITBench job_tags: install_recorders job_type: run @@ -184,6 +195,32 @@ project: GitHub-ITBench state: present +- name: Add job template for running unified ClickHouse recorder + awx.awx.job_template: + allow_simultaneous: true + ask_credential_on_launch: true + ask_variables_on_launch: true + controller_host: "{{ awx_controller_host }}" + controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret + controller_username: admin + execution_environment: ITBench-Custom-EE + extra_vars: + tools: + prometheus: false + jaeger: false + kubernetes_topology_monitor: false + clickhouse: true + incidents_tools: + clickhouse: true + inventory: ITBench + job_tags: install_recorders + job_type: run + name: Run-Unified-ClickHouse-Recorder + organization: ITBench + playbook: sre/playbooks/manage_recorders.yaml + project: GitHub-ITBench + state: present + - name: Add job template for enabling agent access awx.awx.job_template: allow_simultaneous: true diff --git a/sre/roles/awx/templates/workflow_nodes/scenario_exec_start.j2 b/sre/roles/awx/templates/workflow_nodes/scenario_exec_start.j2 index 9a177b820..d8179740e 100644 --- a/sre/roles/awx/templates/workflow_nodes/scenario_exec_start.j2 +++ b/sre/roles/awx/templates/workflow_nodes/scenario_exec_start.j2 @@ -1,5 +1,5 @@ --- -- identifier: node-deploy-tools +- identifier: node-deploy-applications extra_data: incident_id: {{ scenario_id }} related: @@ -9,11 +9,11 @@ - name: AWS {% endif %} success_nodes: - - identifier: node-deploy-applications + - identifier: node-reinit-tools unified_job_template: - name: Deploy-Tools + name: Deploy-Applications type: job_template -- identifier: node-deploy-applications +- identifier: node-reinit-tools extra_data: incident_id: {{ scenario_id }} related: @@ -25,7 +25,7 @@ success_nodes: - identifier: node-install-recorders unified_job_template: - name: Deploy-Applications + name: Reinit-Tools type: job_template - identifier: node-install-recorders extra_data: @@ -105,15 +105,43 @@ {% endif %} success_nodes: - identifier: node-handover-to-agent + failure_nodes: + - identifier: node-install-unified-clickhouse-recorder unified_job_template: name: Check-for-Alerts type: job_template - identifier: node-handover-to-agent {% if requires_aws_credentials %} related: + always_nodes: + - identifier: node-install-unified-clickhouse-recorder credentials: - name: AWS {% endif %} unified_job_template: name: Handover-To-Agent type: job_template +- identifier: node-install-unified-clickhouse-recorder + related: + always_nodes: + - identifier: node-uninstall-recorders + credentials: + - name: Cluster-{{ cluster_index + 1 }}-Kubeconfig +{% if requires_aws_credentials %} + - name: AWS +{% endif %} + unified_job_template: + name: Run-Unified-ClickHouse-Recorder + type: job_template +- identifier: node-uninstall-recorders + extra_data: + incident_id: {{ scenario_id }} + related: + credentials: + - name: Cluster-{{ cluster_index + 1 }}-Kubeconfig +{% if requires_aws_credentials %} + - name: AWS +{% endif %} + unified_job_template: + name: Uninstall-Data-Recorders + type: job_template diff --git a/sre/roles/awx/templates/workflow_nodes/scenario_exec_stop.j2 b/sre/roles/awx/templates/workflow_nodes/scenario_exec_stop.j2 index 440d16453..437a5a84a 100644 --- a/sre/roles/awx/templates/workflow_nodes/scenario_exec_stop.j2 +++ b/sre/roles/awx/templates/workflow_nodes/scenario_exec_stop.j2 @@ -75,8 +75,6 @@ extra_data: incident_id: {{ scenario_id }} related: - always_nodes: - - identifier: node-undeploy-tools credentials: - name: 
Cluster-{{ cluster_index + 1 }}-Kubeconfig {% if requires_aws_credentials %} @@ -85,15 +83,3 @@ unified_job_template: name: Undeploy-Applications type: job_template -- identifier: node-undeploy-tools - extra_data: - incident_id: {{ scenario_id }} - related: - credentials: - - name: Cluster-{{ cluster_index + 1 }}-Kubeconfig -{% if requires_aws_credentials %} - - name: AWS -{% endif %} - unified_job_template: - name: Undeploy-Tools - type: job_template diff --git a/sre/roles/recorders/files/unified/clickhouse/deployment.yaml b/sre/roles/recorders/files/unified/clickhouse/deployment.yaml new file mode 100644 index 000000000..7e58e7fa4 --- /dev/null +++ b/sre/roles/recorders/files/unified/clickhouse/deployment.yaml @@ -0,0 +1,45 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + name: clickhouse-unified-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: clickhouse-unified-recorder + image: registry.access.redhat.com/ubi10-minimal:10.0-1758699349 + command: + - /bin/sh + args: + - -c + - "sleep 600" + resources: + requests: + cpu: 500m + memory: 256Mi + limits: + memory: 512Mi + volumeMounts: + - name: records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: records + persistentVolumeClaim: + claimName: clickhouse-unified-records + replicas: 1 diff --git a/sre/roles/recorders/files/unified/clickhouse/job.yaml b/sre/roles/recorders/files/unified/clickhouse/job.yaml new file mode 100644 index 000000000..a73b7a64d --- /dev/null +++ b/sre/roles/recorders/files/unified/clickhouse/job.yaml @@ -0,0 +1,15 @@ +--- +# This definition has been left purposely incomplete. This allows +# Dependabot to track the image. The full definition is provided in the +# template version of this object. + +apiVersion: batch/v1 +kind: Job +metadata: + name: clickhouse-unified-recorder +spec: + template: + spec: + containers: + - name: clickhouse-unified-recorder + image: registry.access.redhat.com/ubi10/python-312-minimal:10.0-1762316151 \ No newline at end of file diff --git a/sre/roles/recorders/files/unified/clickhouse/persistentvolumeclaim.yaml b/sre/roles/recorders/files/unified/clickhouse/persistentvolumeclaim.yaml new file mode 100644 index 000000000..a6c1b3fe2 --- /dev/null +++ b/sre/roles/recorders/files/unified/clickhouse/persistentvolumeclaim.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + name: clickhouse-unified-records +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + volumeMode: Filesystem diff --git a/sre/roles/recorders/files/unified/clickhouse/scripts/gather.py b/sre/roles/recorders/files/unified/clickhouse/scripts/gather.py new file mode 100644 index 000000000..d13ffcd93 --- /dev/null +++ b/sre/roles/recorders/files/unified/clickhouse/scripts/gather.py @@ -0,0 +1,904 @@ +"""ClickHouse Event Streamer for OpenTelemetry and Kubernetes data. + +This module provides a client for querying ClickHouse databases containing +OpenTelemetry traces, logs, metrics, and Kubernetes events/objects. 
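+
+Typical usage (illustrative sketch only; the host and namespace values below
+are placeholders that depend on the deployment):
+
+    streamer = ClickHouseEventStreamer(host="clickhouse", port=8123)
+    streamer.get_events_df(namespaces=["otel-demo"])
+    streamer.get_logs_df(severity_levels=["error", "fatal"])
+    streamer.get_traces_df(status_codes=["Error"])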
+""" + +import json +import logging +import os +import re +import sys +from datetime import datetime +from typing import Dict, List, Optional + +import clickhouse_connect +import pandas as pd + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + + +class ClickHouseEventStreamer: + """Client for streaming and querying OpenTelemetry and Kubernetes data from ClickHouse. + + This class provides methods to query events, logs, traces, metrics, and Kubernetes + objects from ClickHouse databases and save them to TSV files. + + Attributes: + default_client: ClickHouse client for the default database. + prometheus_client: ClickHouse client for the prometheus database. + records_dir: Directory path where query results are saved. + """ + + def __init__( + self, + host: str = 'localhost', + port: int = 8123, + username: str = 'default', + password: str = '', + secure: bool = False, + verify: bool = True, + proxy_path: Optional[str] = None + ): + """Initialize ClickHouse clients and setup records directory. + + Args: + host: ClickHouse server hostname. + port: ClickHouse server port. + username: Username for authentication. + password: Password for authentication. + secure: Whether to use HTTPS connection. + verify: Whether to verify SSL certificates. + proxy_path: Optional proxy path for connection. + """ + base_settings = { + 'host': host, + 'port': port, + 'username': username, + 'password': password, + 'secure': secure, + 'verify': verify + } + + if proxy_path: + base_settings['proxy_path'] = proxy_path + + self.default_client = clickhouse_connect.get_client( + database='default', + **base_settings + ) + + self.prometheus_client = clickhouse_connect.get_client( + database='prometheus', + **base_settings + ) + + self.records_dir = os.path.join(os.path.expanduser("~"), "records") + os.makedirs(self.records_dir, exist_ok=True) + + self._metric_table_ids = None + + def _get_metric_table_ids(self) -> Dict[str, Optional[str]]: + """Discover and cache metric table IDs from ClickHouse system tables. + + Returns: + Dictionary mapping table types ('data', 'tags', 'metrics') to table names. + """ + if self._metric_table_ids is None: + df = self.prometheus_client.query_df(""" + SELECT name + FROM system.tables + WHERE database = 'prometheus' + AND name LIKE '.inner_id%' + """) + + self._metric_table_ids = { + 'data': None, + 'tags': None, + 'metrics': None + } + + for table_name in df['name']: + if '.inner_id.data.' in table_name: + self._metric_table_ids['data'] = table_name + elif '.inner_id.tags.' in table_name: + self._metric_table_ids['tags'] = table_name + elif '.inner_id.metrics.' in table_name: + self._metric_table_ids['metrics'] = table_name + + logger.info(f"Found metric tables: {self._metric_table_ids}") + + return self._metric_table_ids + + def _save_to_records( + self, + data: pd.DataFrame, + prefix: str, + subdir: Optional[str] = None + ) -> str: + """Save DataFrame to TSV file in records directory. + + Args: + data: DataFrame to save. + prefix: Filename prefix for the saved file. + subdir: Optional subdirectory within records_dir. + + Returns: + Path to the saved file. 
+ """ + if subdir: + save_dir = os.path.join(self.records_dir, subdir) + os.makedirs(save_dir, exist_ok=True) + file_path = os.path.join(save_dir, f"{prefix}.tsv") + else: + file_path = os.path.join(self.records_dir, f"{prefix}.tsv") + + data.to_csv(file_path, sep='\t', index=False) + logger.info(f"Saved {len(data)} records to: {file_path}") + return file_path + + def query_df_batched( + self, + query: str, + batch_size: int = 10000, + client: Optional[clickhouse_connect.driver.Client] = None + ) -> pd.DataFrame: + """Execute query in batches to handle large result sets. + + This method automatically fetches query results in manageable batches + to prevent memory issues when dealing with large datasets. It fetches + all available data by default. + + Args: + query: SQL query to execute. + batch_size: Number of rows to fetch per batch. Default is 10,000. + client: ClickHouse client to use (defaults to default_client). + + Returns: + Combined DataFrame containing all batched results. + + Note: + To limit the number of rows returned, use SQL LIMIT in your query: + Example: query_df_batched("SELECT * FROM table LIMIT 50000") + """ + if client is None: + client = self.default_client + + from_match = re.search(r'FROM\s+(\S+)', query, re.IGNORECASE) + if not from_match: + return client.query_df(query) + table_name = from_match.group(1) + + where_match = re.search( + r'WHERE\s+(.*?)(?:ORDER|GROUP|LIMIT|$)', + query, + re.IGNORECASE | re.DOTALL + ) + where_clause = f"WHERE {where_match.group(1)}" if where_match else "" + + count_query = f"SELECT COUNT(*) as count FROM {table_name} {where_clause}" + try: + count_df = client.query_df(count_query) + total_count = int(count_df['count'].iloc[0]) if not count_df.empty else 0 + except Exception as e: + logger.warning(f"Could not get count, running query directly: {e}") + return client.query_df(query) + + if total_count == 0: + return pd.DataFrame() + + logger.info(f"Total records: {total_count:,}") + + # If small dataset, just run directly + if total_count <= batch_size: + return client.query_df(query) + + # Remove existing LIMIT/OFFSET if any + base_query = re.sub(r'\s+(LIMIT|OFFSET)\s+\d+', '', query, flags=re.IGNORECASE) + + # Fetch in batches + all_dfs = [] + offset = 0 + batch_num = 0 + + while True: # ← Simplified: No max_batches check + batch_query = f"{base_query} LIMIT {batch_size} OFFSET {offset}" + + logger.debug(f"Fetching batch {batch_num + 1} (offset: {offset:,})...") + try: + batch_df = client.query_df(batch_query) + + # Stop if no more data + if batch_df.empty: + logger.debug("Empty batch received, stopping") + break + + logger.debug(f"Fetched {len(batch_df):,} rows") + all_dfs.append(batch_df) + offset += batch_size + batch_num += 1 + + # Stop if we received fewer rows than requested (reached end) + if len(batch_df) < batch_size: + logger.debug("Partial batch received, all data fetched") + break + + except Exception as e: + logger.error(f"Error in batch {batch_num + 1}: {e}") + break + + if not all_dfs: + return pd.DataFrame() + + logger.info(f"Successfully fetched {batch_num} batches") + return pd.concat(all_dfs, ignore_index=True) + + def get_events_df( + self, + namespaces: List[str] = None, + save_to_file: bool = True + ) -> pd.DataFrame: + """Retrieve Kubernetes events from ClickHouse. + + Args: + namespaces: List of Kubernetes namespaces to filter by. + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing parsed Kubernetes events. 
+ """ + if namespaces is None: + namespaces = ['otel-demo', 'chaos-mesh'] + + query = """ + SELECT + Timestamp as timestamp, + Body as body, + ResourceAttributes['k8s.namespace.name'] as namespace + FROM kubernetes_events + WHERE ResourceAttributes['k8s.namespace.name'] IN ({namespaces}) + ORDER BY Timestamp ASC + """.format( + namespaces=','.join(f"'{ns}'" for ns in namespaces) + ) + + df = self.query_df_batched(query) + + if len(df) > 0: + try: + df['event_json'] = df['body'].apply( + lambda x: json.loads(x) if x else {} + ) + df['event_type'] = df['event_json'].apply( + lambda x: x.get('type', '') + ) + + df['reason'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('reason', '') + ) + df['message'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('message', '') + ) + df['event_time'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('lastTimestamp', '') + ) + df['event_count'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('count', 1) + ) + df['event_kind'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('type', '') + ) + + df['object_kind'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('involvedObject', {}).get('kind', '') + ) + df['object_name'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('involvedObject', {}).get('name', '') + ) + df['object_namespace'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('involvedObject', {}).get('namespace', '') + ) + + df['source_component'] = df['event_json'].apply( + lambda x: x.get('object', {}).get('source', {}).get('component', '') + ) + + df = df.drop(columns=['body', 'event_json']) + + except Exception as e: + logger.error(f"Could not parse event body: {e}") + + logger.info(f"Total events loaded: {len(df)}") + + if save_to_file and len(df) > 0: + prefix = f"k8s_events_{'_'.join(namespaces)}" + self._save_to_records(df, prefix) + + return df + + def get_k8s_objects_df( + self, + namespaces: Optional[List[str]] = None, + resource_types: Optional[List[str]] = None, + save_to_file: bool = True + ) -> pd.DataFrame: + """Retrieve Kubernetes object snapshots from ClickHouse. + + Args: + namespaces: List of Kubernetes namespaces to filter by. + resource_types: List of Kubernetes resource types to filter by. + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing Kubernetes object snapshots. 
+ """ + query = """ + SELECT + Timestamp as timestamp, + Body as body, + ResourceAttributes['k8s.namespace.name'] as namespace, + LogAttributes['k8s.resource.name'] as resource_type + FROM kubernetes_objects_snapshot + WHERE 1=1 + """ + if namespaces is None: + namespaces = ['otel-demo', 'chaos-mesh'] + + query += " AND ResourceAttributes['k8s.namespace.name'] IN ({})".format( + ','.join(f"'{ns}'" for ns in namespaces) + ) + + if resource_types: + query += " AND LogAttributes['k8s.resource.name'] IN ({})".format( + ','.join(f"'{rt}'" for rt in resource_types) + ) + + query += " ORDER BY Timestamp ASC" + + df = self.query_df_batched(query) + + if len(df) > 0: + df['object_json'] = df['body'].apply( + lambda x: json.loads(x) if x else {} + ) + df['object_kind'] = df['object_json'].apply( + lambda x: x.get('kind', '') + ) + df['object_name'] = df['object_json'].apply( + lambda x: x.get('metadata', {}).get('name', '') + ) + df['api_version'] = df['object_json'].apply( + lambda x: x.get('apiVersion', '') + ) + + df = df.drop(columns=['object_json']) + + logger.info(f"Total K8s objects loaded: {len(df)}") + + if save_to_file and len(df) > 0: + prefix_parts = ["k8s_objects"] + if namespaces: + prefix_parts.append('_'.join(namespaces)) + if resource_types: + prefix_parts.append('_'.join(resource_types)) + prefix = '_'.join(prefix_parts) + self._save_to_records(df, prefix) + + return df + + def get_logs_df( + self, + services: Optional[List[str]] = None, + severity_levels: Optional[List[str]] = None, + save_to_file: bool = True + ) -> pd.DataFrame: + """Retrieve OpenTelemetry logs from ClickHouse. + + Args: + services: List of service names to filter by. + severity_levels: List of severity levels to filter by (defaults to warning, error, warn, fatal, critical). + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing OpenTelemetry logs. 
+ """ + if severity_levels is None: + severity_levels = ['warning', 'error', 'warn', 'fatal', 'critical'] + + query = """ + SELECT + Timestamp as timestamp, + TraceId as trace_id, + SpanId as span_id, + TraceFlags as trace_flags, + SeverityText as severity_text, + SeverityNumber as severity_number, + ServiceName as service_name, + Body as body, + ResourceAttributes as resource_attributes, + LogAttributes as log_attributes + FROM otel_demo_logs + WHERE 1=1 + """ + + if services: + query += " AND ServiceName IN ({})".format( + ','.join(f"'{s}'" for s in services) + ) + + if severity_levels: + # 1-4: TRACE, 5-8: DEBUG, 9-12: INFO, 13-16: WARN, 17-20: ERROR, 21-24: FATAL + if any(level.lower() in ['warning', 'warn', 'error', 'fatal', 'critical'] for level in severity_levels): + query += " AND (SeverityText IN ({}) OR SeverityNumber >= 13)".format( + ','.join(f"'{s}'" for s in severity_levels) + ) + else: + query += " AND SeverityText IN ({})".format( + ','.join(f"'{s}'" for s in severity_levels) + ) + + query += " ORDER BY Timestamp ASC" + + df = self.query_df_batched(query) + + if len(df) > 0: + if (df['severity_text'] == '').all(): + df = df.drop(columns=['severity_text']) + if (df['severity_number'] == 0).all(): + df = df.drop(columns=['severity_number']) + + df['body'] = df['body'].str.replace('\n', ' ', regex=False) + + df['k8s_namespace'] = df['resource_attributes'].apply( + lambda x: x.get('k8s.namespace.name', '') if isinstance(x, dict) else '' + ) + df['k8s_pod_name'] = df['resource_attributes'].apply( + lambda x: x.get('k8s.pod.name', '') if isinstance(x, dict) else '' + ) + df['url_path'] = df['log_attributes'].apply( + lambda x: x.get('url.path', '') if isinstance(x, dict) else '' + ) + + df = df.drop(columns=['resource_attributes', 'log_attributes']) + + logger.info(f"Total logs loaded: {len(df)}") + + if save_to_file and len(df) > 0: + prefix = "otel_logs" + if services: + prefix += f"_{'_'.join(services)}" + if severity_levels: + prefix += f"_{'_'.join(severity_levels)}" + self._save_to_records(df, prefix) + + return df + + def get_traces_df( + self, + services: Optional[List[str]] = None, + trace_ids: Optional[List[str]] = None, + span_kinds: Optional[List[str]] = None, + status_codes: Optional[List[str]] = None, + min_duration_ms: Optional[int] = None, + save_to_file: bool = True + ) -> pd.DataFrame: + """Retrieve OpenTelemetry traces from ClickHouse. + + Args: + services: List of service names to filter by. + trace_ids: List of specific trace IDs to retrieve. + span_kinds: List of span kinds to filter by. + status_codes: List of status codes to filter by (defaults to Error). + min_duration_ms: Minimum span duration in milliseconds. + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing OpenTelemetry traces. 
+ """ + if status_codes is None: + status_codes = ['Error'] + + query = """ + SELECT + Timestamp as timestamp, + TraceId as trace_id, + SpanId as span_id, + ParentSpanId as parent_span_id, + TraceState as trace_state, + SpanName as span_name, + SpanKind as span_kind, + ServiceName as service_name, + ScopeName as scope_name, + ScopeVersion as scope_version, + Duration as duration, + StatusCode as status_code, + StatusMessage as status_message + FROM otel_demo_traces + WHERE 1=1 + """ + + if services: + query += " AND ServiceName IN ({})".format( + ','.join(f"'{s}'" for s in services) + ) + + if trace_ids: + query += " AND TraceId IN ({})".format( + ','.join(f"'{tid}'" for tid in trace_ids) + ) + + if span_kinds: + query += " AND SpanKind IN ({})".format( + ','.join(f"'{sk}'" for sk in span_kinds) + ) + + if status_codes: + query += " AND StatusCode IN ({})".format( + ','.join(f"'{sc}'" for sc in status_codes) + ) + + if min_duration_ms: + query += f" AND Duration >= {min_duration_ms * 1000000}" + + query += " ORDER BY Timestamp ASC" + + df = self.query_df_batched(query) + + if len(df) > 0 and 'duration' in df.columns: + df['duration_ms'] = df['duration'] / 1000000 + + logger.info(f"Total traces loaded: {len(df)}") + + if save_to_file and len(df) > 0: + prefix = "otel_traces" + if services: + prefix += f"_{'_'.join(services)}" + self._save_to_records(df, prefix) + + return df + + def get_trace_by_id( + self, + trace_id: str, + save_to_file: bool = True + ) -> pd.DataFrame: + """Retrieve a specific trace by its ID. + + Args: + trace_id: The trace ID to retrieve. + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing all spans for the specified trace. + """ + df = self.get_traces_df(trace_ids=[trace_id], save_to_file=False) + + if len(df) > 0: + df = df.sort_values('timestamp') + + if save_to_file: + self._save_to_records(df, f"trace_{trace_id[:8]}") + + return df + + def get_error_traces(self, save_to_file: bool = True) -> pd.DataFrame: + """Retrieve all traces with error status codes. + + Args: + save_to_file: Whether to save results to TSV file. + + Returns: + DataFrame containing traces with errors. + """ + return self.get_traces_df( + status_codes=['Error'], + save_to_file=save_to_file + ) + + def get_metrics_df( + self, + metric_names: Optional[List[str]] = None, + namespace: str = 'otel-demo', + save_to_file: bool = True + ) -> tuple: + """Retrieve Prometheus metrics from ClickHouse. + + Args: + metric_names: List of metric names to retrieve. If None, fetches + CPU and memory related metrics. + namespace: Kubernetes namespace to filter by. + save_to_file: Whether to save results to TSV files. + + Returns: + Tuple of (pod_metrics_df, service_metrics_df). 
+ """ + table_ids = self._get_metric_table_ids() + + if not table_ids['data'] or not table_ids['tags']: + raise Exception("Could not find prometheus metric tables") + + if save_to_file: + pod_metrics_dir = os.path.join(self.records_dir, "metrics_pod") + os.makedirs(pod_metrics_dir, exist_ok=True) + + service_metrics_dir = os.path.join(self.records_dir, "metrics_service") + os.makedirs(service_metrics_dir, exist_ok=True) + + if metric_names is None: + metrics_query = f""" + SELECT DISTINCT metric_name + FROM `{table_ids['tags']}` + WHERE metric_name LIKE '%cpu%' + OR metric_name LIKE '%memory%' + OR metric_name LIKE '%mem%' + """ + available_metrics_df = self.prometheus_client.query_df(metrics_query) + metric_names = available_metrics_df['metric_name'].tolist() + logger.info(f"Found {len(metric_names)} CPU/memory related metrics") + + pod_metrics_df = self._get_pod_metrics( + table_ids, + metric_names, + namespace, + pod_metrics_dir, + save_to_file + ) + service_metrics_df = self._get_service_metrics( + table_ids, + namespace, + service_metrics_dir, + save_to_file + ) + + return pod_metrics_df, service_metrics_df + + def _get_pod_metrics( + self, + table_ids: Dict[str, Optional[str]], + metric_names: List[str], + namespace: str, + metrics_dir: str, + save_to_file: bool + ) -> pd.DataFrame: + """Retrieve pod-level metrics from Prometheus tables. + + Args: + table_ids: Dictionary of metric table names. + metric_names: List of metric names to retrieve. + namespace: Kubernetes namespace to filter by. + metrics_dir: Directory to save metric files. + save_to_file: Whether to save results to TSV files. + + Returns: + DataFrame containing pod-level metrics. + """ + if not metric_names: + return pd.DataFrame() + + all_dfs = [] + batch_size = 5 + + for i in range(0, len(metric_names), batch_size): + batch_metrics = metric_names[i:i+batch_size] + escaped_metrics = [m.replace("'", "''") for m in batch_metrics] + + query = f""" + SELECT + t.metric_name, + d.timestamp, + d.value, + t.tags['pod'] as pod_name, + t.tags['namespace'] as namespace, + t.tags + FROM `{table_ids['data']}` d + JOIN `{table_ids['tags']}` t ON d.id = t.id + WHERE t.metric_name IN ({','.join(f"'{m}'" for m in escaped_metrics)}) + AND t.tags['namespace'] = '{namespace.replace("'", "''")}' + AND t.tags['pod'] != '' + ORDER BY d.timestamp ASC + """ + + batch_num = i // batch_size + 1 + total_batches = (len(metric_names) + batch_size - 1) // batch_size + logger.info(f"Processing pod metrics batch {batch_num}/{total_batches}") + + try: + batch_df = self.prometheus_client.query_df(query) + if not batch_df.empty: + all_dfs.append(batch_df) + logger.info(f" Loaded {len(batch_df)} metric points") + except Exception as e: + logger.error(f" Error processing batch: {e}") + continue + + if all_dfs: + df = pd.concat(all_dfs, ignore_index=True) + logger.info(f"Total pod metric points loaded: {len(df)}") + + if save_to_file and len(df) > 0: + unique_pods = df['pod_name'].unique() + for pod in unique_pods: + pod_df = df[df['pod_name'] == pod] + safe_pod_name = pod.replace('/', '_').replace(' ', '_') + self._save_to_records( + pod_df, + f"pod_{safe_pod_name}", + subdir="metrics" + ) + logger.debug(f" Saved {len(pod_df)} metrics for pod: {pod}") + + return df + else: + logger.warning("No pod metric data loaded") + return pd.DataFrame() + + def _get_service_metrics( + self, + table_ids: Dict[str, Optional[str]], + namespace: str, + metrics_dir: str, + save_to_file: bool + ) -> pd.DataFrame: + """Retrieve service-level metrics from Prometheus tables. 
+ + Args: + table_ids: Dictionary of metric table names. + namespace: Kubernetes namespace to filter by. + metrics_dir: Directory to save metric files. + save_to_file: Whether to save results to TSV files. + + Returns: + DataFrame containing service-level metrics. + """ + duration_query = f""" + SELECT + t.metric_name, + d.timestamp, + d.value, + t.tags['service_name'] as service_name, + t.tags['namespace'] as namespace, + t.tags['le'] as bucket_le, + t.tags + FROM `{table_ids['data']}` d + JOIN `{table_ids['tags']}` t ON d.id = t.id + WHERE t.metric_name = 'traces_span_metrics_duration_milliseconds_bucket' + AND t.tags['namespace'] = '{namespace}' + AND t.tags['service_name'] NOT IN ('flagd', 'load-generator') + ORDER BY d.timestamp ASC + """ + + error_query = f""" + SELECT + t.metric_name, + d.timestamp, + d.value, + t.tags['service_name'] as service_name, + t.tags['namespace'] as namespace, + t.tags['status_code'] as status_code, + t.tags + FROM `{table_ids['data']}` d + JOIN `{table_ids['tags']}` t ON d.id = t.id + WHERE t.metric_name = 'traces_span_metrics_calls_total' + AND t.tags['namespace'] = '{namespace}' + AND t.tags['service_name'] NOT IN ('flagd', 'load-generator') + AND t.tags['status_code'] = 'STATUS_CODE_ERROR' + ORDER BY d.timestamp ASC + """ + + logger.info("Fetching service-level metrics...") + + all_service_dfs = [] + + try: + duration_df = self.prometheus_client.query_df(duration_query) + if not duration_df.empty: + duration_df['metric_type'] = 'duration_p95' + all_service_dfs.append(duration_df) + logger.info(f" Loaded {len(duration_df)} duration metric points") + except Exception as e: + logger.error(f" Error fetching duration metrics: {e}") + + try: + error_df = self.prometheus_client.query_df(error_query) + if not error_df.empty: + error_df['metric_type'] = 'error_rate' + all_service_dfs.append(error_df) + logger.info(f" Loaded {len(error_df)} error metric points") + except Exception as e: + logger.error(f" Error fetching error metrics: {e}") + + if all_service_dfs: + service_df = pd.concat(all_service_dfs, ignore_index=True) + + if save_to_file and len(service_df) > 0: + unique_services = service_df['service_name'].unique() + for service in unique_services: + svc_df = service_df[service_df['service_name'] == service] + svc_df = svc_df.drop(columns=['tags']) + self._save_to_records( + svc_df, + f"service_{service}", + subdir="metrics" + ) + logger.debug(f" Saved {len(svc_df)} metrics for service: {service}") + + return service_df + else: + logger.warning("No service metric data loaded") + return pd.DataFrame() + + def get_available_metrics(self) -> pd.DataFrame: + """Retrieve list of all available metrics in the prometheus database. + + Returns: + DataFrame containing metric names and their variation counts. 
+ """ + table_ids = self._get_metric_table_ids() + + if not table_ids['tags']: + raise Exception("Could not find prometheus tags table") + + query = f""" + SELECT DISTINCT + metric_name, + count(*) as variations + FROM `{table_ids['tags']}` + GROUP BY metric_name + ORDER BY metric_name + """ + + df = self.prometheus_client.query_df(query) + logger.info(f"Found {len(df)} unique metrics") + return df + + def close(self): + """Close all ClickHouse client connections.""" + self.default_client.close() + self.prometheus_client.close() + + +def main(): + """Main execution function demonstrating usage of ClickHouseEventStreamer.""" + + endpoint = os.environ.get("CLICKHOUSE_ENDPOINT") + username = os.environ.get("CLICKHOUSE_USERNAME", "default") + password = os.environ.get("CLICKHOUSE_PASSWORD", "") + + # Clean up the endpoint if it has protocol + endpoint = endpoint.replace('https://', '').replace('http://', '') + + if endpoint is None or username is None or password is None: + sys.exit("error: CLICKHOUSE_ENDPOINT, USERNAME and PASSWORD environment variables are not set") + + streamer = ClickHouseEventStreamer( + host=endpoint, + username=username, + password=password + ) + + try: + logger.info("Fetching events...") + events_df = streamer.get_events_df() + + logger.info("\nFetching K8s objects...") + k8s_objects_df = streamer.get_k8s_objects_df() + + logger.info("\nFetching logs...") + logs = streamer.get_logs_df() + + logger.info("\nFetching traces...") + traces_df = streamer.get_traces_df() + + logger.info("\nFetching metrics...") + k8s_metrics = streamer.get_metrics_df() + + except Exception as e: + logger.error(f"Error: {e}") + finally: + streamer.close() + + +if __name__ == "__main__": + main() diff --git a/sre/roles/recorders/files/unified/clickhouse/scripts/requirements.txt b/sre/roles/recorders/files/unified/clickhouse/scripts/requirements.txt new file mode 100644 index 000000000..e0db89bd8 --- /dev/null +++ b/sre/roles/recorders/files/unified/clickhouse/scripts/requirements.txt @@ -0,0 +1,3 @@ +clickhouse-connect==0.9.2 +pandas==2.3.3 +pyarrow==22.0.0 diff --git a/sre/roles/recorders/meta/argument_specs.yaml b/sre/roles/recorders/meta/argument_specs.yaml index 865964c90..28c6edc15 100644 --- a/sre/roles/recorders/meta/argument_specs.yaml +++ b/sre/roles/recorders/meta/argument_specs.yaml @@ -51,6 +51,14 @@ argument_specs: default: true required: false type: bool + unified: + required: false + type: dict + options: + clickhouse: + default: false + required: false + type: bool recorders_storage: required: true type: dict diff --git a/sre/roles/recorders/tasks/install.yaml b/sre/roles/recorders/tasks/install.yaml index 1681d4534..7d585a72c 100644 --- a/sre/roles/recorders/tasks/install.yaml +++ b/sre/roles/recorders/tasks/install.yaml @@ -26,3 +26,9 @@ file: install_traces_recorders.yaml when: - recorders_required.traces is defined + +- name: Import unified recorders installation tasks + ansible.builtin.import_tasks: + file: install_unified_recorders.yaml + when: + - recorders_required.unified is defined diff --git a/sre/roles/recorders/tasks/install_unified_recorders.yaml b/sre/roles/recorders/tasks/install_unified_recorders.yaml new file mode 100644 index 000000000..9dfb1e3da --- /dev/null +++ b/sre/roles/recorders/tasks/install_unified_recorders.yaml @@ -0,0 +1,6 @@ +--- +- name: Import unified recorders based on ClickHouse installation tasks + ansible.builtin.import_tasks: + file: install_unified_recorders_clickhouse.yaml + when: + - recorders_required.unified.clickhouse | default(false) 
diff --git a/sre/roles/recorders/tasks/install_unified_recorders_clickhouse.yaml b/sre/roles/recorders/tasks/install_unified_recorders_clickhouse.yaml new file mode 100644 index 000000000..c0ac78614 --- /dev/null +++ b/sre/roles/recorders/tasks/install_unified_recorders_clickhouse.yaml @@ -0,0 +1,199 @@ +--- +- name: Import tools role for variable setting tasks - ClickHouse endpoint + ansible.builtin.import_role: + name: tools + tasks_from: set_clickhouse_endpoint + vars: + tools_cluster: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + +- name: Import tools role for variable setting tasks - ClickHouse username and password + ansible.builtin.import_role: + name: tools + tasks_from: set_clickhouse_credentials + vars: + tools_cluster: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + +- name: Load job information + ansible.builtin.set_fact: + recorders_unified_clickhouse_job: "{{ lookup('ansible.builtin.file', 'files/unified/clickhouse/job.yaml') | from_yaml }}" + +- name: Create PersistentVolumeClaim to retain records + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/unified/clickhouse/persistentvolumeclaim.yaml + state: present + +- name: Create ClickHouse-based unified recorder environment list + ansible.builtin.set_fact: + recorders_clickhouse_env_vars: + - name: CLICKHOUSE_ENDPOINT + value: "{{ tools_clickhouse_endpoint }}" + - name: CLICKHOUSE_USERNAME + value: "{{ tools_clickhouse_username }}" + - name: CLICKHOUSE_PASSWORD + value: "{{ tools_clickhouse_password }}" + +- name: Create ConfigMap with Python script + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/unified/clickhouse/configmap.j2 + state: present + vars: + python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/unified/clickhouse/scripts/gather.py') }}" + requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/unified/clickhouse/scripts/requirements.txt') }}" + +- name: Wait for any ongoing jobs to be removed + kubernetes.core.k8s_info: + api_version: "{{ recorders_unified_clickhouse_job.apiVersion }}" + kind: "{{ recorders_unified_clickhouse_job.kind }}" + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + name: "{{ recorders_unified_clickhouse_job.metadata.name }}" + namespace: "{{ recorders_namespace.name }}" + register: recorders_unified_clickhouse_job_info + until: + - recorders_unified_clickhouse_job_info.resources | length == 0 + retries: 8 + delay: 15 + +- name: Install ClickHouse unified recorder + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/unified/clickhouse/job.j2 + state: present + vars: + container_image: "{{ recorders_unified_clickhouse_job.spec.template.spec.containers[0].image }}" + container_environment_variables: "{{ recorders_clickhouse_env_vars }}" + +- name: Wait for any ongoing jobs to be removed + kubernetes.core.k8s_info: + api_version: batch/v1 + kind: Job + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + label_selectors: + - app.kubernetes.io/name=clickhouse-unified-recorder + - app.kubernetes.io/part-of=it-bench + namespace: "{{ recorders_namespace.name }}" + register: recorders_kubernetes_job_info + until: + - recorders_kubernetes_job_info.resources | length == 0 + retries: 20 + delay: 30 + +- name: Create Deployment to retrieve records + kubernetes.core.k8s: + kubeconfig: "{{ 
recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/unified/clickhouse/deployment.yaml + state: present + wait: true + +- name: Retrieve the retriever pod name (wait until Running/Ready) + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name=clickhouse-unified-recorder + - app.kubernetes.io/part-of=it-bench + register: recorders_kubernetes_pods_info + retries: 20 + delay: 6 + until: + - recorders_kubernetes_pods_info.resources | length >= 1 + - (recorders_kubernetes_pods_info.resources[0].status.phase | default('')) == 'Running' + - (recorders_kubernetes_pods_info.resources[0].status.containerStatuses | default([])) | length > 0 + - recorders_kubernetes_pods_info.resources[0].status.containerStatuses[0].ready | default(false) + +- name: Copy records directory from pod + kubernetes.core.k8s_cp: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + local_path: /tmp/observability_information_from_clickhouse + namespace: "{{ recorders_kubernetes_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_kubernetes_pods_info.resources[0].metadata.name }}" + remote_path: /opt/app-root/src/records + state: from_pod + when: + - recorders_kubernetes_pods_info.resources | length == 1 + +- name: Uninstall the Deployment + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/unified/clickhouse/deployment.yaml + state: absent + wait: true + +- name: Delete ConfigMap with Python script + kubernetes.core.k8s: + api_version: v1 + delete_all: true + kind: ConfigMap + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name=clickhouse-unified-recorder + - app.kubernetes.io/part-of=it-bench + state: absent + wait: true + +- name: Delete PersistentVolumeClaim + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/unified/clickhouse/persistentvolumeclaim.yaml + state: absent + wait: true + +- name: Find all exported JSON files + ansible.builtin.find: + path: /tmp/observability_information_from_clickhouse + recurse: true + patterns: + - "*.json" + - "*.tsv" + - "*.parquet" + register: recorders_kubernetes_files + +- name: Ensure observability_information directory exists on local + ansible.builtin.file: + path: "{{ recorders_storage.local.directory }}/observability_information_from_clickhouse" + state: directory + mode: "0755" + when: + - recorders_storage.local is defined + +- name: Copy exported data into local directory + ansible.builtin.copy: + dest: "{{ recorders_storage.local.directory }}/observability_information_from_clickhouse/{{ file.path | regex_replace('^/tmp/observability_information_from_clickhouse/?', + '') }}" + mode: "0644" + src: "{{ file.path }}" + loop: "{{ recorders_kubernetes_files.files | default([]) }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.local is defined + - (recorders_kubernetes_files.matched | default(0)) > 0 + +- name: Upload exported data to S3 bucket + amazon.aws.s3_object: + endpoint_url: "{{ recorders_storage.s3.endpoint }}" + bucket: "{{ recorders_storage.s3.bucket }}" + object: "{{ recorders_storage.s3.directory }}/observability_information_from_clickhouse/{{ file.path | 
regex_replace('^/tmp/observability_information_from_clickhouse/?', + '') }}" + src: "{{ file.path }}" + mode: put + loop: "{{ recorders_kubernetes_files.files | default([]) }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.s3 is defined + - (recorders_kubernetes_files.matched | default(0)) > 0 diff --git a/sre/roles/recorders/templates/unified/clickhouse/configmap.j2 b/sre/roles/recorders/templates/unified/clickhouse/configmap.j2 new file mode 100644 index 000000000..7a13dbec8 --- /dev/null +++ b/sre/roles/recorders/templates/unified/clickhouse/configmap.j2 @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + name: clickhouse-unified-recorder-scripts +data: + deps: | + {{ requirements_file_contents | indent(width=4) }} + script: | + {{ python_script_file_contents | indent(width=4) }} diff --git a/sre/roles/recorders/templates/unified/clickhouse/job.j2 b/sre/roles/recorders/templates/unified/clickhouse/job.j2 new file mode 100644 index 000000000..4c37583b0 --- /dev/null +++ b/sre/roles/recorders/templates/unified/clickhouse/job.j2 @@ -0,0 +1,62 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + name: clickhouse-unified-recorder +spec: + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: clickhouse-unified-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: {{ container_image }} + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + env: {{ container_environment_variables }} + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 500m + memory: 3Gi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: records + mountPath: /opt/app-root/src/records + restartPolicy: Never + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: clickhouse-unified-recorder-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: clickhouse-unified-recorder-scripts + items: + - key: deps + path: requirements.txt + - name: records + persistentVolumeClaim: + claimName: clickhouse-unified-records + ttlSecondsAfterFinished: 10 diff --git a/sre/roles/tools/defaults/main/instances.yaml b/sre/roles/tools/defaults/main/instances.yaml index b54d37079..664e7a456 100644 --- a/sre/roles/tools/defaults/main/instances.yaml +++ b/sre/roles/tools/defaults/main/instances.yaml @@ -6,5 +6,6 @@ tools_instances: opentelemetry_collectors: names: jaeger: jaeger - kubernetes_events: kubenetes-events + kubernetes_events: kubernetes-events + kubernetes_objects_snapshot: kubernetes-objects-snapshot namespace: opentelemetry-collectors diff --git a/sre/roles/tools/files/clickhouse/init-prometheus-configmap.yaml b/sre/roles/tools/files/clickhouse/init-prometheus-configmap.yaml new file mode 100644 index 000000000..6e7499d42 --- /dev/null +++ b/sre/roles/tools/files/clickhouse/init-prometheus-configmap.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: clickhouse-init-prometheus + labels: + app.kubernetes.io/name: clickhouse +data: + 01_init_prometheus.sql: 
|
+    -- Based on https://clickhouse.com/docs/engines/table-engines/special/time_series
+    -- Create database for Prometheus metrics
+    CREATE DATABASE IF NOT EXISTS prometheus;
+
+    -- Create the TimeSeries table with default schema
+    CREATE TABLE IF NOT EXISTS prometheus.metrics
+    ENGINE = TimeSeries;
diff --git a/sre/roles/tools/files/clickhouse/installation.yaml b/sre/roles/tools/files/clickhouse/installation.yaml
index ee0b8c379..4f49374f3 100644
--- a/sre/roles/tools/files/clickhouse/installation.yaml
+++ b/sre/roles/tools/files/clickhouse/installation.yaml
@@ -28,6 +28,26 @@ spec:
           secretKeyRef:
             name: user-default-credentials
            key: password
+    profiles:
+      default/allow_experimental_time_series_table: 1
+      default/http_connection_timeout: 30
+    files:
+      prometheus.xml: |
+        <clickhouse>
+          <prometheus>
+            <port>9363</port>
+            <handlers>
+              <my_rule_1>
+                <url>/write</url>
+                <handler>
+                  <type>remote_write</type>
+                  <database>prometheus</database>
+                  <table>metrics</table>
+                </handler>
+              </my_rule_1>
+            </handlers>
+          </prometheus>
+        </clickhouse>
clusters: - name: main layout: diff --git a/sre/roles/tools/files/clickhouse/templates.yaml b/sre/roles/tools/files/clickhouse/templates.yaml index eeea47744..4192dfcc5 100644 --- a/sre/roles/tools/files/clickhouse/templates.yaml +++ b/sre/roles/tools/files/clickhouse/templates.yaml @@ -33,12 +33,28 @@ spec: - ALL image: docker.io/altinity/clickhouse-server:25.3.6.10034.altinitystable imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8123 + - name: tcp + containerPort: 9000 + - name: prometheus + containerPort: 9363 resources: requests: cpu: 100m memory: 2Gi limits: - memory: 3Gi + memory: 6Gi + volumeMounts: + - name: initdb-volume + mountPath: /docker-entrypoint-initdb.d + readOnly: true + volumes: + - name: initdb-volume + configMap: + name: clickhouse-init-prometheus + defaultMode: 0555 --- apiVersion: clickhouse.altinity.com/v1 kind: ClickHouseInstallationTemplate @@ -83,5 +99,8 @@ spec: - name: tcp port: 9000 targetPort: 9000 + - name: prometheus + port: 9363 + targetPort: 9363 selector: app.kubernetes.io/name: clickhouse diff --git a/sre/roles/tools/tasks/install_clickhouse.yaml b/sre/roles/tools/tasks/install_clickhouse.yaml index bff62c95d..e9c3627d7 100644 --- a/sre/roles/tools/tasks/install_clickhouse.yaml +++ b/sre/roles/tools/tasks/install_clickhouse.yaml @@ -58,6 +58,7 @@ src: files/clickhouse/{{ item }} loop: - secret.yaml + - init-prometheus-configmap.yaml - templates.yaml - installation.yaml diff --git a/sre/roles/tools/tasks/install_opentelemetry.yaml b/sre/roles/tools/tasks/install_opentelemetry.yaml index 86a4cf720..47158a9b7 100644 --- a/sre/roles/tools/tasks/install_opentelemetry.yaml +++ b/sre/roles/tools/tasks/install_opentelemetry.yaml @@ -138,10 +138,10 @@ when: - tools_required.opentelemetry -- name: Install Kubernetes object collector +- name: Install Kubernetes object collector to collect events kubernetes.core.k8s: kubeconfig: "{{ tools_cluster.kubeconfig }}" - template: templates/opentelemetry/collectors/kubernetes-objects.j2 + template: templates/opentelemetry/collectors/kubernetes-objects-events.j2 state: present vars: container_image: | @@ -153,12 +153,27 @@ when: - tools_required.opentelemetry -- name: Wait for Kubernetes object collector deployment to complete +- name: Install Kubernetes object collector to snapshot select Kubernetes objects + kubernetes.core.k8s: + kubeconfig: "{{ tools_cluster.kubeconfig }}" + template: templates/opentelemetry/collectors/kubernetes-objects-snapshot.j2 + state: present + vars: + container_image: | + {{ + lookup('ansible.builtin.file', 'files/opentelemetry/collectors/kubernetes-objects.yaml') | + from_yaml | + community.general.json_query('spec.image') + }} + when: + - tools_required.opentelemetry + +- name: Wait for Kubernetes object collector deployments to complete kubernetes.core.k8s_info: api_version: opentelemetry.io/v1beta1 kind: OpenTelemetryCollector kubeconfig: "{{ tools_cluster.kubeconfig }}" - name: "{{ tools_instances.opentelemetry_collectors.names.kubernetes_events }}" + name: "{{ item }}" namespace: "{{ tools_instances.opentelemetry_collectors.namespace }}" wait: true register: tools_kubernetes_objects_collector_info @@ -167,6 +182,9 @@ - tools_kubernetes_objects_collector_info.resources[0].status.scale.statusReplicas | split('/') | unique | length == 1 retries: 10 delay: 30 + loop: + - "{{ tools_instances.opentelemetry_collectors.names.kubernetes_events }}" + - "{{ tools_instances.opentelemetry_collectors.names.kubernetes_objects_snapshot }}" when: - tools_required.opentelemetry diff 
--git a/sre/roles/tools/tasks/install_prometheus.yaml b/sre/roles/tools/tasks/install_prometheus.yaml index 3ca8207ad..464f70d39 100644 --- a/sre/roles/tools/tasks/install_prometheus.yaml +++ b/sre/roles/tools/tasks/install_prometheus.yaml @@ -41,6 +41,11 @@ additionalArgs: - name: web.enable-otlp-receiver value: "" + remoteWrite: + - url: "http://default:@{{ tools_instances.clickhouse.name }}-{{ tools_instances.clickhouse.name }}.{{ tools_instances.clickhouse.namespace }}.svc.cluster.local:9363/write" + queueConfig: + maxSamplesPerSend: 5000 + batchSendDeadline: 10s defaultRules: rules: alertmanager: false diff --git a/sre/roles/tools/tasks/reinit_clickhouse.yaml b/sre/roles/tools/tasks/reinit_clickhouse.yaml index dff6701d4..02941a6ad 100644 --- a/sre/roles/tools/tasks/reinit_clickhouse.yaml +++ b/sre/roles/tools/tasks/reinit_clickhouse.yaml @@ -31,15 +31,20 @@ wait: true register: tools_clickhouse_installations_info - - name: List all tables in the default database + - name: List tables from all existing databases kubernetes.core.k8s_exec: kubeconfig: "{{ tools_cluster.kubeconfig }}" namespace: "{{ installation.metadata.namespace }}" pod: "{{ installation.status.pods | first }}" - command: clickhouse-client --query="SHOW TABLES FROM default FORMAT TSV" + command: | + clickhouse-client --query=" + SELECT concat('`', database, '`.`', name, '`') AS full_name + FROM system.tables + WHERE database IN ('default', 'prometheus') + FORMAT TSV" loop: "{{ tools_clickhouse_installations_info.resources }}" loop_control: - label: clickhouseinstallation/{{ installation.metadata.name }} + label: "clickhouseinstallation/{{ installation.metadata.name }}" loop_var: installation register: tools_clickhouse_installation_tables_output when: @@ -47,36 +52,26 @@ - name: Create flattened list of table and instance groupings ansible.builtin.set_fact: - tools_clickhouse_instance_table_groups: | + tools_clickhouse_instance_table_groups: >- {{ - ( - tools_clickhouse_instance_table_groups | default([]) - ) + - ( - [ - output.stdout_lines | - product([output.installation]) | - flatten - ] - ) + (tools_clickhouse_instance_table_groups | default([])) + + (output.stdout_lines | product([output.installation]) | list) }} loop: "{{ tools_clickhouse_installation_tables_output.results }}" loop_control: - label: clickhouseinstallation/{{ output.installation.metadata.name }} + label: "clickhouseinstallation/{{ output.installation.metadata.name }}" loop_var: output when: - tools_clickhouse_installation_tables_output is defined + - output.stdout_lines is defined - output.stdout_lines | length > 0 - - name: Truncate default database ClickHouse tables + - name: Truncate ClickHouse tables in both databases kubernetes.core.k8s_exec: kubeconfig: "{{ tools_cluster.kubeconfig }}" - namespace: "{{ instance_table_group[1].metadata.namespace }}" - pod: "{{ instance_table_group[1].status.pods | first }}" - command: clickhouse-client --query="TRUNCATE TABLE default.{{ instance_table_group[0] }};" - loop: "{{ tools_clickhouse_instance_table_groups }}" + namespace: "{{ item[1].metadata.namespace }}" + pod: "{{ item[1].status.pods | first }}" + command: "clickhouse-client --query='TRUNCATE TABLE {{ item[0] }};'" + loop: "{{ tools_clickhouse_instance_table_groups | default([]) }}" loop_control: - label: table/{{ instance_table_group[0] | default('not-applicable') }} - loop_var: instance_table_group - when: - - tools_clickhouse_instance_table_groups is defined + label: "table/{{ item[0] }}" diff --git 
a/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml b/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml index 469ba9af2..2fee76369 100644 --- a/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml +++ b/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml @@ -11,7 +11,7 @@ - name: Extract in-cluster endpoint for Clickhouse cluster ansible.builtin.set_fact: - tools_clickhouse_endpoint: http://{{ tools_clickhouse_installation_info.resources[0].status.endpoint }}:8123 + tools_clickhouse_endpoint: http://{{ tools_clickhouse_installation_info.resources[0].status.endpoint }} when: - tools_clickhouse_installation_info is defined - tools_clickhouse_installation_info.resources | length > 0 diff --git a/sre/roles/tools/tasks/uninstall_clickhouse.yaml b/sre/roles/tools/tasks/uninstall_clickhouse.yaml index 7fa3cac89..a3fc80023 100644 --- a/sre/roles/tools/tasks/uninstall_clickhouse.yaml +++ b/sre/roles/tools/tasks/uninstall_clickhouse.yaml @@ -9,6 +9,7 @@ loop: - installation.yaml - templates.yaml + - init-prometheus-configmap.yaml - secret.yaml - name: Uninstall Clickhouse Operator diff --git a/sre/roles/tools/tasks/uninstall_prometheus.yaml b/sre/roles/tools/tasks/uninstall_prometheus.yaml index e41f121e4..bcc313365 100644 --- a/sre/roles/tools/tasks/uninstall_prometheus.yaml +++ b/sre/roles/tools/tasks/uninstall_prometheus.yaml @@ -55,3 +55,4 @@ name: "{{ helm_release.namespace }}" state: absent wait: true + wait_timeout: 600 diff --git a/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects.j2 b/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-events.j2 similarity index 84% rename from sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects.j2 rename to sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-events.j2 index d0805e4db..4b79b663e 100644 --- a/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects.j2 +++ b/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-events.j2 @@ -21,6 +21,13 @@ rules: verbs: - list - watch + - apiGroups: + - "" + resources: + - events + verbs: + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -61,18 +68,20 @@ spec: - debug - clickhouse receivers: - - otlp + - k8sobjects receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 + k8sobjects: + objects: + - name: events + mode: watch + group: events.k8s.io + - name: events + mode: watch + group: "" exporters: clickhouse: username: "{{ tools_clickhouse_username }}" password: "{{ tools_clickhouse_password }}" - endpoint: "{{ tools_clickhouse_endpoint }}" + endpoint: "{{ tools_clickhouse_endpoint }}:8123" logs_table_name: kubernetes_events debug: {} diff --git a/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-snapshot.j2 b/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-snapshot.j2 new file mode 100644 index 000000000..a213ae7c8 --- /dev/null +++ b/sre/roles/tools/templates/opentelemetry/collectors/kubernetes-objects-snapshot.j2 @@ -0,0 +1,227 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kubernetes-objects-snapshot-otel-collector + namespace: {{ tools_instances.opentelemetry_collectors.namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: kubernetes-objects-snapshot-otel-collector + app.kubernetes.io/component: observability + app.kubernetes.io/part-of: it-bench + name: 
kubernetes-objects-snapshot-otel-collector +rules: + - apiGroups: [""] + resources: + - pods + - services + - endpoints + - persistentvolumeclaims + - persistentvolumes + - configmaps + - serviceaccounts + - nodes + - namespaces + - resourcequotas + - limitranges + - replicationcontrollers + verbs: ["get", "list"] + - apiGroups: ["apps"] + resources: + - deployments + - daemonsets + - statefulsets + - replicasets + verbs: ["get", "list"] + - apiGroups: ["batch"] + resources: + - jobs + - cronjobs + verbs: ["get", "list"] + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + - networkpolicies + verbs: ["get", "list"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: + - roles + - rolebindings + - clusterroles + - clusterrolebindings + verbs: ["get", "list"] + - apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["get", "list"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["get", "list"] + - apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["get", "list"] + - apiGroups: ["apiextensions.k8s.io"] + resources: + - customresourcedefinitions + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: kubernetes-objects-snapshot-otel-collector + app.kubernetes.io/component: observability + app.kubernetes.io/part-of: it-bench + name: kubernetes-objects-snapshot-otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubernetes-objects-snapshot-otel-collector +subjects: + - kind: ServiceAccount + name: kubernetes-objects-snapshot-otel-collector + namespace: {{ tools_instances.opentelemetry_collectors.namespace }} +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{ tools_instances.opentelemetry_collectors.names.kubernetes_objects_snapshot | default('kubernetes-objects-snapshot') }} + namespace: {{ tools_instances.opentelemetry_collectors.namespace }} +spec: + image: {{ container_image }} + mode: deployment + observability: + metrics: + enableMetrics: {{ tools_required.prometheus }} + serviceAccount: kubernetes-objects-snapshot-otel-collector + podAnnotations: + openshift.io/required-scc: restricted-v2 + config: + service: + pipelines: + logs: + exporters: + - debug + - clickhouse + receivers: + - k8sobjects + receivers: + k8sobjects: + objects: + - name: pods + mode: pull + interval: 5m + - name: services + mode: pull + interval: 5m + - name: endpoints + mode: pull + interval: 5m + - name: configmaps + mode: pull + interval: 5m + - name: serviceaccounts + mode: pull + interval: 5m + - name: namespaces + mode: pull + interval: 5m + - name: nodes + mode: pull + interval: 5m + - name: persistentvolumeclaims + mode: pull + interval: 5m + - name: persistentvolumes + mode: pull + interval: 5m + - name: resourcequotas + mode: pull + interval: 5m + - name: limitranges + mode: pull + interval: 5m + - name: replicationcontrollers + mode: pull + interval: 5m + - name: deployments + mode: pull + interval: 5m + group: apps + - name: daemonsets + mode: pull + interval: 5m + group: apps + - name: statefulsets + mode: pull + interval: 5m + group: apps + - name: replicasets + mode: pull + interval: 5m + group: apps + - name: jobs + mode: pull + interval: 5m + group: batch + - name: cronjobs + mode: pull + interval: 5m + group: batch + - name: ingresses + mode: pull + interval: 5m + group: networking.k8s.io + - name: networkpolicies + mode: 
pull + interval: 5m + group: networking.k8s.io + - name: roles + mode: pull + interval: 5m + group: rbac.authorization.k8s.io + - name: rolebindings + mode: pull + interval: 5m + group: rbac.authorization.k8s.io + - name: clusterroles + mode: pull + interval: 5m + group: rbac.authorization.k8s.io + - name: clusterrolebindings + mode: pull + interval: 5m + group: rbac.authorization.k8s.io + - name: storageclasses + mode: pull + interval: 5m + group: storage.k8s.io + - name: volumeattachments + mode: pull + interval: 5m + group: storage.k8s.io + - name: horizontalpodautoscalers + mode: pull + interval: 5m + group: autoscaling + - name: poddisruptionbudgets + mode: pull + interval: 5m + group: policy + - name: customresourcedefinitions + mode: pull + interval: 5m + group: apiextensions.k8s.io + exporters: + clickhouse: + username: "{{ tools_clickhouse_username }}" + password: "{{ tools_clickhouse_password }}" + endpoint: "{{ tools_clickhouse_endpoint }}:8123" + logs_table_name: kubernetes_objects_snapshot + debug: {}
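To confirm that both new write paths are landing data, the two destinations can be inspected directly. A minimal sketch using clickhouse-connect follows; the host, port, and credentials are placeholders for a locally port-forwarded ClickHouse service.

import clickhouse_connect

# Assumption: kubectl port-forward svc/<clickhouse-service> 8123:8123 is running
client = clickhouse_connect.get_client(
    host="localhost", port=8123, username="default", password=""
)
# Rows written by the kubernetes-objects-snapshot collector's ClickHouse exporter
print(client.query("SELECT count() FROM kubernetes_objects_snapshot").result_rows)
# The TimeSeries engine manages inner data/tags/metrics tables; listing the prometheus
# database confirms the Prometheus remote-write target exists
print(client.query("SELECT name FROM system.tables WHERE database = 'prometheus'").result_rows)
client.close()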