
Commit 97190cf

Streamline configuration & install Prometheus and Grafana as subcharts (#40)
* install Grafana via subchart
* Update JSON schema
* Update helm docs
* add dependency in ci
* add dependency in ci
* update helmignore
* update helpers
* disable rbac validation and simplify prometheus helpers for now
* fix NOTES
* fix CI
* temporary fix for CI
* install Prometheus as subchart
* Update JSON schema
* Update helm docs
* remove grafana-legacy configs
* Update JSON schema
* Update helm docs
* streamline prometheus configuration
* Update JSON schema
* Update helm docs
* update grafana helpers
* add parameter validation for prometheus and grafana
* disable prometheus ingress by default
* Update JSON schema
* Update helm docs
* add a ci script for local testing, fix GH CI
* add Chart.lock
* update Chart.lock
* update CI
* update CI
* update CI
* update .gitattributes
* update values
* fix CI
* fix CI
* move ingress configuration under envoy:
* Update JSON schema
* Update helm docs
* fix grpcEndpoint helper
* clean up values files
* improve grafana and prometheus logic
* update dashboard
* improve monitoring
* add validation for grafana datasources
* add grafana datasources to values files
* add missing labels
* prevent deploying ingresses with duplicate names
* further streamline *.tpl helpers
* update values file
* update documentation
* update README
* Update helm docs
* update diagram
* clean up
* removing the whitespace removal for the auth cluster section of envoy proxy config (@briedel)

---------

Co-authored-by: GitHub Actions <[email protected]>
1 parent d9f8474 commit 97190cf


45 files changed (+1872, −822 lines)
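With this change, Prometheus and Grafana are pulled in as Helm subcharts rather than templated directly. A minimal sketch of what the resulting `dependencies:` section of `helm/supersonic/Chart.yaml` presumably looks like is shown below; the repository URLs and the `prometheus.enabled`/`grafana.enabled` condition flags appear elsewhere in this diff, while the version constraints are illustrative assumptions (the exact pins live in the Chart.lock added by this commit):

```yaml
# Sketch only: subchart dependencies implied by this commit.
# Version constraints are assumed; the real pins are recorded in Chart.lock.
dependencies:
  - name: prometheus
    repository: https://prometheus-community.github.io/helm-charts
    version: ">=25.0.0"             # assumed constraint
    condition: prometheus.enabled   # flag documented in docs/.values-table.md
  - name: grafana
    repository: https://grafana.github.io/helm-charts
    version: ">=8.0.0"              # assumed constraint
    condition: grafana.enabled      # flag documented in docs/.values-table.md
```

This is also why both CI workflows below now run `helm repo add` and `helm dependency build` before installing the chart.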

.gitattributes

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 *.json linguist-detectable
 *.yml linguist-detectable
-*.yaml linguist-detectable
+*.yaml linguist-detectable
+*.tpl linguist-language=Go

.github/workflows/ci-github-cms.yaml

Lines changed: 6 additions & 3 deletions
@@ -51,6 +51,9 @@ jobs:
 
       - name: Deploy Helm chart
         run: |
+          helm repo add grafana https://grafana.github.io/helm-charts
+          helm repo update
+          helm dependency build ./helm/supersonic
           helm upgrade --install supersonic ./helm/supersonic \
             --values values/values-cms-ci.yaml -n cms
 
@@ -64,12 +67,12 @@ jobs:
 
       - name: Prometheus ready
         run: |
-          kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=prometheus --timeout 120s -n cms
-          kubectl get svc,pod -l app.kubernetes.io/component=prometheus -n cms
+          kubectl wait --for condition=Ready pod -l app.kubernetes.io/name=prometheus --timeout 120s -n cms
+          kubectl get svc,pod -l app.kubernetes.io/name=prometheus -n cms
 
       - name: Grafana ready
         run: |
-          kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=grafana --timeout 120s -n cms
+          kubectl wait --for condition=Ready pod -l app.kubernetes.io/name=grafana --timeout 120s -n cms
 
       - name: Triton server ready
         run: |

.github/workflows/ci-local.sh

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+echo "Starting deployment process..."
+
+# 1. Create a Kubernetes cluster with Kind
+echo "Creating Kind cluster..."
+kind create cluster --name gh-k8s-cluster
+
+# 2. (Assuming Helm is installed and at the proper version)
+
+# 3. Create CMS namespace
+echo "Creating CMS namespace..."
+kubectl create namespace cms
+
+# 4. Install Prometheus Operator CRDs
+echo "Installing Prometheus Operator CRDs..."
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+kubectl create namespace monitoring
+helm install prometheus-operator prometheus-community/kube-prometheus-stack \
+  --namespace monitoring \
+  --set prometheusOperator.createCustomResource=false \
+  --set defaultRules.create=false \
+  --set alertmanager.enabled=false \
+  --set prometheus.enabled=false \
+  --set grafana.enabled=false
+
+# 5. Install KEDA Autoscaler
+echo "Installing KEDA Autoscaler..."
+helm repo add kedacore https://kedacore.github.io/charts
+helm repo update
+kubectl create namespace keda
+helm install keda kedacore/keda --namespace keda
+
+# 6. Mount CVMFS
+echo "Mounting CVMFS..."
+kubectl create namespace cvmfs-csi
+helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi \
+  --values ci/values-cvmfs-csi.yaml
+kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi
+
+# 7. Deploy the Helm chart for supersonic
+echo "Deploying Helm chart for supersonic..."
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+helm dependency build ./helm/supersonic
+helm upgrade --install supersonic ./helm/supersonic --values values/values-cms-ci.yaml -n cms
+
+# 8. Wait for components to become ready
+
+echo "Waiting for CVMFS pods to be ready..."
+kubectl wait --for=condition=Ready pod --all -n cvmfs-csi --timeout 120s
+
+echo "Waiting for Envoy proxy pods to be ready..."
+kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=envoy --timeout 120s -n cms
+
+echo "Waiting for Prometheus pods to be ready..."
+kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus --timeout 120s -n cms
+kubectl get svc,pod -l app.kubernetes.io/name=prometheus -n cms
+
+echo "Waiting for Grafana pods to be ready..."
+kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana --timeout 120s -n cms
+
+echo "Waiting for Triton server pods to be ready..."
+kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=triton --timeout 300s -n cms
+
+echo "Waiting for KEDA Autoscaler to be ready..."
+kubectl wait --for=condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 120s -n cms
+kubectl wait --for=condition=Ready so -l app.kubernetes.io/component=keda --timeout 120s -n cms
+
+# 9. Validate the Deployment
+echo "Validating Deployment in 'cms' namespace..."
+kubectl get all -n cms
+
+# 10. Run Perf Analyzer Job
+echo "Running Perf Analyzer Job..."
+kubectl apply -f ci/perf-analyzer-job.yaml
+kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=180s || {
+  echo "Perf-analyzer job did not complete in time or failed."
+  exit 1
+}
+
+# Retrieve and print the logs from the Perf Analyzer pod
+POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}")
+echo "========== Perf Analyzer Logs =========="
+kubectl logs -n cms "$POD_NAME"
+echo "========================================"
+
+# 11. Cleanup the Kind cluster
+echo "Cleaning up: Deleting Kind cluster..."
+kind delete cluster --name gh-k8s-cluster
+
+echo "Deployment process completed successfully!"

.github/workflows/helm-lint.yaml

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,10 @@ jobs:
 
       - name: Lint values.yaml files in values/ directory
         run: |
+          helm repo add grafana https://grafana.github.io/helm-charts
+          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+          helm repo update
+          helm dependency build ./helm/supersonic
           CHART_PATH="helm/supersonic/"
           VALUES_DIR="values/"
 

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -1,2 +1,4 @@
 # Sphinx Documentation
-docs/_build
+docs/_build
+
+*.tgz

README.md

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@ The main components of SuperSONIC are:
 
 ```
 helm repo add fastml https://fastmachinelearning.org/SuperSONIC
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo add grafana https://grafana.github.io/helm-charts
 helm repo update
 helm install <release-name> fastml/supersonic --values <your-values.yaml> -n <namespace>
 ```
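For reference, a hypothetical `<your-values.yaml>` that points the chart at an existing cluster Prometheus (rather than deploying the bundled one) could look like the sketch below; the keys come from the values table in the next section, and the URL is a placeholder:

```yaml
# Sketch only: use an external Prometheus instead of the subchart.
prometheus:
  enabled: false                  # skip the bundled Prometheus subchart
  external:
    enabled: true                 # connect to an existing instance
    url: prometheus.example.org   # placeholder; ask your cluster admins
    port: 443
    scheme: https
```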

docs/.values-table.md

Lines changed: 23 additions & 17 deletions
@@ -3,6 +3,8 @@
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
 | nameOverride | string | `""` | Unique identifier of SuperSONIC instance (equal to release name by default) |
+| serverLoadMetric | string | `""` | A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. # Default metric (inference queue latency) is defined in templates/_helpers.tpl |
+| serverLoadThreshold | int | `100` | Threshold for the metric |
 | triton.replicas | int | `1` | Number of Triton server instances (if autoscaling is disabled) |
 | triton.image | string | `"nvcr.io/nvidia/tritonserver:24.12-py3-min"` | Docker image for the Triton server |
 | triton.command | list | `["/bin/sh","-c"]` | Command and arguments to run in Triton container |
@@ -22,6 +24,7 @@
 | envoy.resources | object | `{"limits":{"cpu":2,"memory":"4G"},"requests":{"cpu":1,"memory":"2G"}}` | Resource requests and limits for Envoy Proxy. Note: an Envoy Proxy with too many connections might run out of CPU |
 | envoy.service.type | string | `"ClusterIP"` | This is the client-facing endpoint. In order to be able to connect to it, either enable ingress, or use type: LoadBalancer. |
 | envoy.service.ports | list | `[{"name":"grpc","port":8001,"targetPort":8001},{"name":"admin","port":9901,"targetPort":9901}]` | Envoy Service ports |
+| envoy.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""}` | Ingress configuration for Envoy |
 | envoy.grpc_route_timeout | string | `"0s"` | Timeout for gRPC route in Envoy; disabled by default (0s), preventing Envoy from closing connections too early. |
 | envoy.rate_limiter.listener_level | object | `{"enabled":false,"fill_interval":"12s","max_tokens":5,"tokens_per_fill":1}` | This rate limiter explicitly controls the number of client connections to the Envoy Proxy. |
 | envoy.rate_limiter.listener_level.enabled | bool | `false` | Enable rate limiter |
@@ -47,22 +50,25 @@
 | autoscaler.scaleDown.window | int | `600` | |
 | autoscaler.scaleDown.period | int | `120` | |
 | autoscaler.scaleDown.stepsize | int | `1` | |
-| prometheus | object | `{"external":true,"ingress":{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""},"port":443,"scheme":"https","serverLoadMetric":"","serverLoadThreshold":100,"url":""}` | Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter |
-| prometheus.external | bool | `true` | Whether to use external Prometheus instance (true) or deploy internal one (false) |
-| prometheus.url | string | `""` | External Prometheus server url and port number (find in documentation of a given cluster or ask admins) Only used when external=true |
-| prometheus.scheme | string | `"https"` | Specify whether external Prometheus endpoint is exposed as http or https Only used when external=true |
-| prometheus.serverLoadMetric | string | `""` | A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter. # Default metric (inference queue latency) is defined in templates/_helpers.tpl |
-| prometheus.serverLoadThreshold | int | `100` | Threshold for the metric |
-| prometheus.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":""}` | Ingress configuration for internal Prometheus web UI (only used when external=false) |
-| ingress.enabled | bool | `false` | |
-| ingress.hostName | string | `""` | |
-| ingress.ingressClassName | string | `""` | |
-| ingress.annotations | object | `{}` | |
 | nodeSelector | object | `{}` | Node selector for all pods (Triton and Envoy) |
 | tolerations | list | `[]` | Tolerations for all pods (Triton and Envoy) |
-| grafana.enabled | bool | `false` | Enable or disable Grafana deployment |
-| grafana.ingress | object | `{"annotations":{},"enabled":false,"hostName":"","ingressClassName":"haproxy"}` | Ingress configuration for Grafana |
-| grafana.ingress.enabled | bool | `false` | Enable or disable ingress for Grafana |
-| grafana.ingress.hostName | string | `""` | Hostname for Grafana ingress |
-| grafana.ingress.ingressClassName | string | `"haproxy"` | Ingress class name (e.g. nginx, haproxy) |
-| grafana.ingress.annotations | object | `{}` | Additional annotations for Grafana ingress |
+| prometheus | object | `{"alertmanager":{"enabled":false},"configmapReload":{"prometheus":{"enabled":false}},"enabled":false,"external":{"enabled":false,"port":443,"scheme":"https","url":""},"kube-state-metrics":{"enabled":false},"prometheus-node-exporter":{"enabled":false},"prometheus-pushgateway":{"enabled":false},"pushgateway":{"enabled":false},"rbac":{"create":false},"server":{"configMapOverrideName":"prometheus-config","global":{"evaluation_interval":"5s","scrape_interval":"5s"},"ingress":{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","tls":[{"hosts":[]}]},"persistentVolume":{"enabled":false},"releaseNamespace":true,"resources":{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"retention":"15d","service":{"enabled":true,"servicePort":9090},"useExistingClusterRoleName":"supersonic-prometheus-role"},"serviceAccounts":{"server":{"create":false,"name":"supersonic-prometheus-sa"}}}` | Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter |
+| prometheus.external.enabled | bool | `false` | Enable external Prometheus instance |
+| prometheus.external.url | string | `""` | External Prometheus server url |
+| prometheus.external.port | int | `443` | External Prometheus server port number |
+| prometheus.external.scheme | string | `"https"` | Specify whether external Prometheus endpoint is exposed as http or https |
+| prometheus.enabled | bool | `false` | Enable or disable Prometheus subchart deployment |
+| prometheus.server | object | `{"configMapOverrideName":"prometheus-config","global":{"evaluation_interval":"5s","scrape_interval":"5s"},"ingress":{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","tls":[{"hosts":[]}]},"persistentVolume":{"enabled":false},"releaseNamespace":true,"resources":{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"retention":"15d","service":{"enabled":true,"servicePort":9090},"useExistingClusterRoleName":"supersonic-prometheus-role"}` | Prometheus Helm chart configuration (https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus) |
+| grafana.enabled | bool | `false` | |
+| grafana.adminUser | string | `"admin"` | |
+| grafana.adminPassword | string | `"admin"` | |
+| grafana.persistence.enabled | bool | `false` | |
+| grafana.rbac.create | bool | `false` | |
+| grafana.serviceAccount.create | bool | `false` | |
+| grafana.datasources | object | `{"datasources.yaml":{"apiVersion":1,"datasources":[{"access":"proxy","isDefault":true,"jsonData":{"timeInterval":"5s","tlsSkipVerify":true},"name":"prometheus","type":"prometheus","url":"http://supersonic-prometheus-server:9090"}]}}` | Grafana datasources configuration |
+| grafana.dashboardProviders | object | `{"dashboardproviders.yaml":{"apiVersion":1,"providers":[{"disableDeletion":false,"editable":true,"folder":"","name":"default","options":{"path":"/var/lib/grafana/dashboards/default"},"orgId":1,"type":"file"}]}}` | Grafana dashboard providers configuration |
+| grafana.dashboardsConfigMaps | object | `{"default":"supersonic-grafana-default-dashboard"}` | Grafana dashboard ConfigMaps |
+| grafana."grafana.ini" | object | `{"auth":{"disable_login_form":true},"auth.anonymous":{"enabled":true,"org_role":"Admin"},"dashboards":{"default_home_dashboard_path":"/var/lib/grafana/dashboards/default/default.json"}}` | Grafana.ini configuration |
+| grafana.resources | object | `{"limits":{"cpu":1,"memory":"1Gi"},"requests":{"cpu":"100m","memory":"128Mi"}}` | Resource limits and requests for Grafana |
+| grafana.service | object | `{"port":80,"targetPort":3000,"type":"ClusterIP"}` | Service configuration |
+| grafana.ingress | object | `{"annotations":{},"enabled":false,"hosts":[],"ingressClassName":"","path":"/","pathType":"ImplementationSpecific","tls":[]}` | Ingress configuration |
