Skip to content

Commit d9f8474

Browse files
authored
Re-use Prometheus and/or Grafana when installing multiple SuperSONIC instances in the same namespace (#39)
* Re-use Prometheus and/or Grafana when installing multiple SuperSONIC instances in the same namespace * add app label to prometheus metrics * display metrics from multiple instances in the same dashboard * print scaling metric in `helm install` message * fix * clean up tpl files and fix prometheus address * improve prometheusUrl and grafanaUrl logic * move scaling metric to a separate file for clarity * connect envoy to existing prometheus if possible * reconfigure prometheus to scrape metrics via servicemonitor
1 parent 14709da commit d9f8474

19 files changed

+298
-121
lines changed

helm/supersonic/dashboards/default.json

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
"colorMode": "value",
9393
"graphMode": "none",
9494
"justifyMode": "auto",
95-
"orientation": "auto",
95+
"orientation": "horizontal",
9696
"reduceOptions": {
9797
"calcs": [
9898
"lastNotNull"
@@ -112,7 +112,7 @@
112112
},
113113
"disableTextWrap": false,
114114
"editorMode": "code",
115-
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=\"${release_name}\"})",
115+
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})",
116116
"fullMetaSearch": false,
117117
"includeNullMetadata": true,
118118
"instant": false,
@@ -197,7 +197,7 @@
197197
"calcs": [],
198198
"displayMode": "list",
199199
"placement": "bottom",
200-
"showLegend": false
200+
"showLegend": true
201201
},
202202
"tooltip": {
203203
"mode": "multi",
@@ -211,7 +211,7 @@
211211
"uid": "prometheus"
212212
},
213213
"expr": "${server_load_metric}",
214-
"legendFormat": "__auto",
214+
"legendFormat": "{{release}}",
215215
"refId": "A"
216216
}
217217
],
@@ -328,7 +328,7 @@
328328
"uid": "prometheus"
329329
},
330330
"editorMode": "code",
331-
"expr": "sum by (release) ( rate(nv_inference_compute_infer_duration_us{release=\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=\"${release_name}\"}[15s]) * 1000) + 0.001)",
331+
"expr": "sum by (app) ( rate(nv_inference_compute_infer_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
332332
"instant": false,
333333
"legendFormat": "Inference",
334334
"range": true,
@@ -340,7 +340,7 @@
340340
"uid": "prometheus"
341341
},
342342
"editorMode": "code",
343-
"expr": "sum by (release) ( rate(nv_inference_queue_duration_us{release=\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=\"${release_name}\"}[15s]) * 1000) + 0.001)",
343+
"expr": "sum by (app) ( rate(nv_inference_queue_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
344344
"hide": false,
345345
"instant": false,
346346
"legendFormat": "Queue",
@@ -353,7 +353,7 @@
353353
"uid": "prometheus"
354354
},
355355
"editorMode": "code",
356-
"expr": "sum by (release) ( rate(nv_inference_compute_input_duration_us{release=\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=\"${release_name}\"}[15s]) * 1000) + 0.001)",
356+
"expr": "sum by (app) ( rate(nv_inference_compute_input_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
357357
"hide": false,
358358
"instant": false,
359359
"legendFormat": "Input",
@@ -366,7 +366,7 @@
366366
"uid": "prometheus"
367367
},
368368
"editorMode": "code",
369-
"expr": "sum by (release) ( rate(nv_inference_compute_output_duration_us{release=\"${release_name}\"}[15s])) /sum by (release) ( (rate(nv_inference_exec_count{release=\"${release_name}\"}[15s]) * 1000) + 0.001)",
369+
"expr": "sum by (app) ( rate(nv_inference_compute_output_duration_us{release=~\"${release_name}\"}[15s])) /sum by (app) ( (rate(nv_inference_exec_count{release=~\"${release_name}\"}[15s]) * 1000) + 0.001)",
370370
"hide": false,
371371
"instant": false,
372372
"legendFormat": "Output",
@@ -379,7 +379,7 @@
379379
"uid": "prometheus"
380380
},
381381
"editorMode": "code",
382-
"expr": " sum(\n rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=\"${release_name}\"}[15s])\n /\n rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=\"${release_name}\"}[15s])\n ) by (release)",
382+
"expr": " sum(\n rate(envoy_http_downstream_rq_time_sum{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n /\n rate(envoy_http_downstream_rq_time_count{envoy_http_conn_manager_prefix=\"ingress_grpc\", release=~\"${release_name}\"}[15s])\n ) by (app)",
383383
"hide": false,
384384
"instant": false,
385385
"legendFormat": "Total (measured at proxy)",
@@ -462,7 +462,7 @@
462462
"calcs": [],
463463
"displayMode": "list",
464464
"placement": "bottom",
465-
"showLegend": false
465+
"showLegend": true
466466
},
467467
"tooltip": {
468468
"mode": "multi",
@@ -477,7 +477,7 @@
477477
},
478478
"disableTextWrap": false,
479479
"editorMode": "code",
480-
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=\"${release_name}\"})",
480+
"expr": "sum by(release)(envoy_cluster_membership_total{envoy_cluster_name=\"triton_grpc_service\", release=~\"${release_name}\"})",
481481
"fullMetaSearch": false,
482482
"includeNullMetadata": true,
483483
"instant": false,
@@ -909,7 +909,7 @@
909909
"calcs": [],
910910
"displayMode": "list",
911911
"placement": "right",
912-
"showLegend": false
912+
"showLegend": true
913913
},
914914
"tooltip": {
915915
"mode": "multi",
@@ -925,7 +925,7 @@
925925
},
926926
"editorMode": "code",
927927
"exemplar": false,
928-
"expr": "sum (rate(nv_inference_count{release=\"${release_name}\"}[15s]))by(release)",
928+
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(release)",
929929
"instant": false,
930930
"interval": "",
931931
"legendFormat": "{{ release }}",
@@ -1022,7 +1022,7 @@
10221022
},
10231023
"editorMode": "code",
10241024
"exemplar": false,
1025-
"expr": "sum (rate(nv_inference_count{release=\"${release_name}\"}[15s]))by(pod)",
1025+
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(pod)",
10261026
"instant": false,
10271027
"interval": "",
10281028
"legendFormat": "{{ pod }}",
@@ -1119,7 +1119,7 @@
11191119
},
11201120
"editorMode": "code",
11211121
"exemplar": false,
1122-
"expr": "sum (rate(nv_inference_count{release=\"${release_name}\"}[15s]))by(model)",
1122+
"expr": "sum (rate(nv_inference_count{release=~\"${release_name}\"}[15s]))by(model)",
11231123
"instant": false,
11241124
"interval": "",
11251125
"legendFormat": "{{ model }}",

helm/supersonic/dashboards/variables.json

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,16 @@
1111
"type": "prometheus",
1212
"uid": "prometheus"
1313
},
14-
"definition": "label_values(release)",
15-
"hide": 2,
14+
"definition": "label_values(up{app=\"%CHART_NAME%\", namespace=\"%NAMESPACE%\"}, release)",
15+
"hide": 0,
1616
"includeAll": true,
1717
"label": "Release",
1818
"multi": true,
1919
"name": "release_name",
2020
"options": [],
2121
"query": {
2222
"qryType": 1,
23-
"query": "label_values(release)",
23+
"query": "label_values(up{app=\"%CHART_NAME%\", namespace=\"%NAMESPACE%\"}, release)",
2424
"refId": "PrometheusVariableQueryEditor-VariableQuery"
2525
},
2626
"refresh": 1,
@@ -45,14 +45,6 @@
4545
"skipUrlSync": false,
4646
"type": "constant"
4747
},
48-
{
49-
"hide": 2,
50-
"label": "Release Name",
51-
"name": "release_name",
52-
"query": "%RELEASE_NAME%",
53-
"skipUrlSync": false,
54-
"type": "constant"
55-
},
5648
{
5749
"hide": 2,
5850
"label": "Version",

helm/supersonic/templates/NOTES.txt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,24 @@ SuperSONIC chart successfully installed!
1515
| Instance name *: {{ (include "supersonic.name" .) }}
1616
| * equal to release name, unless nameOverride is specified.
1717
└-----------------------------------------------------------------------------┘
18+
19+
Scaling metric:{{ if not ( eq .Values.prometheus.serverLoadMetric "" ) }} {{ .Values.prometheus.serverLoadMetric }}{{ else }}{{ include "supersonic.defaultMetric" . | nindent 4 }}{{ end }}
20+
21+
Scaling threshold: {{ include "supersonic.serverLoadThreshold" . }}{{"\n"}}
22+
23+
{{- if or (and .Values.grafana.enabled (eq (include "supersonic.grafanaExists" .) "true")) (eq (include "supersonic.prometheusExists" .) "true") .Values.prometheus.external }}
24+
┌-----------------------------------------------------------------------------┐
25+
| NOTICE: Using existing/external monitoring components
26+
{{- if and .Values.grafana.enabled (eq (include "supersonic.grafanaExists" .) "true") }}
27+
| • Re-using existing Grafana instance
28+
{{- end }}
29+
{{- if .Values.prometheus.external }}
30+
| • Using external Prometheus instance
31+
{{- else if (eq (include "supersonic.prometheusExists" .) "true") }}
32+
| • Re-using existing Prometheus instance
33+
{{- end }}
34+
└-----------------------------------------------------------------------------┘
35+
{{- end }}
1836
┌-----------------------------------------------------------------------------┐
1937
| Documentation: https://fastmachinelearning.org/SuperSONIC
2038
|
@@ -24,11 +42,9 @@ SuperSONIC chart successfully installed!
2442
| gRPC endpoint: {{ include "supersonic.grpcEndpoint" . }}
2543
{{- end }}
2644
|
27-
{{- if (or (not .Values.prometheus.external) .Values.prometheus.url) }}
2845
| Prometheus UI: {{ include "supersonic.prometheusUrl" . }}
29-
{{- end }}
3046
|
3147
{{- if .Values.grafana.enabled }}
32-
| Grafana dashboard: https://{{ .Values.grafana.ingress.hostName }}
48+
| Grafana dashboard: {{ include "supersonic.grafanaUrl" . }}
3349
{{- end }}
3450
└-----------------------------------------------------------------------------┘
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{{/*
2+
Get Grafana name
3+
*/}}
4+
{{- define "supersonic.grafanaName" -}}
5+
{{- printf "%s-grafana" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
6+
{{- end -}}
7+
8+
{{/*
9+
Check if a Grafana Service belonging to a different SuperSONIC instance already exists in the namespace
10+
*/}}
11+
{{- define "supersonic.grafanaExists" -}}
12+
{{- $root := . -}}
13+
{{- $exists := false -}}
14+
{{- if (lookup "v1" "Service" .Release.Namespace "") -}}
15+
{{- range (lookup "v1" "Service" .Release.Namespace "").items -}}
16+
{{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic")
17+
(eq (index .metadata.labels "app.kubernetes.io/component") "grafana")
18+
(ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}}
19+
{{- $exists = true -}}
20+
{{- break -}}
21+
{{- end -}}
22+
{{- end -}}
23+
{{- end -}}
24+
{{- $exists -}}
25+
{{- end -}}
26+
27+
{{/*
28+
Get the name of an existing Grafana Service in the namespace that belongs to a different SuperSONIC instance
29+
*/}}
30+
{{- define "supersonic.existingGrafanaName" -}}
31+
{{- $root := . -}}
32+
{{- range (lookup "v1" "Service" .Release.Namespace "").items }}
33+
{{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic")
34+
(eq (index .metadata.labels "app.kubernetes.io/component") "grafana")
35+
(ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}}
36+
{{- .metadata.name -}}
37+
{{- break }}
38+
{{- end }}
39+
{{- end }}
40+
{{- end -}}
41+
42+
{{/*
43+
Get Grafana URL (handles ingress, existing, and new instances)
44+
*/}}
45+
{{- define "supersonic.grafanaUrl" -}}
46+
{{- if and .Values.grafana.ingress.enabled .Values.grafana.ingress.hostName -}}
47+
https://{{ .Values.grafana.ingress.hostName }}
48+
{{- else -}}
49+
{{- $foundIngress := false -}}
50+
{{- if (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "") -}}
51+
{{- $root := . -}}
52+
{{- range (lookup "networking.k8s.io/v1" "Ingress" .Release.Namespace "").items -}}
53+
{{- if and (eq (index .metadata.labels "app.kubernetes.io/name") "supersonic")
54+
(eq (index .metadata.labels "app.kubernetes.io/component") "grafana")
55+
(ne (index .metadata.labels "app.kubernetes.io/instance") (include "supersonic.name" $root))}}
56+
{{- range .spec.rules -}}
57+
{{- if .host -}}
58+
{{- $foundIngress = true -}}
59+
https://{{ .host }}
60+
{{- break -}}
61+
{{- end -}}
62+
{{- end -}}
63+
{{- break -}}
64+
{{- end -}}
65+
{{- end -}}
66+
{{- end -}}
67+
{{- if not $foundIngress -}}
68+
{{- if (eq (include "supersonic.grafanaExists" .) "true") -}}
69+
http://{{ include "supersonic.existingGrafanaName" . }}.{{ .Release.Namespace }}.svc.cluster.local
70+
{{- else -}}
71+
http://{{ include "supersonic.grafanaName" . }}.{{ .Release.Namespace }}.svc.cluster.local
72+
{{- end -}}
73+
{{- end -}}
74+
{{- end -}}
75+
{{- end -}}
Lines changed: 14 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
{{- /* templates/_helpers.tpl */ -}}
2+
3+
{{/*
4+
Get release name (or override)
5+
*/}}
26
{{- define "supersonic.name" -}}
37
{{- if .Values.nameOverride }}
48
{{- printf "%s" .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
@@ -7,62 +11,25 @@
711
{{- end }}
812
{{- end -}}
913

14+
{{/*
15+
Get Triton name
16+
*/}}
1017
{{- define "supersonic.tritonName" -}}
1118
{{- printf "%s-triton" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
1219
{{- end -}}
1320

21+
{{/*
22+
Get Envoy name
23+
*/}}
1424
{{- define "supersonic.envoyName" -}}
1525
{{- printf "%s-envoy" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
1626
{{- end -}}
1727

18-
{{- define "supersonic.prometheusName" -}}
19-
{{- printf "%s-prometheus" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
20-
{{- end -}}
21-
22-
{{- define "supersonic.grafanaName" -}}
23-
{{- printf "%s-grafana" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
24-
{{- end -}}
25-
26-
{{- define "supersonic.defaultMetric" -}}
27-
{{- if not ( eq .Values.prometheus.serverLoadMetric "" ) }}
28-
{{- printf "%s" .Values.prometheus.serverLoadMetric -}}
29-
{{- else }}
30-
sum by (job) (
31-
rate(nv_inference_queue_duration_us{job=~"{{ include "supersonic.tritonName" . }}"}[15s])
32-
)
33-
/
34-
sum by (job) (
35-
(rate(nv_inference_exec_count{job=~"{{ include "supersonic.tritonName" . }}"}[15s]) * 1000) + 0.001
36-
)
37-
{{- end }}
38-
{{- end }}
39-
28+
{{/*
29+
Get gRPC endpoint
30+
*/}}
4031
{{- define "supersonic.grpcEndpoint" -}}
4132
{{- if .Values.ingress.enabled -}}
4233
{{ .Values.ingress.hostName }}:443
4334
{{- end }}
44-
{{- end }}
45-
46-
{{- define "supersonic.prometheusUrl" -}}
47-
{{- if (not .Values.prometheus.external) -}}
48-
{{- if .Values.prometheus.ingress.enabled -}}
49-
https://{{ .Values.prometheus.ingress.hostName }}
50-
{{- else -}}
51-
http://{{ include "supersonic.prometheusName" . }}.{{ .Release.Namespace }}.svc.cluster.local:9090
52-
{{- end -}}
53-
{{- else if .Values.prometheus.url -}}
54-
{{ .Values.prometheus.scheme }}://{{ .Values.prometheus.url }}
55-
{{- end }}
56-
{{- end }}
57-
58-
{{- define "supersonic.validateRBACPermissions" -}}
59-
{{- if not .Values.prometheus.external -}}
60-
{{- $canReadRoles := false -}}
61-
{{- if (lookup "rbac.authorization.k8s.io/v1" "Role" .Release.Namespace "") -}}
62-
{{- $canReadRoles = true -}}
63-
{{- end -}}
64-
{{- if not $canReadRoles -}}
65-
{{- fail "\nError: Failed to install Prometheus due to lack of permissions to get 'roles' in API group 'rbac.authorization.k8s.io'.\nEither:\n1. Set prometheus.external=true in value.yaml and provide an external Prometheus URL, or\n2. Request necessary RBAC permissions from your cluster administrator." -}}
66-
{{- end -}}
67-
{{- end -}}
68-
{{- end -}}
35+
{{- end }}

0 commit comments

Comments
 (0)