fastmachinelearning · kondratyevd · Mar 13, 2025 · Mar 13, 2025 · May 23, 2025 · May 30, 2025
diff --git a/docs/.values-table.md b/docs/.values-table.md
@@ -34,9 +34,12 @@
 | envoy.rate_limiter.listener_level.max_tokens | int | `5` | Maximum number of simultaneous connections to the Envoy Proxy. Each new connection takes a "token" from the "bucket" which initially contains ``max_tokens`` tokens. |
 | envoy.rate_limiter.listener_level.tokens_per_fill | int | `1` | ``tokens_per_fill`` tokens are added to the "bucket" every ``fill_interval``, allowing new connections to be established. |
 | envoy.rate_limiter.listener_level.fill_interval | string | `"12s"` | For example, adding a new token every 12 seconds allows 5 new connections every minute. |
-| envoy.rate_limiter.prometheus_based | object | `{"enabled":false,"luaConfig":"cfg/envoy-filter.lua"}` | This rate limiter rejects new connections based on metric extracted from Prometheus (e.g. inference queue latency). The metric is taken from parameter ``prometheus.serverLoadMetric``, and the threshold is set by ``prometheus.serverLoadThreshold``. These parameters are the same as those used by the KEDA autoscaler. |
+| envoy.rate_limiter.prometheus_based | object | `{"enabled":false}` | This rate limiter rejects new connections based on metric extracted from Prometheus (e.g. inference queue latency). The metric is taken from parameter ``prometheus.serverLoadMetric``, and the threshold is set by ``prometheus.serverLoadThreshold``. These parameters are the same as those used by the KEDA autoscaler. |
 | envoy.rate_limiter.prometheus_based.enabled | bool | `false` | Enable rate limiter |
 | envoy.loadBalancerPolicy | string | `"LEAST_REQUEST"` | Envoy load balancer policy. Options: ROUND_ROBIN, LEAST_REQUEST, RING_HASH, RANDOM, MAGLEV |
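+| envoy.lua_filter.enabled | bool | `false` | Enable the Envoy Lua HTTP filter; creates the Lua ConfigMap and mounts it into the Envoy pod. |
+| envoy.lua_filter.lua_config | string | `"cfg/envoy-filter.lua"` | Path, inside the chart, of the Lua script that is loaded into the Lua filter ConfigMap. |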
+| envoy.dynamic_routing | object | `{"enabled":false}` | Enable dynamic routing in Envoy proxy. |
 | envoy.auth.enabled | bool | `false` | Enable authentication in Envoy proxy |
 | envoy.auth.jwt_issuer | string | `""` |  |
 | envoy.auth.jwt_remote_jwks_uri | string | `""` |  |

diff --git a/helm/supersonic/cfg/envoy-filter-dynamic.lua b/helm/supersonic/cfg/envoy-filter-dynamic.lua
@@ -0,0 +1,64 @@
+--- Envoy request hook: pick the upstream Triton service for each gRPC call.
+-- A client-supplied "route-to" header is always dropped first, since that
+-- header is what the dynamic forward proxy route uses as the upstream host.
+-- ModelInfer calls are routed to the per-model service named after the model
+-- name/version in the request body; other gRPC calls use the default service.
+-- @param request_handle Envoy Lua stream handle for the current request.
+function envoy_on_request(request_handle)
+    local path = request_handle:headers():get(":path")
+    local content_type = request_handle:headers():get("content-type")
+    request_handle:headers():remove("route-to")
+    if content_type ~= "application/grpc" then
+        return
+    end
+
+    local route = "RELEASE-triton.NAMESPACE.svc.cluster.local:8001"
+    if path == "/inference.GRPCInferenceService/ModelInfer" then
+        local model_name, model_version = extract_model_name_and_version(request_handle)
+        -- "" is truthy in Lua; require non-empty, hostname-safe values instead
+        if model_name:find("^[%w_%-]+$") and model_version:find("^[%w_%-]+$") then
+            route = "RELEASE-" .. model_name .. "-v" .. model_version .. ".NAMESPACE.svc.cluster.local:8001"
+            request_handle:logInfo("route-to = " .. route)
+        else
+            request_handle:logWarn("model name/version missing or invalid; using default service")
+        end
+    end
+    request_handle:headers():replace("route-to", route)
+end
+
+--- Read model_name (field 1) and model_version (field 2) from the buffered
+-- body of a gRPC ModelInfer request; only these two leading fields are decoded.
+-- @param request_handle Envoy Lua stream handle for the current request.
+-- @return model_name, model_version; each is "" when it could not be read.
+function extract_model_name_and_version(request_handle)
+    -- Decode one length-delimited field with the given tag at 1-based index pos.
+    -- Returns value, next_pos; nil when the tag differs or the data is truncated.
+    local function read_field(msg, pos, tag)
+        if msg:byte(pos) ~= tag then return nil end
+        local len, shift = 0, 1
+        repeat -- the length is a varint: 7 bits per byte, high bit = "more"
+            pos = pos + 1
+            local b = msg:byte(pos)
+            if not b then return nil end
+            len, shift = len + (b % 128) * shift, shift * 128
+        until b < 128
+        if pos + len > #msg then return nil end
+        return msg:sub(pos + 1, pos + len), pos + len + 1
+    end
+    local buffer = request_handle:body() -- guard: may be nil if there is no body
+    local body = buffer and buffer:getBytes(0, buffer:length())
+    if not body or #body <= 5 then return "", "" end
+    -- strip the 5-byte gRPC header (1-byte flag + 4-byte msg-len)
+    local msg = body:sub(6)
+    local model_name, next_pos = read_field(msg, 1, 0x0A)
+    if not model_name then
+        request_handle:logErr("Unexpected protobuf tag or truncated field: " .. string.format("0x%02X", msg:byte(1)))
+        return "", ""
+    end
+    local model_version = read_field(msg, next_pos, 0x12)
+    if not model_version then
+        request_handle:logWarn(string.format("No model_version field (expected tag 0x12 at offset %d, got 0x%02X)",
+            next_pos, msg:byte(next_pos) or 0))
+    end
+    return model_name, model_version or ""
+end
diff --git a/helm/supersonic/templates/envoy/configmaps.yaml b/helm/supersonic/templates/envoy/configmaps.yaml
@@ -57,11 +57,22 @@ static_resources:
                       routes:
                         - match:
                             prefix: "/"
+                          {{- if .envoy.dynamic_routing.enabled }}
+                          typed_per_filter_config:
+                            envoy.filters.http.dynamic_forward_proxy:
+                              "@type": type.googleapis.com/envoy.extensions.filters.http.dynamic_forward_proxy.v3.PerRouteConfig
+                              host_rewrite_header: "route-to"
+                          route:
+                            cluster: dynamic_forward_proxy_cluster
+                            timeout: {{ .envoy.grpc_route_timeout }}
+                          {{- else }}
                           route:
                             cluster: triton_grpc_service
                             timeout: {{ .envoy.grpc_route_timeout }}
+                          {{- end }}
+
                 http_filters:
-                  {{- with .envoy.rate_limiter.prometheus_based }}
+                  {{- with .envoy.lua_filter }}
                   {{- if .enabled }}
                   - name: envoy.filters.http.lua
                     typed_config:
@@ -94,6 +105,16 @@ static_resources:
                           provider_name: provider_icecube
                   {{- end }}
                   {{- end }}
+                  {{- if .envoy.dynamic_routing.enabled }}
+                  - name: envoy.filters.http.dynamic_forward_proxy  # rendered only with dynamic routing; listed before the router filter
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.dynamic_forward_proxy.v3.FilterConfig
+                      dns_cache_config:  # same cache name and settings as dynamic_forward_proxy_cluster
+                        name: dynamic_cache
+                        dns_lookup_family: ALL
+                        dns_cache_circuit_breaker:
+                          max_pending_requests: 1024
+                  {{- end }}
                   - name: envoy.filters.http.router
                     typed_config:
                       "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@@ -176,6 +197,22 @@ static_resources:
                     socket_address:
                       address: {{ .tritonName }}
                       port_value: {{ .tritonGrpcPort }}
+    {{- if .envoy.dynamic_routing.enabled }}
+    - name: dynamic_forward_proxy_cluster  # cluster used by the "/" route when dynamic routing is enabled
+      connect_timeout: 2s
+      lb_policy: CLUSTER_PROVIDED
+      http2_protocol_options:  # talk HTTP/2 to the upstream
+        max_concurrent_streams: 1000
+      cluster_type:
+        name: envoy.clusters.dynamic_forward_proxy
+        typed_config:
+          "@type": type.googleapis.com/envoy.extensions.clusters.dynamic_forward_proxy.v3.ClusterConfig
+          dns_cache_config:  # same cache name and settings as the dynamic_forward_proxy HTTP filter
+            name: dynamic_cache
+            dns_lookup_family: ALL
+            dns_cache_circuit_breaker:
+              max_pending_requests: 1024
+    {{- end }}
 {{- end }}
 {{- end }}
 
@@ -226,7 +263,7 @@ data:
 {{ include "envoy.configuration.yaml" $envoyContext | indent 4 }}
 ---
 
-{{- if .Values.envoy.rate_limiter.prometheus_based.enabled }}
+{{- if .Values.envoy.lua_filter.enabled }}
 {{- /* Create a ConfigMap for the Lua filter */}}
 apiVersion: v1
 kind: ConfigMap
@@ -239,12 +276,14 @@ metadata:
 data:
   envoy-filter.lua: |-
     {{- /* Read and process the Lua configuration file */}}
-    {{- $luaConfig := $.Files.Get .Values.envoy.rate_limiter.prometheus_based.luaConfig | nindent 4 }}
+    {{- $luaConfig := $.Files.Get .Values.envoy.lua_filter.lua_config | nindent 4 }}
     {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_METRIC" (include "supersonic.defaultMetric" . | quote) }}
     {{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.serverLoadThreshold) }}
     {{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" (include "supersonic.prometheusScheme" .) }}
     {{- $luaConfig = $luaConfig | replace "PROMETHEUS_HOST" (include "supersonic.prometheusHost" .) }}
     {{- $luaConfig = $luaConfig | replace "PROMETHEUS_PORT" (include "supersonic.prometheusPort" .) }}
+    {{- $luaConfig = $luaConfig | replace "RELEASE" .Release.Name }}
+    {{- $luaConfig = $luaConfig | replace "NAMESPACE" .Release.Namespace }}
     {{ $luaConfig | indent 4 }}
 
 ---

diff --git a/helm/supersonic/templates/envoy/deployment.yaml b/helm/supersonic/templates/envoy/deployment.yaml
@@ -47,7 +47,7 @@ spec:
         volumeMounts:
         - name: {{ include "supersonic.name" . }}-envoy-config
           mountPath: /etc/envoy
-        {{- if .Values.envoy.rate_limiter.prometheus_based.enabled }}
+        {{- if .Values.envoy.lua_filter.enabled }}
         - name: {{ include "supersonic.name" . }}-lua-volume
           mountPath: /etc/envoy/lua
           readOnly: true
@@ -58,7 +58,7 @@ spec:
       - name: {{ include "supersonic.name" . }}-envoy-config
         configMap:
           name: {{ include "supersonic.name" . }}-envoy-config
-      {{- if .Values.envoy.rate_limiter.prometheus_based.enabled }}
+      {{- if .Values.envoy.lua_filter.enabled }}
       - name: {{ include "supersonic.name" . }}-lua-volume
         configMap:
           name: {{ include "supersonic.name" . }}-lua-config

diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json
@@ -370,14 +370,10 @@
               "properties": {
                 "enabled": {
                   "type": "boolean"
-                },
-                "luaConfig": {
-                  "type": "string"
                 }
               },
               "required": [
-                "enabled",
-                "luaConfig"
+                "enabled"
               ]
             }
           },
@@ -389,6 +385,32 @@
         "loadBalancerPolicy": {
           "type": "string"
         },
+        "lua_filter": {
+          "type": "object",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            },
+            "lua_config": {
+              "type": "string"
+            }
+          },
+          "required": [
+            "enabled",
+            "lua_config"
+          ]
+        },
+        "dynamic_routing": {
+          "type": "object",
+          "properties": {
+            "enabled": {
+              "type": "boolean"
+            }
+          },
+          "required": [
+            "enabled"
+          ]
+        },
         "auth": {
           "type": "object",
           "properties": {
@@ -424,11 +446,13 @@
       "required": [
         "args",
         "auth",
+        "dynamic_routing",
         "enabled",
         "grpc_route_timeout",
         "image",
         "ingress",
         "loadBalancerPolicy",
+        "lua_filter",
         "rate_limiter",
         "replicas",
         "resources",

diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml
@@ -156,12 +156,19 @@ envoy:
     prometheus_based:
       # -- Enable rate limiter
       enabled: false
-      luaConfig: "cfg/envoy-filter.lua" 
 
   # -- Envoy load balancer policy.
   # Options: ROUND_ROBIN, LEAST_REQUEST, RING_HASH, RANDOM, MAGLEV
   loadBalancerPolicy: "LEAST_REQUEST"
 
+  lua_filter:
+    enabled: false
+    lua_config: "cfg/envoy-filter.lua"
+
+  # -- Enable dynamic routing in Envoy proxy.
+  dynamic_routing:
+    enabled: false
+
   auth:
     # -- Enable authentication in Envoy proxy
     enabled: false

diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml
@@ -11,10 +11,12 @@ triton:
       --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
       --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
       --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
+      --model-control-mode=explicit \
       --allow-gpu-metrics=true \
       --log-verbose=0 \
       --strict-model-config=false \
       --exit-timeout-secs=60
+
   resources:
     limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
     requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
@@ -37,6 +39,14 @@ envoy:
     enabled: true
     hostName: sonic-cms.geddes.rcac.purdue.edu
     ingressClassName: public
+  rate_limiter:
+    prometheus_based:
+      enabled: false
+  dynamic_routing:
+    enabled: true
+  lua_filter:
+    enabled: true
+    lua_config: "cfg/envoy-filter-dynamic.lua"
 
 autoscaler:
   enabled: true
@@ -55,15 +65,20 @@ tolerations:
     effect: NoSchedule
 
 prometheus:
-  enabled: true
-  server:
-    ingress:
-      enabled: true
-      hostName: prometheus-cms.geddes.rcac.purdue.edu
-      ingressClassName: public
+  # enabled: false
+  external:
+    enabled: true
+    url: "prometheus-cms.geddes.rcac.purdue.edu"
+    port: 443
+    scheme: https
+  # server:
+  #   ingress:
+  #     enabled: true
+  #     hostName: prometheus-cms.geddes.rcac.purdue.edu
+  #     ingressClassName: public
 
 grafana:
-  enabled: true
+  enabled: false
   ingress:
     enabled: true
     hostName: grafana-cms.geddes.rcac.purdue.edu