-
Notifications
You must be signed in to change notification settings - Fork 195
SLO Aware Routing Sidecar + Plugin EPP Integration and Helm Deployment #1839
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
47f33a8
8c77f1f
8ca0214
b337ade
0ae94df
2e220d7
729c53b
7b59026
07f7ae1
678f608
ddee4c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ package runner | |
| import ( | ||
| "context" | ||
| "crypto/tls" | ||
| "encoding/json" | ||
| "errors" | ||
| "flag" | ||
| "fmt" | ||
|
|
@@ -69,13 +70,15 @@ import ( | |
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer" | ||
| testfilter "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/test/filter" | ||
| runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" | ||
| "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" | ||
| latencypredictor "sigs.k8s.io/gateway-api-inference-extension/sidecars/latencypredictorasync" | ||
| "sigs.k8s.io/gateway-api-inference-extension/version" | ||
| ) | ||
|
|
||
|
|
@@ -126,6 +129,7 @@ var ( | |
| "then a self-signed certificate is used.") | ||
| // metric flags | ||
| totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.") | ||
| totalRunningRequestsMetric = flag.String("total-running-requests-metric", runserver.DefaultTotalRunningRequestsMetric, "Prometheus metric for the number of running requests.") | ||
| kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") | ||
| // LoRA metrics | ||
| loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") | ||
|
|
@@ -145,7 +149,10 @@ var ( | |
| modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods") | ||
| modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", true, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to true)") | ||
| haEnableLeaderElection = flag.Bool("ha-enable-leader-election", false, "Enables leader election for high availability. When enabled, readiness probes will only pass on the leader.") | ||
| tracing = flag.Bool("tracing", true, "Enables emitting traces") | ||
|
|
||
| // Latency Predictor Flag | ||
| enableLatencyPredictor = flag.Bool("enable-latency-predictor", false, "Enable the regression-based latency predictor and scheduler scorer.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need an explicit flag? Isn't the predictor a plugin? If so, the plugins configuration should take care of enablement. |
||
| tracing = flag.Bool("tracing", true, "Enables emitting traces") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: move
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
|
||
| setupLog = ctrl.Log.WithName("setup") | ||
| ) | ||
|
|
@@ -227,7 +234,20 @@ func (r *Runner) Run(ctx context.Context) error { | |
| return err | ||
| } | ||
|
|
||
| rawConfig, err := r.parseConfigurationPhaseOne(ctx) | ||
| // =================================================================== | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. All of this initialization should be part of the predictor plugin initialization itself. We should not have any predictor-specific logic here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed, will move this |
||
| // == Latency Predictor Integration | ||
| // =================================================================== | ||
| var predictor latencypredictor.PredictorInterface // Use the interface type | ||
| if *enableLatencyPredictor { | ||
| setupLog.Info("Latency predictor is enabled. Initializing...") | ||
| predictor = latencypredictor.New(latencypredictor.ConfigFromEnv(), ctrl.Log.WithName("latency-predictor")) | ||
| } else { | ||
| setupLog.Info("Latency predictor is disabled.") | ||
| predictor = nil // This will be a true nil interface | ||
| } | ||
| // =================================================================== | ||
|
|
||
| rawConfig, err := r.parseConfigurationPhaseOne(ctx, predictor) | ||
| if err != nil { | ||
| setupLog.Error(err, "Failed to parse configuration") | ||
| return err | ||
|
|
@@ -366,6 +386,7 @@ func (r *Runner) Run(ctx context.Context) error { | |
| Director: director, | ||
| SaturationDetector: saturationDetector, | ||
| UseExperimentalDatalayerV2: r.featureGates[datalayer.FeatureGate], // pluggable data layer feature flag | ||
| LatencyPredictor: predictor, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this is used anywhere; it could be an artifact from before everything was transitioned to the plugin format?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah it's not, will remove |
||
| } | ||
| if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { | ||
| setupLog.Error(err, "Failed to setup EPP controllers") | ||
|
|
@@ -383,6 +404,12 @@ func (r *Runner) Run(ctx context.Context) error { | |
| return err | ||
| } | ||
|
|
||
| if *enableLatencyPredictor && predictor != nil { | ||
| if err := registerLatencyPredictorServer(mgr, predictor); err != nil { | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| // --- Start Manager --- | ||
| // This blocks until a signal is received. | ||
| setupLog.Info("Controller manager starting") | ||
|
|
@@ -432,7 +459,14 @@ func (r *Runner) registerInTreePlugins() { | |
| plugins.Register(testresponsereceived.DestinationEndpointServedVerifierType, testresponsereceived.DestinationEndpointServedVerifierFactory) | ||
| } | ||
|
|
||
| func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.EndpointPickerConfig, error) { | ||
| func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface) { | ||
| plugins.Register(slo_aware_router.SLOAwareRouterPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { | ||
| return slo_aware_router.NewSLOAwareRouter(predictor, slo_aware_router.HeadroomSelectionStrategy).WithName(name), nil | ||
| }) | ||
| plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory) | ||
| } | ||
|
|
||
| func (r *Runner) parseConfigurationPhaseOne(ctx context.Context, predictor latencypredictor.PredictorInterface) (*configapi.EndpointPickerConfig, error) { | ||
| if *configText == "" && *configFile == "" { | ||
| return nil, nil // configuring through code, not through file | ||
| } | ||
|
|
@@ -454,6 +488,12 @@ func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.End | |
| loader.RegisterFeatureGate(flowcontrol.FeatureGate) | ||
|
|
||
| r.registerInTreePlugins() | ||
| // If we have a latency predictor enabled and predictor and datastore are not nil, | ||
| // register the latency predictor plugins (currently just the SLO scorer). | ||
| if *enableLatencyPredictor && predictor != nil { | ||
| setupLog.Info("Registering latency predictor plugins") | ||
| r.registerLatencyPredictorPlugins(predictor) | ||
| } | ||
|
|
||
| rawConfig, featureGates, err := loader.LoadConfigPhaseOne(configBytes, logger) | ||
| if err != nil { | ||
|
|
@@ -538,6 +578,7 @@ func (r *Runner) setupMetricsCollection(setupLog logr.Logger, useExperimentalDat | |
| func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) { | ||
| mapping, err := backendmetrics.NewMetricMapping( | ||
| *totalQueuedRequestsMetric, | ||
| *totalRunningRequestsMetric, | ||
| *kvCacheUsagePercentageMetric, | ||
| *loraInfoMetric, | ||
| *cacheInfoMetric, | ||
|
|
@@ -586,6 +627,7 @@ func setupDatalayer(logger logr.Logger) (datalayer.EndpointFactory, error) { | |
| *modelServerMetricsHttpsInsecureSkipVerify, | ||
| nil) | ||
| extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric, | ||
| *totalRunningRequestsMetric, | ||
| *kvCacheUsagePercentageMetric, | ||
| *loraInfoMetric, *cacheInfoMetric) | ||
|
|
||
|
|
@@ -653,6 +695,18 @@ func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore. | |
| return nil | ||
| } | ||
|
|
||
| // registerLatencyPredictorServer adds the Latency Predictor server as a Runnable to the given manager. | ||
| func registerLatencyPredictorServer(mgr manager.Manager, predictor latencypredictor.PredictorInterface) error { | ||
| // For the runnable, you'll need to type assert back to the concrete type | ||
| concretePredictor := predictor.(*latencypredictor.Predictor) | ||
| if err := mgr.Add(runnable.NoLeaderElection(&predictorRunnable{predictor: concretePredictor})); err != nil { | ||
| setupLog.Error(err, "Failed to register latency predictor runnable") | ||
| return err | ||
| } | ||
| setupLog.Info("Latency predictor runnable added to manager.") | ||
| return nil | ||
| } | ||
|
|
||
| func validateFlags() error { | ||
| if (*poolName != "" && *endpointSelector != "") || (*poolName == "" && *endpointSelector == "") { | ||
| return errors.New("either pool-name or endpoint-selector must be set") | ||
|
|
@@ -799,3 +853,25 @@ func resolvePoolNamespace(poolNamespace string) string { | |
| } | ||
| return runserver.DefaultPoolNamespace | ||
| } | ||
|
|
||
| // =================================================================== | ||
| // == Latency Predictor Plugin and Helpers | ||
| // =================================================================== | ||
|
|
||
| // predictorRunnable implements controller-runtime's Runnable interface to manage the predictor's lifecycle. | ||
| type predictorRunnable struct { | ||
| predictor *latencypredictor.Predictor | ||
| } | ||
|
|
||
| func (p *predictorRunnable) Start(ctx context.Context) error { | ||
| setupLog.Info("Starting latency predictor...") | ||
| if err := p.predictor.Start(ctx); err != nil { | ||
| setupLog.Error(err, "Failed to start latency predictor") | ||
| return err | ||
| } | ||
| setupLog.Info("Latency predictor started.") | ||
| <-ctx.Done() | ||
| setupLog.Info("Stopping latency predictor...") | ||
| p.predictor.Stop() | ||
| return nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -121,6 +121,27 @@ $ helm install triton-llama3-8b-instruct \ | |
| oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 | ||
| ``` | ||
|
|
||
| ### Install with SLO-Aware Routing | ||
|
|
||
| For full details see the dedicated [SLO-Aware Routing Guide](../../../site-src/guides/slo-aware-routing.md) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is gonna be a bad link, site-src is the source of truth for our static website: https://gateway-api-inference-extension.sigs.k8s.io/ Your PR comes with a preview: https://deploy-preview-1839--gateway-api-inference-extension.netlify.app/ so you should be able to validate the correct URL pathing that way |
||
|
|
||
| #### SLO-Aware Router Environment Variables | ||
|
|
||
| The behavior of the SLO-aware router can be fine-tuned using the following environment variables in the Endpoint Picker deployment. These can be set under `inferenceExtension.env` in your `values.yaml` file. | ||
|
|
||
| | Environment Variable | Description | Default | | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are those set as env vars instead of a plugin specific configuration parameters? |
||
| | -------------------------------- | ------------------------------------------------------------------------------------------------------- | ----------- | | ||
| | `SAMPLING_MEAN` | The sampling mean (lambda) for the Poisson distribution of token sampling. | `100.0` | | ||
| | `MAX_SAMPLED_TOKENS` | The maximum number of tokens to sample for TPOT prediction. | `20` | | ||
| | `SLO_BUFFER_FACTOR` | A buffer to apply to the SLO to make it more or less strict. | `1.0` | | ||
| | `NEG_HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has negative headroom. | `0.8` | | ||
| | `NEG_HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has negative headroom. | `0.2` | | ||
| | `HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has positive headroom. | `0.8` | | ||
| | `HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has positive headroom. | `0.2` | | ||
| | `HEADROOM_SELECTION_STRATEGY` | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` | | ||
|
|
||
| **Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more. | ||
|
|
||
| ### Install with High Availability (HA) | ||
|
|
||
| To deploy the EndpointPicker in a high-availability (HA) active-passive configuration set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,28 @@ data: | |
| - type: queue-scorer | ||
| - type: kv-cache-utilization-scorer | ||
| - type: prefix-cache-scorer | ||
| {{- if .Values.inferenceExtension.latencyPredictor.enabled }} | ||
| - type: slo-aware-routing | ||
| - type: slo-aware-profile-handler | ||
| - type: max-score-picker | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this plugin is added by default, please remove it |
||
| {{- end }} | ||
| schedulingProfiles: | ||
| {{- if .Values.inferenceExtension.latencyPredictor.enabled }} | ||
| - name: prefix | ||
| plugins: | ||
| - pluginRef: prefix-cache-scorer | ||
| - name: default | ||
| plugins: | ||
| - pluginRef: slo-aware-routing | ||
| weight: 0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we adding this plugin if the weight is 0?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need this to track requests and gather latency data, but we don't want to use it for routing.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a design smell |
||
| - pluginRef: queue-scorer | ||
| - pluginRef: kv-cache-utilization-scorer | ||
| - pluginRef: max-score-picker | ||
| - name: slo | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Didn't we discuss that this will be called predicted-latency, not SLO? |
||
| plugins: | ||
| - pluginRef: slo-aware-routing | ||
| - pluginRef: max-score-picker | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto, this is added by default, no need to explicitly set it |
||
| {{- else }} | ||
| - name: default | ||
| plugins: | ||
| - pluginRef: queue-scorer | ||
|
|
@@ -20,10 +41,10 @@ data: | |
| weight: 2 | ||
| - pluginRef: prefix-cache-scorer | ||
| weight: 3 | ||
| {{- end }} | ||
| {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} | ||
| {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} | ||
| {{- end }} | ||
|
|
||
| --- | ||
| {{- if .Values.inferenceExtension.sidecar.enabled }} | ||
| apiVersion: v1 | ||
|
|
@@ -34,3 +55,25 @@ metadata: | |
| data: | ||
| {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} | ||
| {{- end }} | ||
| --- | ||
| {{- if .Values.inferenceExtension.latencyPredictor.enabled }} | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training | ||
| namespace: {{ .Release.Namespace }} | ||
| data: | ||
| {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} | ||
| {{ $key }}: {{ $value | quote }} | ||
| {{- end }} | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-prediction | ||
| namespace: {{ .Release.Namespace }} | ||
| data: | ||
| {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }} | ||
| {{ $key }}: {{ $value | quote }} | ||
| {{- end }} | ||
| {{- end }} | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this should be called `sidecars` if it's just a Go interface for the predictor API.