Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,7 @@ spec:
- prometheus
- influxdb
- datadog
- externalmetrics
- stackdriver
- cloudwatch
- newrelic
Expand Down
1 change: 1 addition & 0 deletions charts/flagger/crds/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,7 @@ spec:
- prometheus
- influxdb
- datadog
- externalmetrics
- stackdriver
- cloudwatch
- newrelic
Expand Down
8 changes: 8 additions & 0 deletions charts/flagger/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,14 @@ rules:
- revisions
verbs:
- get
- apiGroups:
- external.metrics.k8s.io
resources:
- '*'
verbs:
- get
- watch
- list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down
50 changes: 50 additions & 0 deletions docs/gitbook/usage/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,22 @@ Reference the template in the canary analysis:
interval: 1m
```

### Datadog Rate Limits

In larger setups, you might run into rate limits on the Datadog API. To avoid this,
you can use the Datadog Cluster Agent to retrieve metrics in batches instead. The Cluster Agent
then exposes these metrics through an external metrics server.

See [Datadog Documentation](https://docs.datadoghq.com/containers/guide/cluster_agent_autoscaling_metrics).

Once you have enabled Datadog's external metrics endpoint and `DatadogMetric` CRD (without
necessarily using `registerAPIService`), you can use Flagger's
[External Metrics Provider](#kubernetes-external-metrics) to query the metrics from there.

The server address is usually `datadog-cluster-agent-metrics-server`, and it is exposed on port 8443.
External metrics are named `datadogmetric@<namespace>:<metricname>`, for example
`datadogmetric@istio-system:istio-mesh-request-count`.

## Amazon CloudWatch

You can create custom metric checks using the CloudWatch metrics provider.
Expand Down Expand Up @@ -781,3 +797,37 @@ Reference the template in the canary analysis:
max: 99
interval: 1m
```

## Kubernetes External Metrics

You can query an external metrics provider that implements the
[Kubernetes External Metrics API](https://kubernetes.io/docs/reference/external-api/external-metrics.v1beta1/).

By default, Flagger authenticates with the token of its bound Service Account. *Optionally*, you can provide a Bearer token through a Secret, which must contain a field named `token`:

```yaml
apiVersion: v1
kind: Secret
metadata:
name: external-metric-server-token
namespace: default
stringData:
token: your-access-token
```

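The `query` field is appended to `<address>/apis/external.metrics.k8s.io/v1beta1/namespaces/`,
so it must have the form `<namespace>/<metric-name>`, optionally followed by a URL-encoded
`labelSelector` query parameter.

External Metrics template example: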

```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: my-external-metric
namespace: default
spec:
provider:
type: externalmetrics
address: https://external-metrics-server.default.svc.cluster.local:8443
secretRef: # Optional
name: external-metric-server-token
query: webapp-frontend/job-success-rate?labelSelector=env%3Dproduction
```
9 changes: 5 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ require (
google.golang.org/grpc v1.76.0
google.golang.org/protobuf v1.36.10
gopkg.in/h2non/gock.v1 v1.1.2
k8s.io/api v0.34.1
k8s.io/apimachinery v0.34.1
k8s.io/client-go v0.34.1
k8s.io/code-generator v0.34.1
k8s.io/api v0.34.2
k8s.io/apimachinery v0.34.2
k8s.io/client-go v0.34.2
k8s.io/code-generator v0.34.2
k8s.io/klog/v2 v2.130.1
k8s.io/metrics v0.34.2
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397
knative.dev/serving v0.46.6
)
Expand Down
18 changes: 10 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -275,20 +275,22 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM=
k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk=
k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4=
k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY=
k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8=
k8s.io/code-generator v0.34.1 h1:WpphT26E+j7tEgIUfFr5WfbJrktCGzB3JoJH9149xYc=
k8s.io/code-generator v0.34.1/go.mod h1:DeWjekbDnJWRwpw3s0Jat87c+e0TgkxoR4ar608yqvg=
k8s.io/api v0.34.2 h1:fsSUNZhV+bnL6Aqrp6O7lMTy6o5x2C4XLjnh//8SLYY=
k8s.io/api v0.34.2/go.mod h1:MMBPaWlED2a8w4RSeanD76f7opUoypY8TFYkSM+3XHw=
k8s.io/apimachinery v0.34.2 h1:zQ12Uk3eMHPxrsbUJgNF8bTauTVR2WgqJsTmwTE/NW4=
k8s.io/apimachinery v0.34.2/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/client-go v0.34.2 h1:Co6XiknN+uUZqiddlfAjT68184/37PS4QAzYvQvDR8M=
k8s.io/client-go v0.34.2/go.mod h1:2VYDl1XXJsdcAxw7BenFslRQX28Dxz91U9MWKjX97fE=
k8s.io/code-generator v0.34.2 h1:9bG6jTxmsU3HXE5BNYJTC8AZ1D6hVVfkm8yYSkdkGY0=
k8s.io/code-generator v0.34.2/go.mod h1:dnDDEd6S/z4uZ+PG1aE58ySCi/lR4+qT3a4DddE4/2I=
k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f h1:SLb+kxmzfA87x4E4brQzB33VBbT2+x7Zq9ROIHmGn9Q=
k8s.io/gengo/v2 v2.0.0-20250604051438-85fd79dbfd9f/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA=
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/metrics v0.34.2 h1:zao91FNDVPRGIiHLO2vqqe21zZVPien1goyzn0hsz90=
k8s.io/metrics v0.34.2/go.mod h1:Ydulln+8uZZctUM8yrUQX4rfq/Ay6UzsuXf24QJ37Vc=
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y=
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
knative.dev/networking v0.0.0-20250902160145-7dad473f6351 h1:Gv/UqbN0AK+ORoT5e2Kg+3+uMW/y9CCdhpXKxYaVV6k=
Expand Down
1 change: 1 addition & 0 deletions kustomize/base/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,7 @@ spec:
- prometheus
- influxdb
- datadog
- externalmetrics
- stackdriver
- cloudwatch
- newrelic
Expand Down
155 changes: 155 additions & 0 deletions pkg/metrics/providers/externalmetrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
Copyright 2020 The Flux authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package providers

import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"net/url"
"os"
"time"

flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1"
"k8s.io/metrics/pkg/apis/external_metrics"
)

const (
// metricServiceEndpointPath is the External Metrics API group/version path
// that is appended to the provider address to form the service endpoint.
metricServiceEndpointPath = "/apis/external.metrics.k8s.io/v1beta1"
// namespacesPath is placed between the service endpoint and the query
// when the request URL is built.
namespacesPath = "/namespaces/"

// authorizationHeaderKey is the HTTP header that carries the bearer token.
authorizationHeaderKey = "Authorization"
// applicationBearerToken is the key looked up in the credentials map to
// find a user-supplied bearer token.
applicationBearerToken = "token"
)

// ExternalMetricsProvider fetches metric values from a server that implements
// the Kubernetes External Metrics API (external.metrics.k8s.io/v1beta1).
type ExternalMetricsProvider struct {
metricServiceEndpoint string
bearerToken string

timeout time.Duration
client *http.Client
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we use an ExternalMetricsClient object created by to fetch the external_metrics.ExternalMetricValueList? we can create one using the NewForConfig function in this package. it takes care of loading the service account token automatically and provides a nice interface to fetch the metrics?

Copy link

@mveroone mveroone Jan 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to expose all the attributes of a Rest client so overloading of the host (as we don't want to default to the cluster's API server in case the provider hasn't registered)
We'll have to give it a try, stay tuned.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've got a version that compiles but I'm running it against my colleague's review and we'll probably want to test it after that.

Having a busy schedule for the next 3 weeks (talking about Flagger at Cloud Native Days Paris ^^), likely don't expect news until February.

}

// NewExternalMetricsProvider builds an ExternalMetricsProvider that queries
// the external metrics server located at provider.Address.
//
// metricInterval is accepted but not used by this constructor.
//
// When provider.InsecureSkipVerify is set, the returned provider uses a
// dedicated HTTP client whose transport skips TLS certificate verification;
// otherwise http.DefaultClient is used.
//
// The bearer token is taken from credentials["token"] when that key exists.
// If it does not, the pod's service account token is read from its mounted
// volume, and an error is returned when that file is unreadable or empty.
// An error is also returned when provider.Address is empty.
func NewExternalMetricsProvider(metricInterval string,
	provider flaggerv1.MetricTemplateProvider,
	credentials map[string][]byte) (*ExternalMetricsProvider, error) {

	if provider.Address == "" {
		return nil, fmt.Errorf("the Url of the external metric service must be provided")
	}

	endpoint := fmt.Sprintf("%s%s", provider.Address, metricServiceEndpointPath)

	// Start from the shared default client and only build a private one when
	// certificate verification has to be disabled.
	httpClient := http.DefaultClient
	if provider.InsecureSkipVerify {
		transport := http.DefaultTransport.(*http.Transport).Clone()
		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
		httpClient = &http.Client{Transport: transport}
	}

	token, provided := credentials[applicationBearerToken]
	if !provided {
		// No explicit token: fall back to the service account token that is
		// mounted into the pod.
		saToken, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
		if err != nil {
			return nil, fmt.Errorf("error reading service account token: %w", err)
		}
		if len(saToken) == 0 {
			return nil, fmt.Errorf("pod's service account token is empty")
		}
		token = saToken
	}

	return &ExternalMetricsProvider{
		metricServiceEndpoint: endpoint,
		bearerToken:           string(token),
		timeout:               5 * time.Second,
		client:                httpClient,
	}, nil
}

// RunQuery fetches the external metric identified by query from the metrics
// server and returns the value of the first item of the response as a float64.
//
// query is appended verbatim after "<endpoint>/namespaces/", so it is expected
// to have the form "<namespace>/<metric-name>", optionally followed by a query
// string such as "?labelSelector=...".
//
// An error is returned when the request cannot be built or sent, when the
// response body cannot be read, when the server answers with a status other
// than 200 OK, when the body cannot be decoded as an ExternalMetricValueList,
// or, wrapping ErrNoValuesFound, when the decoded list contains no items.
func (p *ExternalMetricsProvider) RunQuery(query string) (float64, error) {
	u := fmt.Sprintf("%s%s%s", p.metricServiceEndpoint, namespacesPath, query)

	// The context bounds the whole exchange, including reading the body.
	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return 0, fmt.Errorf("error http.NewRequest: %w", err)
	}
	if p.bearerToken != "" {
		req.Header.Add(authorizationHeaderKey, fmt.Sprintf("Bearer %s", p.bearerToken))
	}

	r, err := p.client.Do(req)
	if err != nil {
		return 0, fmt.Errorf("request failed: %w", err)
	}
	defer r.Body.Close()

	b, err := io.ReadAll(r.Body)
	if err != nil {
		return 0, fmt.Errorf("error reading body: %w", err)
	}

	if r.StatusCode != http.StatusOK {
		// err is nil at this point, so there is nothing to wrap; report the
		// HTTP status and the body returned by the server instead.
		return 0, fmt.Errorf("error response: %s: %s", r.Status, string(b))
	}

	var res external_metrics.ExternalMetricValueList
	if err := json.Unmarshal(b, &res); err != nil {
		return 0, fmt.Errorf("error unmarshaling result: %w, '%s'", err, string(b))
	}

	if len(res.Items) < 1 {
		return 0, fmt.Errorf("invalid response: %s: %w", string(b), ErrNoValuesFound)
	}

	// Only the first returned value is used.
	return res.Items[0].Value.AsApproximateFloat64(), nil
}

// IsOnline reports whether a TCP connection to the metrics server can be
// established within the provider timeout. Only reachability is checked,
// given that external metric servers don't have a standard health check
// endpoint defined.
//
// It returns (true, nil) when the connection succeeds, and (false, err) when
// the endpoint URL cannot be parsed or the connection attempt fails.
func (p *ExternalMetricsProvider) IsOnline() (bool, error) {
	u, err := url.Parse(p.metricServiceEndpoint)
	if err != nil {
		return false, fmt.Errorf("error parsing metric service url: %w", err)
	}

	// Dialing requires an explicit port. When the address does not carry one,
	// fall back to the default port of the URL scheme.
	addr := u.Host
	if u.Port() == "" {
		switch u.Scheme {
		case "https":
			addr = net.JoinHostPort(u.Hostname(), "443")
		case "http":
			addr = net.JoinHostPort(u.Hostname(), "80")
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
	defer cancel()

	var d net.Dialer
	conn, err := d.DialContext(ctx, "tcp", addr)
	if err != nil {
		// conn is nil on failure, so it must not be closed here.
		return false, fmt.Errorf("connection failed: %w", err)
	}
	defer conn.Close()

	return true, nil
}
Loading