NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been lost. The YAML indentation and the context lines of the
first two hunks are assumptions; confirm them against the target files. The
"index" blob hashes were not recomputed for the revised file contents.

diff --git a/deploy/legacy/prometheus/values.yaml b/deploy/legacy/prometheus/values.yaml
index df5d517bff..790e1550bc 100644
--- a/deploy/legacy/prometheus/values.yaml
+++ b/deploy/legacy/prometheus/values.yaml
@@ -67,3 +67,15 @@ prometheus:
           - source_labels: [__name__]
             action: keep
             regex: (.*)
+    resources:
+      limits:
+        memory: 10Gi
+    enableAdminAPI: true
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          # storageClassName: gp2
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 50Gi
diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go
index 9772320685..1a8f1fd78e 100644
--- a/test/e2e/common/common.go
+++ b/test/e2e/common/common.go
@@ -16,12 +16,14 @@ import (
 )
 
 const (
-	RetinaPort int = 10093
+	RetinaPort     int = 10093
+	PrometheusPort int = 9090
 	// netObsRGtag is used to tag resources created by this test suite
 	NetObsRGtag            = "-e2e-netobs-"
 	KubeSystemNamespace    = "kube-system"
 	TestPodNamespace       = "kube-system-test"
 	AzureAppInsightsKeyEnv = "AZURE_APP_INSIGHTS_KEY"
+	OutputFilePathEnv      = "OUTPUT_FILEPATH"
 )
 
 var (
diff --git a/test/e2e/framework/metrics/query-publish.go b/test/e2e/framework/metrics/query-publish.go
new file mode 100644
index 0000000000..59ebce3519
--- /dev/null
+++ b/test/e2e/framework/metrics/query-publish.go
@@ -0,0 +1,228 @@
+package metrics
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"log"
+	"os"
+	"sync"
+	"time"
+
+	"github.com/microsoft/retina/pkg/telemetry"
+	"github.com/microsoft/retina/test/e2e/common"
+	prom_client "github.com/prometheus/client_golang/api"
+	prom_v1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	prom_model "github.com/prometheus/common/model"
+)
+
+// QueryAndPublish is a background step that runs a Prometheus instant query on
+// a fixed interval and, for every sample returned, publishes an event to
+// Application Insights and/or appends a JSON line to a local file.
+//
+// Prevalidate must run before Run: it reads the Application Insights key and
+// the output file path from the environment.
+type QueryAndPublish struct {
+	// Query is the PromQL expression evaluated on every tick.
+	Query string
+	// Endpoint is the address of the Prometheus HTTP API.
+	Endpoint string
+	// AdditionalTelemetryProperty is handed to the telemetry client and must
+	// contain a "retinaVersion" entry.
+	AdditionalTelemetryProperty map[string]string
+
+	outputFilePath  string
+	appInsightsKey  string
+	telemetryClient *telemetry.TelemetryClient
+	promAPI         prom_v1.API
+
+	stop chan struct{}
+	wg   sync.WaitGroup
+	// runErr is the error that ended the background loop, if any. It is
+	// written before wg.Done and read only after wg.Wait.
+	runErr error
+}
+
+// Run sets up the telemetry and Prometheus clients and starts the background
+// query loop. It returns once the loop is started; a failure inside the loop
+// is reported by Stop.
+func (q *QueryAndPublish) Run() error {
+	const interval = 2 * time.Second
+
+	if q.appInsightsKey != "" {
+		telemetry.InitAppInsights(q.appInsightsKey, q.AdditionalTelemetryProperty["retinaVersion"])
+
+		telemetryClient, err := telemetry.NewAppInsightsTelemetryClient("retina-rate-of-growth", q.AdditionalTelemetryProperty)
+		if err != nil {
+			return fmt.Errorf("error creating telemetry client: %w", err)
+		}
+		q.telemetryClient = telemetryClient
+	}
+
+	// The client only depends on the endpoint, so build it once instead of
+	// on every tick.
+	client, err := prom_client.NewClient(prom_client.Config{Address: q.Endpoint})
+	if err != nil {
+		return fmt.Errorf("error creating prometheus client: %w", err)
+	}
+	q.promAPI = prom_v1.NewAPI(client)
+
+	q.runErr = nil
+	q.stop = make(chan struct{})
+	q.wg.Add(1)
+
+	go func() {
+		// Always release the WaitGroup, including on the error path, so
+		// Stop can never block forever.
+		defer q.wg.Done()
+
+		t := time.NewTicker(interval)
+		defer t.Stop()
+
+		for {
+			// The query runs before the first wait so that metrics are
+			// collected immediately on start.
+			if err := q.getAndPublishMetrics(); err != nil {
+				// Record the error for Stop rather than calling
+				// log.Fatalf, which would exit the whole test binary
+				// and skip the cleanup of the other steps.
+				log.Printf("error getting and publishing metrics: %v", err)
+				q.runErr = err
+				return
+			}
+
+			select {
+			case <-t.C:
+			case <-q.stop:
+				return
+			}
+		}
+	}()
+
+	return nil
+}
+
+// getAndPublishMetrics runs the query once and forwards every returned sample
+// to the configured sinks (Application Insights and/or the output file).
+func (q *QueryAndPublish) getAndPublishMetrics() error {
+	// Bound the query so an unreachable endpoint cannot stall the loop and,
+	// with it, Stop.
+	const queryTimeout = 10 * time.Second
+
+	ctx, cancel := context.WithTimeout(context.Background(), queryTimeout)
+	defer cancel()
+
+	result, warnings, err := q.promAPI.Query(ctx, q.Query, time.Now())
+	if err != nil {
+		return fmt.Errorf("error querying prometheus: %w", err)
+	}
+	if len(warnings) > 0 {
+		log.Println("query warnings: ", warnings)
+	}
+
+	// Query can return any value type depending on the expression; only an
+	// instant vector carries the per-instance samples used below.
+	vector, ok := result.(prom_model.Vector)
+	if !ok {
+		return fmt.Errorf("unexpected query result type %T, want %T", result, prom_model.Vector{})
+	}
+
+	allMetrics := make([]map[string]string, 0, len(vector))
+	for _, sample := range vector {
+		allMetrics = append(allMetrics, map[string]string{
+			"instance":       string(sample.Metric["instance"]),
+			"samplesScraped": sample.Value.String(),
+		})
+	}
+
+	if q.telemetryClient != nil {
+		log.Println("Publishing metrics to AppInsights")
+		for _, metric := range allMetrics {
+			q.telemetryClient.TrackEvent("metrics-scraped", metric)
+		}
+	}
+
+	if q.outputFilePath != "" {
+		log.Println("Writing metrics to file ", q.outputFilePath)
+		if err := q.writeMetricsToFile(allMetrics); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// writeMetricsToFile appends each metric to the output file as one JSON object
+// per line, creating the file if it does not exist.
+func (q *QueryAndPublish) writeMetricsToFile(allMetrics []map[string]string) (err error) {
+	const permissions fs.FileMode = 0o644
+
+	file, err := os.OpenFile(q.outputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, permissions)
+	if err != nil {
+		return fmt.Errorf("error opening output file: %w", err)
+	}
+	defer func() {
+		// A failed Close can hide a write error, so report it unless an
+		// earlier error is already being returned.
+		if closeErr := file.Close(); closeErr != nil && err == nil {
+			err = fmt.Errorf("error closing output file: %w", closeErr)
+		}
+	}()
+
+	// Encode writes the JSON encoding followed by a newline.
+	enc := json.NewEncoder(file)
+	for _, m := range allMetrics {
+		if err = enc.Encode(m); err != nil {
+			return fmt.Errorf("error writing metric to output file: %w", err)
+		}
+	}
+
+	return nil
+}
+
+// Stop ends the background loop, waits for it to exit and flushes telemetry.
+// It returns the error that terminated the loop early, if there was one.
+func (q *QueryAndPublish) Stop() error {
+	// Stop the loop first so nothing is published after telemetry shutdown.
+	// The nil check makes Stop safe when Run was never called or failed.
+	if q.stop != nil {
+		close(q.stop)
+		q.wg.Wait()
+		q.stop = nil
+	}
+
+	// NOTE(review): assumes ShutdownAppInsights is only valid after
+	// InitAppInsights, which Run calls only when a key is set — confirm.
+	if q.telemetryClient != nil {
+		telemetry.ShutdownAppInsights()
+	}
+
+	if q.runErr != nil {
+		return fmt.Errorf("error getting and publishing metrics: %w", q.runErr)
+	}
+	return nil
+}
+
+// Prevalidate checks the step configuration and loads the optional
+// Application Insights key and output file path from the environment.
+func (q *QueryAndPublish) Prevalidate() error {
+	q.appInsightsKey = os.Getenv(common.AzureAppInsightsKeyEnv)
+	if q.appInsightsKey == "" {
+		log.Println("env ", common.AzureAppInsightsKeyEnv, " not provided")
+	}
+
+	if _, ok := q.AdditionalTelemetryProperty["retinaVersion"]; !ok {
+		return errors.New("retinaVersion is required in AdditionalTelemetryProperty")
+	}
+
+	q.outputFilePath = os.Getenv(common.OutputFilePathEnv)
+	if q.outputFilePath == "" {
+		log.Println("Output file path not provided. Metrics will not be written to file")
+		return nil
+	}
+
+	log.Println("Output file path provided: ", q.outputFilePath)
+	return nil
+}
diff --git a/test/e2e/growth_metrics_test.go b/test/e2e/growth_metrics_test.go
new file mode 100644
index 0000000000..1592cdc0fc
--- /dev/null
+++ b/test/e2e/growth_metrics_test.go
@@ -0,0 +1,122 @@
+package retina
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/microsoft/retina/test/e2e/common"
+	"github.com/microsoft/retina/test/e2e/framework/azure"
+	"github.com/microsoft/retina/test/e2e/framework/generic"
+	"github.com/microsoft/retina/test/e2e/framework/helpers"
+	"github.com/microsoft/retina/test/e2e/framework/kubernetes"
+	"github.com/microsoft/retina/test/e2e/framework/metrics"
+	"github.com/microsoft/retina/test/e2e/framework/types"
+	"github.com/stretchr/testify/require"
+)
+
+// GetKubeconfig returns a job with a single azure.GetAKSKubeConfig step for
+// the given cluster, subscription and resource group.
+//
+// NOTE(review): Location is set to the placeholder "why?" — confirm whether
+// the step needs a real location.
+func GetKubeconfig(clusterName, subscriptionID, resourceGroup, kubeConfigFilePath string) *types.Job {
+	job := types.NewJob("Get kubeconfig")
+	job.AddStep(&azure.GetAKSKubeConfig{
+		ClusterName:        clusterName,
+		SubscriptionID:     subscriptionID,
+		ResourceGroupName:  resourceGroup,
+		Location:           "why?",
+		KubeConfigFilePath: kubeConfigFilePath,
+	}, nil)
+	return job
+}
+
+// GrowthTest returns a job that port-forwards to Prometheus, runs the
+// QueryAndPublish step in the background for sampleDuration and then stops
+// both background steps.
+func GrowthTest(additionalTelemetryProperty map[string]string, kubeConfigFilePath string) *types.Job {
+	const (
+		// prometheusPodLabel is used both to select the pod to forward to
+		// and as the optional label affinity.
+		prometheusPodLabel = "app.kubernetes.io/instance=prometheus-kube-prometheus-prometheus"
+		portForwardID      = "port-forward"
+		metricsStepID      = "metrics"
+		sampleDuration     = 60 * time.Second
+	)
+	prometheusPort := strconv.Itoa(common.PrometheusPort)
+
+	job := types.NewJob("Growth Test")
+
+	job.AddStep(&kubernetes.PortForward{
+		KubeConfigFilePath:    kubeConfigFilePath,
+		Namespace:             common.KubeSystemNamespace,
+		LabelSelector:         prometheusPodLabel,
+		LocalPort:             prometheusPort,
+		RemotePort:            prometheusPort,
+		Endpoint:              "metrics",
+		OptionalLabelAffinity: prometheusPodLabel,
+	}, &types.StepOptions{
+		SkipSavingParametersToJob: true,
+		RunInBackgroundWithID:     portForwardID,
+	})
+
+	job.AddStep(&metrics.QueryAndPublish{
+		Endpoint:                    "http://localhost:" + prometheusPort,
+		Query:                       `scrape_samples_scraped{job="retina-pods"}`,
+		AdditionalTelemetryProperty: additionalTelemetryProperty,
+	}, &types.StepOptions{
+		SkipSavingParametersToJob: true,
+		RunInBackgroundWithID:     metricsStepID,
+	})
+
+	job.AddStep(&types.Sleep{Duration: sampleDuration}, nil)
+
+	// Stop the metrics step before the port-forward it depends on.
+	job.AddStep(&types.Stop{BackgroundID: metricsStepID}, nil)
+	job.AddStep(&types.Stop{BackgroundID: portForwardID}, nil)
+
+	return job
+}
+
+// Test_GrowthOfMetrics fetches the cluster kubeconfig and then runs the
+// growth job against it. It requires AZURE_SUBSCRIPTION_ID and the variable
+// named by generic.DefaultTagEnv to be set.
+func Test_GrowthOfMetrics(t *testing.T) {
+	ctx, cancel := helpers.Context(t)
+	defer cancel()
+
+	clusterName := common.ClusterNameForE2ETest(t)
+
+	subID := os.Getenv("AZURE_SUBSCRIPTION_ID")
+	require.NotEmpty(t, subID, "AZURE_SUBSCRIPTION_ID must be set")
+
+	rg := os.Getenv("AZURE_RESOURCE_GROUP")
+	if rg == "" {
+		// Use the cluster name as the resource group name by default.
+		rg = clusterName
+	}
+
+	retinaVersion := os.Getenv(generic.DefaultTagEnv)
+	require.NotEmpty(t, retinaVersion, "%s must be set", generic.DefaultTagEnv)
+
+	additionalTelemetryProperty := map[string]string{
+		"retinaVersion": retinaVersion,
+		"clusterName":   clusterName,
+		"resourceGroup": rg,
+	}
+
+	cwd, err := os.Getwd()
+	require.NoError(t, err)
+
+	rootDir := filepath.Dir(filepath.Dir(cwd))
+	kubeConfigFilePath := filepath.Join(rootDir, "test", "e2e", "test.pem")
+
+	getKubeconfig := types.NewRunner(t, GetKubeconfig(clusterName, subID, rg, kubeConfigFilePath))
+	getKubeconfig.Run(ctx)
+
+	growth := types.NewRunner(t, GrowthTest(additionalTelemetryProperty, kubeConfigFilePath))
+	growth.Run(ctx)
+}