Skip to content

Commit 064c4f1

Browse files
Add tenant-quota collector for opstool environment
1 parent 1dd4faa commit 064c4f1

File tree

20 files changed

+1243
-0
lines changed

20 files changed

+1243
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Target platform for both stages (e.g. linux/amd64); supplied via --build-arg.
ARG PLATFORM

# Builder image
FROM --platform=${PLATFORM} mcr.microsoft.com/oss/go/microsoft/golang:1.24-fips-azurelinux3.0 AS builder

# Set the working directory before anything is copied, so the module manifests
# and the sources copied later share one tree under /app instead of the
# manifests landing in whatever default directory the base image uses.
WORKDIR /app

# Copy only the module manifests first so the dependency download layer stays
# cached until go.mod/go.sum change.
COPY internal/go.mod internal/go.sum internal/
COPY dev-infrastructure/ops-tools/tenant-quota/go.mod dev-infrastructure/ops-tools/tenant-quota/go.sum dev-infrastructure/ops-tools/tenant-quota/
RUN cd dev-infrastructure/ops-tools/tenant-quota && go mod download

# Now bring in the full sources of the collector and the shared internal module.
COPY dev-infrastructure/ops-tools/tenant-quota/ dev-infrastructure/ops-tools/tenant-quota/
COPY internal/ internal/

# Value stamped into internal/version.CommitSHA by the linker flags below.
ARG TAG
# https://github.com/microsoft/go/tree/microsoft/main/eng/doc/fips#build-option-to-require-fips-mode
ENV CGO_ENABLED=1 GOFLAGS='-tags=requirefips'

RUN cd dev-infrastructure/ops-tools/tenant-quota && \
    go build -ldflags="-X github.com/Azure/ARO-HCP/internal/version.CommitSHA=${TAG}" -o tenant-quota-collector .

# Runtime image
FROM --platform=${PLATFORM} mcr.microsoft.com/azurelinux/distroless/base:3.0
# Run as a fixed non-root UID:GID.
USER 65532:65532
WORKDIR /
COPY --from=builder /app/dev-infrastructure/ops-tools/tenant-quota/tenant-quota-collector .
# Commit the image was built from, exposed as the vcs-ref label.
ARG REVISION
LABEL vcs-ref="${REVISION}"
ENTRYPOINT ["/tenant-quota-collector"]
26+
27+
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# Absolute directory of this Makefile; every path below is anchored here so the
# file behaves the same regardless of the directory make is invoked from.
TENANT_QUOTA_COLLECTOR_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))

# Optional shared repo settings; the leading '-' tolerates missing files.
-include $(TENANT_QUOTA_COLLECTOR_DIR)/../../../setup-templatize-env.mk
-include $(TENANT_QUOTA_COLLECTOR_DIR)/../../../.bingo/Variables.mk

# Commit passed to the image build as the REVISION build argument.
ARO_HCP_REVISION = $(shell git rev-parse HEAD)
# Image tag produced by the repo-level generate-tag.sh unless overridden.
ARO_HCP_IMAGE_TAG ?= $(shell DEPLOY_ENV=$(DEPLOY_ENV) $(TENANT_QUOTA_COLLECTOR_DIR)/../../../generate-tag.sh)
ARO_HCP_IMAGE_ACR ?= arohcpsvcdev
ARO_HCP_IMAGE_REGISTRY ?= $(ARO_HCP_IMAGE_ACR).azurecr.io
# Default repository name for tenant-quota (override with
# TENANT_QUOTA_COLLECTOR_IMAGE_REPOSITORY or ARO_HCP_IMAGE_REPOSITORY).
TENANT_QUOTA_COLLECTOR_IMAGE_REPOSITORY ?= tenant-quota-collector
ARO_HCP_IMAGE_REPOSITORY ?= $(TENANT_QUOTA_COLLECTOR_IMAGE_REPOSITORY)
# Final repository name as computed by generate-repo.sh from the baseline name.
TENANT_QUOTA_COLLECTOR_GENERATED_IMAGE_REPOSITORY = $(shell DEPLOY_ENV=$(DEPLOY_ENV) BASELINE_REPO=$(ARO_HCP_IMAGE_REPOSITORY) $(TENANT_QUOTA_COLLECTOR_DIR)/../../../generate-repo.sh)
# Fully qualified image reference: <registry>/<repository>:<tag>.
TENANT_QUOTA_COLLECTOR_TAGGED_IMAGE ?= ${ARO_HCP_IMAGE_REGISTRY}/${TENANT_QUOTA_COLLECTOR_GENERATED_IMAGE_REPOSITORY}:${ARO_HCP_IMAGE_TAG}

# Set a flag when the registry host ends in .azurecr.io (an Azure Container Registry).
ifeq ($(ARO_HCP_IMAGE_REGISTRY:%.azurecr.io=azurecr.io),azurecr.io)
ARO_HCP_IMAGE_REGISTRY_IS_ACR = 1
endif

# Running plain `make` builds the collector binary.
.DEFAULT_GOAL := tenant-quota-collector
21+
22+
# Compile the collector binary into the current directory, stamping the image
# tag into internal/version.CommitSHA. The target is phony because the rule
# lists no source prerequisites: without that, make would consider an existing
# binary up to date forever. `go build` has its own build cache, so re-running
# it is cheap.
tenant-quota-collector:
	go build -ldflags="-X github.com/Azure/ARO-HCP/internal/version.CommitSHA=${ARO_HCP_IMAGE_TAG}" -o tenant-quota-collector .

# Run the binary from the current directory (it is not rebuilt first).
# NOTE(review): CONFIG_PATH points into deploy/templates/, which looks like a
# Helm template directory - confirm the collector can read that file as-is.
run:
	CONFIG_PATH=./deploy/templates/configmap.yaml ./tenant-quota-collector

# Remove the locally built binary.
clean:
	rm -f tenant-quota-collector

# None of these are files to be tracked by timestamp; declaring them phony also
# keeps a stray file named `run` or `clean` from disabling the target.
.PHONY: tenant-quota-collector run clean
30+
31+
# Build the collector container image with the repository root as the docker
# build context (the Dockerfile copies both internal/ and this tool's sources).
#
# The target is phony, so it always runs; the only prerequisite kept is the
# Dockerfile, anchored to this Makefile's directory, as an existence check.
# A cwd-relative `Dockerfile`/`Makefile` prerequisite would make
# `make -f <path>/Makefile image` fail from any other directory, and listing
# the Go sources via a parse-time `find` adds nothing for a phony target.
image: $(TENANT_QUOTA_COLLECTOR_DIR)/Dockerfile
	cd $(TENANT_QUOTA_COLLECTOR_DIR)/../../.. && \
	docker build . --file dev-infrastructure/ops-tools/tenant-quota/Dockerfile \
		--build-arg PLATFORM=linux/amd64 \
		--build-arg REVISION=${ARO_HCP_REVISION} \
		--build-arg TAG=${ARO_HCP_IMAGE_TAG} \
		--tag ${TENANT_QUOTA_COLLECTOR_TAGGED_IMAGE}
.PHONY: image
39+
40+
# Build the image, log in to the Azure Container Registry and push the tagged
# image. The recipe only uses `az` and `docker`, so the image is the only
# prerequisite (the ORAS tool was listed before but never invoked).
build-and-push: image
	az acr login --name ${ARO_HCP_IMAGE_ACR}
	docker push ${TENANT_QUOTA_COLLECTOR_TAGGED_IMAGE}
.PHONY: build-and-push
44+
45+
# Required environment variables for opstool deployment
46+
# OPSTOOL_RG: Resource group where opstool cluster is deployed (e.g., hcp-underlay-opstool-uksouth-svc)
47+
# OPSTOOL_SUBSCRIPTION_ID: Subscription ID (opstool subscription)
48+
# OPSTOOL_CLUSTER_NAME: AKS cluster name (e.g., opstool-uksouth-svc-1)
49+
# OPSTOOL_KEYVAULT_NAME: Service Key Vault name (e.g., aro-hcp-dev-svc-kv)
50+
# OPSTOOL_KEYVAULT_RG: Service Key Vault resource group (e.g., global)
51+
#
52+
# Optional: OPSTOOL_VALUES_FILE - Path to custom values.yaml file for tenant configuration
53+
# If not set, uses default values.yaml with RedHat0 tenant
54+
# To add multiple tenants, create a custom values.yaml or use --set flags
55+
56+
# Pipeline targets - use environment variables from pipeline system
57+
# Ensure the `tenant-quota-collector` user-assigned managed identity exists in
# OPSTOOL_RG / OPSTOOL_SUBSCRIPTION_ID: it is created only when `az identity
# show` cannot find it, so the target is safe to re-run.
.PHONY: tenant-quota-create-identity
tenant-quota-create-identity:
	@test -n "${OPSTOOL_RG}" || { echo ">> OPSTOOL_RG is not set"; exit 1; }
	@test -n "${OPSTOOL_SUBSCRIPTION_ID}" || { echo ">> OPSTOOL_SUBSCRIPTION_ID is not set"; exit 1; }
	@echo "Creating tenant-quota-collector managed identity..."
	@if ! az identity show --name tenant-quota-collector --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} >/dev/null 2>&1; then \
		az identity create --name tenant-quota-collector --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} && \
		echo "✅ Identity created successfully"; \
	else \
		echo "✅ Identity already exists"; \
	fi
71+
72+
# Grant the tenant-quota-collector managed identity the "Key Vault Secrets User"
# role on the service Key Vault.
#
# Requires: OPSTOOL_RG, OPSTOOL_SUBSCRIPTION_ID, OPSTOOL_KEYVAULT_NAME,
#           OPSTOOL_KEYVAULT_RG.
#
# The recipe fails (instead of reporting success) when the identity or vault
# cannot be resolved or when the role assignment cannot be created. Re-runs are
# handled by checking for an existing assignment first rather than by ignoring
# every error from `az role assignment create`.
tenant-quota-kv-permissions:
	@[ "${OPSTOOL_RG}" ] || ( echo ">> OPSTOOL_RG is not set"; exit 1 )
	@[ "${OPSTOOL_SUBSCRIPTION_ID}" ] || ( echo ">> OPSTOOL_SUBSCRIPTION_ID is not set"; exit 1 )
	@[ "${OPSTOOL_KEYVAULT_NAME}" ] || ( echo ">> OPSTOOL_KEYVAULT_NAME is not set"; exit 1 )
	@[ "${OPSTOOL_KEYVAULT_RG}" ] || ( echo ">> OPSTOOL_KEYVAULT_RG is not set"; exit 1 )
	@echo "Granting Key Vault permissions..."
	@set -e; \
	PRINCIPAL_ID=$$(az identity show --name tenant-quota-collector --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} --query principalId -o tsv); \
	KV_RESOURCE_ID=$$(az keyvault show --name ${OPSTOOL_KEYVAULT_NAME} --resource-group ${OPSTOOL_KEYVAULT_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} --query id -o tsv); \
	if [ -z "$$PRINCIPAL_ID" ] || [ -z "$$KV_RESOURCE_ID" ]; then \
		echo "ERROR: Failed to resolve the managed identity principal or the Key Vault resource ID"; \
		exit 1; \
	fi; \
	EXISTING=$$(az role assignment list \
		--role "Key Vault Secrets User" \
		--assignee "$$PRINCIPAL_ID" \
		--scope "$$KV_RESOURCE_ID" \
		--query '[0].id' -o tsv \
		--only-show-errors); \
	if [ -n "$$EXISTING" ]; then \
		echo "✅ Key Vault permissions already granted"; \
	else \
		az role assignment create \
			--role "Key Vault Secrets User" \
			--assignee-object-id "$$PRINCIPAL_ID" \
			--assignee-principal-type ServicePrincipal \
			--scope "$$KV_RESOURCE_ID" \
			--only-show-errors >/dev/null; \
		echo "✅ Key Vault permissions granted"; \
	fi
.PHONY: tenant-quota-kv-permissions
87+
88+
# Create the federated identity credential that lets the Kubernetes service
# account tenant-quota/tenant-quota-collector use the managed identity.
#
# Requires: OPSTOOL_RG, OPSTOOL_SUBSCRIPTION_ID, OPSTOOL_CLUSTER_NAME.
#
# The OIDC issuer URL is read from the cluster through a throw-away kubeconfig.
# `set -e` makes any failing step fail the target (previously the final `rm -f`
# decided the exit status, so a failed credential creation still exited 0), and
# the EXIT trap removes the temporary kubeconfig on every exit path.
tenant-quota-federate-identity:
	@[ "${OPSTOOL_RG}" ] || ( echo ">> OPSTOOL_RG is not set"; exit 1 )
	@[ "${OPSTOOL_SUBSCRIPTION_ID}" ] || ( echo ">> OPSTOOL_SUBSCRIPTION_ID is not set"; exit 1 )
	@[ "${OPSTOOL_CLUSTER_NAME}" ] || ( echo ">> OPSTOOL_CLUSTER_NAME is not set"; exit 1 )
	@echo "Creating federated identity credential..."
	@set -e; \
	KUBECONFIG=$$(mktemp); \
	trap 'rm -f "$$KUBECONFIG"' EXIT; \
	az aks get-credentials --name ${OPSTOOL_CLUSTER_NAME} --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} --file "$$KUBECONFIG" --overwrite-existing --only-show-errors >/dev/null; \
	kubelogin convert-kubeconfig -l azurecli --kubeconfig "$$KUBECONFIG" >/dev/null; \
	ISSUER_URL=$$(kubectl get --raw /.well-known/openid-configuration --kubeconfig "$$KUBECONFIG" | grep -o '"issuer":"[^"]*' | cut -d'"' -f4); \
	if [ -z "$$ISSUER_URL" ]; then \
		echo "ERROR: Failed to fetch OIDC issuer URL from cluster"; \
		exit 1; \
	fi; \
	SUBJECT="system:serviceaccount:tenant-quota:tenant-quota-collector"; \
	if az identity federated-credential show \
		--name tenant-quota-collector-fedcred \
		--identity-name tenant-quota-collector \
		--resource-group ${OPSTOOL_RG} \
		--subscription ${OPSTOOL_SUBSCRIPTION_ID} >/dev/null 2>&1; then \
		echo "✅ Federated credential already exists"; \
	else \
		az identity federated-credential create \
			--name tenant-quota-collector-fedcred \
			--identity-name tenant-quota-collector \
			--resource-group ${OPSTOOL_RG} \
			--subscription ${OPSTOOL_SUBSCRIPTION_ID} \
			--issuer "$$ISSUER_URL" \
			--subject "$$SUBJECT" \
			--audience "api://AzureADTokenExchange"; \
		echo "✅ Federated credential created"; \
	fi
.PHONY: tenant-quota-federate-identity
122+
123+
# Build and push the image, then install or upgrade the `tenant-quota` Helm
# release on the opstool AKS cluster.
#
# Requires: OPSTOOL_RG, OPSTOOL_SUBSCRIPTION_ID, OPSTOOL_CLUSTER_NAME,
#           OPSTOOL_KEYVAULT_NAME.
# Optional: OPSTOOL_VALUES_FILE - custom values file; when unset the RedHat0
#           tenant is passed through --set flags.
#
# `set -e` makes a failed lookup or a failed `helm upgrade` fail the target
# (previously the recipe went on to print "Deployment complete!" and exit 0).
# The EXIT trap removes the temporary kubeconfig on every exit path.
# The image registry comes from ARO_HCP_IMAGE_REGISTRY so the chart pulls from
# the same registry the image was tagged for.
tenant-quota-deploy-helm: build-and-push
	@[ "${OPSTOOL_RG}" ] || ( echo ">> OPSTOOL_RG is not set"; exit 1 )
	@[ "${OPSTOOL_SUBSCRIPTION_ID}" ] || ( echo ">> OPSTOOL_SUBSCRIPTION_ID is not set"; exit 1 )
	@[ "${OPSTOOL_CLUSTER_NAME}" ] || ( echo ">> OPSTOOL_CLUSTER_NAME is not set"; exit 1 )
	@[ "${OPSTOOL_KEYVAULT_NAME}" ] || ( echo ">> OPSTOOL_KEYVAULT_NAME is not set"; exit 1 )
	@echo "Deploying tenant-quota-collector..."
	@set -e; \
	CLIENT_ID=$$(az identity show --name tenant-quota-collector --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} --query clientId -o tsv); \
	TENANT_ID=$$(az account show --subscription ${OPSTOOL_SUBSCRIPTION_ID} --query tenantId -o tsv); \
	if [ -z "$$CLIENT_ID" ] || [ -z "$$TENANT_ID" ]; then \
		echo "ERROR: Failed to resolve the managed identity client ID or the tenant ID"; \
		exit 1; \
	fi; \
	echo "✅ Using image: ${TENANT_QUOTA_COLLECTOR_TAGGED_IMAGE}"; \
	KUBECONFIG=$$(mktemp); \
	trap 'rm -f "$$KUBECONFIG"' EXIT; \
	az aks get-credentials --name ${OPSTOOL_CLUSTER_NAME} --resource-group ${OPSTOOL_RG} --subscription ${OPSTOOL_SUBSCRIPTION_ID} --file "$$KUBECONFIG" --overwrite-existing --only-show-errors >/dev/null; \
	kubelogin convert-kubeconfig -l azurecli --kubeconfig "$$KUBECONFIG" >/dev/null; \
	if [ -n "${OPSTOOL_VALUES_FILE}" ]; then \
		echo "Using custom values file: ${OPSTOOL_VALUES_FILE}"; \
		set -- --values "${OPSTOOL_VALUES_FILE}"; \
	else \
		echo "Using default values.yaml (RedHat0 tenant)"; \
		set -- \
			--set 'tenants[0].tenantId=64dc69e4-d083-49fc-9569-ebece1dd1408' \
			--set 'tenants[0].tenantName=RedHat0' \
			--set 'tenants[0].servicePrincipalClientId=1ef710d1-afd7-4bf3-8095-e8126650607f' \
			--set 'tenants[0].keyVaultSecretName=custom-metrics-collector-redhat0-client-secret'; \
	fi; \
	helm upgrade --install tenant-quota $(TENANT_QUOTA_COLLECTOR_DIR)/deploy \
		--namespace tenant-quota \
		--create-namespace \
		--kubeconfig "$$KUBECONFIG" \
		"$$@" \
		--set imageRegistry=${ARO_HCP_IMAGE_REGISTRY} \
		--set imageRepository=${TENANT_QUOTA_COLLECTOR_GENERATED_IMAGE_REPOSITORY} \
		--set imageTag=${ARO_HCP_IMAGE_TAG} \
		--set msiClientId=$$CLIENT_ID \
		--set msiTenantId=$$TENANT_ID \
		--set secretProvider.keyVault=${OPSTOOL_KEYVAULT_NAME} \
		--set secretProvider.msiClientId=$$CLIENT_ID \
		--wait; \
	echo "✅ Deployment complete!"
	@echo ""
	@echo "📝 Note: To add additional tenants, create a custom values.yaml file"
	@echo " with all tenant configurations and set OPSTOOL_VALUES_FILE=<path>"
	@echo " See DEPLOY.md for detailed instructions."
.PHONY: tenant-quota-deploy-helm
176+
177+
# Pipeline entry point: runs the identity, federation, Key Vault and Helm
# targets, then prints a reminder about the Key Vault secrets.
# Environment variables are provided by the pipeline system from config.yaml
.PHONY: tenant-quota-deploy
tenant-quota-deploy: tenant-quota-create-identity tenant-quota-federate-identity tenant-quota-kv-permissions tenant-quota-deploy-helm
	@printf '%s\n' \
		"✅ Tenant quota collector deployment complete!" \
		"" \
		"⚠️ IMPORTANT: Ensure service principal secrets exist in Key Vault before deployment!" \
		" If secrets don't exist, the pod will fail to authenticate."
184+
185+
# Manual deployment target - requires environment variables to be set manually.
# Runs the same four targets as the pipeline entry point, then prints how to
# store the service principal secret with the kv-add-secret.sh script.
.PHONY: opstool-deploy
opstool-deploy: tenant-quota-create-identity tenant-quota-federate-identity tenant-quota-kv-permissions tenant-quota-deploy-helm
	@printf '%s\n' \
		"✅ Opstool deployment complete!" \
		"" \
		"⚠️ IMPORTANT: Ensure service principal secrets exist in Key Vault before deployment!" \
		" If secrets don't exist, the pod will fail to authenticate." \
		" Use the existing script to store secrets:" \
		" ../../scripts/kv-add-secret.sh <kv-name> <rg> <secret-name> <secret-value>" \
		" Example: ../../scripts/kv-add-secret.sh aro-hcp-dev-svc-kv global custom-metrics-collector-redhat0-client-secret \"<sp-client-secret>\""
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Helm chart metadata for the tenant quota collector.
apiVersion: v2
name: tenant-quota-collector
description: "Tenant Quota Collector - collects Azure AD tenant quota metrics from Microsoft Graph API."
# Chart version and packaged application version.
version: "0.1.0"
appVersion: "0.1.0"
# Kubernetes versions this chart may be installed on.
kubeVersion: '>=1.27.0'
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Purpose: Tenant configuration - renders one collector config entry per item in
# .Values.tenants (tenant identity, service principal client ID and the name of
# the Key Vault secret holding its credential).
apiVersion: v1
kind: ConfigMap
metadata:
  name: tenant-quota-config
  namespace: {{ .Release.Namespace }}
data:
  config.yaml: |
    # Tenant Quota Collector Configuration
    # This collector runs in the opstool environment and collects quota from configured tenants.
    #
    # Configuration is synced from Helm values. To add a new tenant:
    # 1. Create a service principal in that tenant with Microsoft Graph API permissions
    # 2. Store the service principal secret in the dev Key Vault (aro-hcp-dev-svc-kv)
    # 3. Update the Helm values (values.yaml or --set) to include the new tenant
    # 4. Redeploy: helm upgrade tenant-quota ./deploy --namespace tenant-quota [--set ...]
    #
    interval: "5m"
    timeout: "30s"
    tenants:
    {{- range $tenant := .Values.tenants }}
      - tenantId: "{{ $tenant.tenantId }}"
        tenantName: "{{ $tenant.tenantName }}"
        servicePrincipalClientId: "{{ $tenant.servicePrincipalClientId }}"
        keyVaultSecretName: "{{ $tenant.keyVaultSecretName }}"
        scope: "https://graph.microsoft.com/.default"
    {{- end }}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Purpose: Main workload - deploys the tenant-quota-collector pod that collects quota metrics from configured tenants
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tenant-quota-collector
  namespace: {{ .Release.Namespace }}
  labels:
    app: tenant-quota-collector
spec:
  replicas: 1
  selector:
    matchLabels:
      app: tenant-quota-collector
  template:
    metadata:
      labels:
        app: tenant-quota-collector
        # Azure AD Workload Identity selects pods by this *label*; set as an
        # annotation it is ignored and no federated token is injected.
        azure.workload.identity/use: "true"
    spec:
      serviceAccountName: tenant-quota-collector
      containers:
        - name: tenant-quota-collector
          # Prefer an immutable digest when provided, otherwise fall back to the tag.
          image: '{{ .Values.imageRegistry }}/{{ .Values.imageRepository }}{{ if .Values.imageDigest }}@{{ .Values.imageDigest }}{{ else if .Values.imageTag }}:{{ .Values.imageTag }}{{ end }}'
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8080
              name: metrics
          env:
            # Path of the config file mounted from the ConfigMap volume below.
            - name: CONFIG_PATH
              value: "/etc/config/config.yaml"
            - name: PORT
              value: "8080"
            - name: LOG_LEVEL
              value: "info"
            # Managed identity client and tenant IDs supplied through Helm values.
            - name: AZURE_CLIENT_ID
              value: "{{ .Values.msiClientId }}"
            - name: AZURE_TENANT_ID
              value: "{{ .Values.msiTenantId }}"
          volumeMounts:
            - name: config
              mountPath: /etc/config
            - name: secrets-store
              mountPath: "/mnt/secrets-store"
              readOnly: true
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "200m"
          # Both probes hit the same /healthz endpoint on the metrics port.
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: tenant-quota-config
        # Secrets mounted through the Secrets Store CSI driver.
        - name: secrets-store
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: tenant-quota-collector-secretprovider
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Purpose: Health check probe - asks the blackbox exporter at
# blackbox-exporter:9115 to probe the collector's /healthz endpoint with the
# http_2xx module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: tenant-quota-collector-probe
  namespace: {{ .Release.Namespace }}
  labels:
    release: arohcp-monitor
spec:
  jobName: tenant-quota-collector-health
  module: http_2xx
  prober:
    url: blackbox-exporter:9115
    path: /probe
  targets:
    staticConfig:
      static:
        - 'tenant-quota-collector:8080/healthz'
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Purpose: Alerting rules - Prometheus alerts on tenant_quota_usage_percentage
# at three thresholds: info (>= 80), warning (>= 90) and critical (>= 95).
# The backtick-quoted strings make Helm emit the inner {{ ... }} literally, so
# they reach Prometheus as alert templates.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: tenant-quota-alerts
  namespace: {{ .Release.Namespace }}
  labels:
    release: arohcp-monitor
spec:
  groups:
    - name: tenant-quota
      rules:
        - alert: TenantQuotaCritical
          expr: tenant_quota_usage_percentage >= 95
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Tenant quota usage is critical"
            description: 'Tenant {{ `{{ $labels.tenant_name }}` }} is at {{ `{{ $value }}` }}% capacity'
        - alert: TenantQuotaWarning
          expr: tenant_quota_usage_percentage >= 90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Tenant quota usage is high"
            description: 'Tenant {{ `{{ $labels.tenant_name }}` }} is at {{ `{{ $value }}` }}% capacity'
        - alert: TenantQuotaInfo
          expr: tenant_quota_usage_percentage >= 80
          for: 15m
          labels:
            severity: info
          annotations:
            summary: "Tenant quota usage is elevated"
            description: 'Tenant {{ `{{ $labels.tenant_name }}` }} is at {{ `{{ $value }}` }}% capacity'

0 commit comments

Comments
 (0)