From d4a0653d475b4623a13c5757b145cd263f14a35d Mon Sep 17 00:00:00 2001
From: Noor Malik <noor.malik@zalando.de>
Date: Fri, 27 Dec 2024 15:58:22 +0100
Subject: [PATCH] add aws-efa-device-plugin manifests

Add an opt-in deployment of the AWS EFA Kubernetes device plugin to
kube-system. Everything is gated on the new config item
aws_efa_device_plugin_enabled, which defaults to "false".

* config-defaults: add aws_efa_device_plugin_enabled,
  aws_efa_device_plugin_cpu and aws_efa_device_plugin_memory. The cpu and
  memory items are used for both the requests and the limits of the
  plugin container.
* admission-control: when the plugin is enabled, add the
  kube-system/aws-efa-k8s-device-plugin service account to the
  privileged-service-accounts entries of the pod security policy config.
* daemonset: DaemonSet based on the upstream eks-charts template. It
  tolerates all NoSchedule and NoExecute taints, is restricted by a
  required node affinity to the listed instance types, and mounts
  /var/lib/kubelet/device-plugins and /dev/infiniband/ from the host.
  The image is still pulled from the upstream ECR registry (see the TODO
  in the manifest).
* rbac: ServiceAccount used by the DaemonSet.
* deletions: delete the DaemonSet and the ServiceAccount after apply
  whenever the plugin is not enabled.

Signed-off-by: Mikkel Oscar Lyderik Larsen <mikkel.larsen@zalando.de>
---
 cluster/config-defaults.yaml                  |   5 +
 .../01-admission-control/config.yaml          |   3 +
 .../aws-efa-device-plugin/daemonset.yaml      | 160 ++++++++++++++++++
 .../manifests/aws-efa-device-plugin/rbac.yaml |  10 ++
 cluster/manifests/deletions.yaml              |   8 +
 5 files changed, 186 insertions(+)
 create mode 100644 cluster/manifests/aws-efa-device-plugin/daemonset.yaml
 create mode 100644 cluster/manifests/aws-efa-device-plugin/rbac.yaml

diff --git a/cluster/config-defaults.yaml b/cluster/config-defaults.yaml
index 9065a794b7..2f4492a1df 100644
--- a/cluster/config-defaults.yaml
+++ b/cluster/config-defaults.yaml
@@ -427,6 +427,11 @@ nvidia_dcgm_exporter_enabled: "false"
 nvidia_dcgm_exporter_cpu: "10m"
 nvidia_dcgm_exporter_memory: "200Mi"
 
+# AWS EFA device plugin
+aws_efa_device_plugin_enabled: "false"
+aws_efa_device_plugin_cpu: "10m"
+aws_efa_device_plugin_memory: "20Mi"
+
 # static egress controller settings
 static_egress_controller_enabled: "true"
 
diff --git a/cluster/manifests/01-admission-control/config.yaml b/cluster/manifests/01-admission-control/config.yaml
index 2ff375b52c..a710a1c34c 100644
--- a/cluster/manifests/01-admission-control/config.yaml
+++ b/cluster/manifests/01-admission-control/config.yaml
@@ -107,6 +107,9 @@ data:
   pod.pod-security-policy.privileged-service-accounts.kube-system_efs-provisioner: ""
 {{- if eq .Cluster.ConfigItems.s3_csi_driver "true" }}
   pod.pod-security-policy.privileged-service-accounts.kube-system_s3-csi-driver: ""
+{{- end }}
+{{- if eq .Cluster.ConfigItems.aws_efa_device_plugin_enabled "true" }}
+  pod.pod-security-policy.privileged-service-accounts.kube-system_aws-efa-k8s-device-plugin: ""
 {{- end }}
   pod.pod-security-policy.privileged-service-accounts.visibility_logging-agent: ""
 {{- range $sa := split .Cluster.ConfigItems.teapot_admission_controller_pod_security_policy_privileged_service_accounts "," }}
diff --git a/cluster/manifests/aws-efa-device-plugin/daemonset.yaml b/cluster/manifests/aws-efa-device-plugin/daemonset.yaml
new file mode 100644
index 0000000000..2a1991f73d
--- /dev/null
+++ b/cluster/manifests/aws-efa-device-plugin/daemonset.yaml
@@ -0,0 +1,160 @@
+{{ if eq .Cluster.ConfigItems.aws_efa_device_plugin_enabled "true" }}
+# source: https://github.com/aws/eks-charts/blob/master/stable/aws-efa-k8s-device-plugin/templates/daemonset.yaml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: aws-efa-k8s-device-plugin
+  namespace: kube-system
+  labels:
+    application: kubernetes
+    component: aws-efa-k8s-device-plugin
+spec:
+  selector:
+    matchLabels:
+      daemonset: aws-efa-k8s-device-plugin
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        daemonset: aws-efa-k8s-device-plugin
+        application: kubernetes
+        component: aws-efa-k8s-device-plugin
+      annotations:
+        logging/destination: "{{.Cluster.ConfigItems.log_destination_infra}}"
+    spec:
+      serviceAccountName: aws-efa-k8s-device-plugin
+      tolerations:
+        - operator: Exists
+          effect: NoExecute
+        - operator: Exists
+          effect: NoSchedule
+      # Use the built-in system-node-critical priority class: the scheduler
+      # places these pods ahead of lower-priority ones and may preempt
+      # lower-priority pods to make room for them on the node.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - m5dn.24xlarge
+                      - m5n.24xlarge
+                      - m5zn.12xlarge
+                      - m6a.48xlarge
+                      - m6i.32xlarge
+                      - m6id.32xlarge
+                      - m6idn.32xlarge
+                      - m6in.32xlarge
+                      - m7a.48xlarge
+                      - m7g.16xlarge
+                      - m7gd.16xlarge
+                      - m7i.48xlarge
+                      - c5n.9xlarge
+                      - c5n.18xlarge
+                      - c6a.48xlarge
+                      - c6gn.16xlarge
+                      - c6i.32xlarge
+                      - c6id.32xlarge
+                      - c6in.32xlarge
+                      - c7a.48xlarge
+                      - c7g.16xlarge
+                      - c7gd.16xlarge
+                      - c7gn.16xlarge
+                      - c7i.48xlarge
+                      - r5dn.24xlarge
+                      - r5n.24xlarge
+                      - r6a.48xlarge
+                      - r6i.32xlarge
+                      - r6idn.32xlarge
+                      - r6in.32xlarge
+                      - r6id.32xlarge
+                      - r7a.48xlarge
+                      - r7g.16xlarge
+                      - r7gd.16xlarge
+                      - r7i.48xlarge
+                      - r7iz.32xlarge
+                      - x2idn.32xlarge
+                      - x2iedn.32xlarge
+                      - x2iezn.12xlarge
+                      - i3en.12xlarge
+                      - i3en.24xlarge
+                      - i4g.16xlarge
+                      - i4i.32xlarge
+                      - im4gn.16xlarge
+                      - dl1.24xlarge
+                      - dl2q.24xlarge
+                      - g4dn.8xlarge
+                      - g4dn.12xlarge
+                      - g4dn.16xlarge
+                      - g5.8xlarge
+                      - g5.12xlarge
+                      - g5.16xlarge
+                      - g5.24xlarge
+                      - g5.48xlarge
+                      - g6.8xlarge
+                      - g6.12xlarge
+                      - g6.16xlarge
+                      - g6.24xlarge
+                      - g6.48xlarge
+                      - g6e.8xlarge
+                      - g6e.12xlarge
+                      - g6e.16xlarge
+                      - g6e.24xlarge
+                      - g6e.48xlarge
+                      - gr6.8xlarge
+                      - inf1.24xlarge
+                      - p3dn.24xlarge
+                      - p4d.24xlarge
+                      - p4de.24xlarge
+                      - p5.48xlarge
+                      - p5e.48xlarge
+                      - p5en.48xlarge
+                      - trn1.32xlarge
+                      - trn1n.32xlarge
+                      - trn2.48xlarge
+                      - vt1.24xlarge
+                      - hpc6a.48xlarge
+                      - hpc6id.32xlarge
+                      - hpc7a.12xlarge
+                      - hpc7a.24xlarge
+                      - hpc7a.48xlarge
+                      - hpc7a.96xlarge
+                      - hpc7g.4xlarge
+                      - hpc7g.8xlarge
+                      - hpc7g.16xlarge
+      hostNetwork: true
+      containers:
+        # TODO: use the image from the Zalando internal ECR
+        - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.5.4
+          name: aws-efa-k8s-device-plugin
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            runAsNonRoot: false
+          resources:
+            requests:
+              cpu: "{{ .Cluster.ConfigItems.aws_efa_device_plugin_cpu }}"
+              memory: "{{ .Cluster.ConfigItems.aws_efa_device_plugin_memory }}"
+            limits:
+              cpu: "{{ .Cluster.ConfigItems.aws_efa_device_plugin_cpu }}"
+              memory: "{{ .Cluster.ConfigItems.aws_efa_device_plugin_memory }}"
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+            - name: infiniband-volume
+              mountPath: /dev/infiniband/
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+        - name: infiniband-volume
+          hostPath:
+            path: /dev/infiniband/
+{{ end }}
diff --git a/cluster/manifests/aws-efa-device-plugin/rbac.yaml b/cluster/manifests/aws-efa-device-plugin/rbac.yaml
new file mode 100644
index 0000000000..d09c437991
--- /dev/null
+++ b/cluster/manifests/aws-efa-device-plugin/rbac.yaml
@@ -0,0 +1,10 @@
+{{ if eq .Cluster.ConfigItems.aws_efa_device_plugin_enabled "true" }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: aws-efa-k8s-device-plugin
+  namespace: kube-system
+  labels:
+    application: kubernetes
+    component: aws-efa-k8s-device-plugin
+{{ end }}
diff --git a/cluster/manifests/deletions.yaml b/cluster/manifests/deletions.yaml
index ed5f25cdc6..ca4971b39f 100644
--- a/cluster/manifests/deletions.yaml
+++ b/cluster/manifests/deletions.yaml
@@ -339,3 +339,11 @@ post_apply:
 - name: kube-janitor
   kind: ClusterRoleBinding
 {{- end }}
+{{- if ne .Cluster.ConfigItems.aws_efa_device_plugin_enabled "true"}}
+- name: aws-efa-k8s-device-plugin
+  kind: DaemonSet
+  namespace: kube-system
+- name: aws-efa-k8s-device-plugin
+  kind: ServiceAccount
+  namespace: kube-system
+{{- end}}