Skip to content

Commit

Permalink
Enable Bottlerocket on Neuron Instance types (Inferentia and Trainium) (#8173)
Browse files Browse the repository at this point in the history

* feat: Add support for Bottlerocket AMIs on Neuron Instances

* tests: Update unit-tests to support Bottlerocket on Neuron

* assets: Update neuron-device-plugin
  • Loading branch information
vigh-m authored Feb 4, 2025
1 parent d080f48 commit a3259d6
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 109 deletions.
178 changes: 89 additions & 89 deletions pkg/addons/assets/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -1,48 +1,31 @@
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- pods
verbs:
- update
- patch
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
- update
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["update", "patch", "get", "list", "watch"]
- apiGroups: [""]
resources: ["nodes/status"]
verbs: ["patch", "update"]
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: neuron-device-plugin
namespace: kube-system
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
Expand All @@ -53,89 +36,106 @@ roleRef:
kind: ClusterRole
name: neuron-device-plugin
subjects:
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
---
# Source: neuron-helm-chart/templates/device-plugin-daemonset.yaml
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin-daemonset
name: neuron-device-plugin
namespace: kube-system
spec:
selector:
matchLabels:
name: neuron-device-plugin-ds
app.kubernetes.io/name: neuron-device-plugin
updateStrategy:
type: RollingUpdate
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
app.kubernetes.io/name: neuron-device-plugin
spec:
serviceAccount: neuron-device-plugin
serviceAccountName: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
- effect: NoSchedule
key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
- effect: NoSchedule
key: sagemaker.amazonaws.com/node-health-status
operator: Equal
value: Unschedulable
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "beta.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- trn1.2xlarge
- trn1.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- trn1.2xlarge
- trn1.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
- trn2.48xlarge
- trn2u.48xlarge
- ml.inf2.xlarge
- ml.inf2.8xlarge
- ml.inf2.24xlarge
- ml.inf2.48xlarge
- ml.trn1.2xlarge
- ml.trn1.32xlarge
- ml.trn1n.32xlarge
- key: eks.amazonaws.com/compute-type
operator: NotIn
values:
- fargate
- hybrid
- auto
containers:
- image: "public.ecr.aws/neuron/neuron-device-plugin:2.1.2.0"
imagePullPolicy: Always
name: k8s-neuron-device-plugin-ctr
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
- image: "public.ecr.aws/neuron/neuron-device-plugin:2.23.30.0"
imagePullPolicy: IfNotPresent
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
- mountPath: /run
name: infa-map
- mountPath: /opt/aws
name: aws-config
readOnly: true
volumes:
- name: device-plugin
hostPath:
- hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
name: device-plugin
- hostPath:
path: /run
name: infa-map
- hostPath:
path: /opt/aws
name: aws-config
40 changes: 24 additions & 16 deletions pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,19 @@ var _ = Describe("GPU instance support", func() {
amiFamily: api.NodeImageFamilyUbuntu2004,
gpuInstanceType: "g4dn.xlarge",
}),
Entry("Bottlerocket", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
expectUnsupportedErr: true,
instanceTypeName: "Inferentia",
Entry("Bottlerocket INF", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
instanceTypeName: "Inferentia",
}),
Entry("Bottlerocket", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
expectUnsupportedErr: true,
instanceTypeName: "Trainium",
Entry("Bottlerocket TRN", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
instanceTypeName: "Trainium",
}),
Entry("Bottlerocket NVIDIA", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "g4dn.xlarge",
}),
)

Expand Down Expand Up @@ -140,14 +142,20 @@ var _ = Describe("GPU instance support", func() {
gpuInstanceType: "g4dn.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
expectUnsupportedErr: true,
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
expectUnsupportedErr: true,
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf2.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn2.48xlarge",
}),
Entry("Bottlerocket nvidia", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
Expand Down
5 changes: 3 additions & 2 deletions pkg/apis/eksctl.io/v1alpha5/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,12 +738,13 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool

if ng.AMIFamily != NodeImageFamilyAmazonLinux2 &&
ng.AMIFamily != NodeImageFamilyAmazonLinux2023 &&
ng.AMIFamily != NodeImageFamilyBottlerocket &&
ng.AMIFamily != "" {
// Only AL2 and AL2023 support Inferentia hosts.
// Only AL2, AL2023 and Bottlerocket support Inferentia hosts.
if instanceutils.IsInferentiaInstanceType(instanceType) {
return ErrUnsupportedInstanceTypes("Inferentia", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2))
}
// Only AL2 and AL2023 support Trainium hosts.
// Only AL2, AL2023 and Bottlerocket support Trainium hosts.
if instanceutils.IsTrainiumInstanceType(instanceType) {
return ErrUnsupportedInstanceTypes("Trainium", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2))
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cfn/builder/managed_nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes
api.NodeImageFamilyBottlerocket: {
X86x64: ekstypes.AMITypesBottlerocketX8664,
X86Nvidia: ekstypes.AMITypesBottlerocketX8664Nvidia,
X86Neuron: ekstypes.AMITypesBottlerocketX8664,
ARM: ekstypes.AMITypesBottlerocketArm64,
ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia,
},
Expand Down
44 changes: 44 additions & 0 deletions pkg/cfn/builder/managed_nodegroup_ami_type_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,50 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
expectedAMIType: "BOTTLEROCKET_x86_64_NVIDIA",
}),

Entry("Bottlerocket x86 Neuron Inferentia 1 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "inf1.xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Inferentia 2 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "inf2.xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Trainium 1 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "trn1.2xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Trainium 2 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "trn2.48xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("non-native Ubuntu", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Expand Down
6 changes: 4 additions & 2 deletions pkg/utils/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@ func IsNvidiaInstanceType(instanceType string) bool {

// IsInferentiaInstanceType returns true if the instance type requires AWS Neuron
func IsInferentiaInstanceType(instanceType string) bool {
return strings.HasPrefix(instanceType, "inf1")
return strings.HasPrefix(instanceType, "inf1") ||
strings.HasPrefix(instanceType, "inf2")
}

// IsTrainiumInstanceType returns true if the instance type requires AWS Neuron
func IsTrainiumInstanceType(instanceType string) bool {
return strings.HasPrefix(instanceType, "trn1")
return strings.HasPrefix(instanceType, "trn1") ||
strings.HasPrefix(instanceType, "trn2")
}

// GetSmallestInstanceType returns the smallest instance type in instanceTypes.
Expand Down

0 comments on commit a3259d6

Please sign in to comment.