Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Bottlerocket on Neuron Instance types (Inferentia and Trainium) #8173

Merged
merged 3 commits into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 89 additions & 89 deletions pkg/addons/assets/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -1,48 +1,31 @@
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: neuron-device-plugin
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- pods
verbs:
- update
- patch
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
- update
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["update", "patch", "get", "list", "watch"]
- apiGroups: [""]
resources: ["nodes/status"]
verbs: ["patch", "update"]
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: neuron-device-plugin
namespace: kube-system
---
# Source: neuron-helm-chart/templates/device-plugin-rbac.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
Expand All @@ -53,89 +36,106 @@ roleRef:
kind: ClusterRole
name: neuron-device-plugin
subjects:
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
- kind: ServiceAccount
name: neuron-device-plugin
namespace: kube-system
---
# Source: neuron-helm-chart/templates/device-plugin-daemonset.yaml
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin-daemonset
name: neuron-device-plugin
namespace: kube-system
spec:
selector:
matchLabels:
name: neuron-device-plugin-ds
app.kubernetes.io/name: neuron-device-plugin
updateStrategy:
type: RollingUpdate
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
app.kubernetes.io/name: neuron-device-plugin
spec:
serviceAccount: neuron-device-plugin
serviceAccountName: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
- effect: NoSchedule
key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
- effect: NoSchedule
key: sagemaker.amazonaws.com/node-health-status
operator: Equal
value: Unschedulable
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "beta.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- trn1.2xlarge
- trn1.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- trn1.2xlarge
- trn1.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
- trn2.48xlarge
- trn2u.48xlarge
- ml.inf2.xlarge
- ml.inf2.8xlarge
- ml.inf2.24xlarge
- ml.inf2.48xlarge
- ml.trn1.2xlarge
- ml.trn1.32xlarge
- ml.trn1n.32xlarge
- key: eks.amazonaws.com/compute-type
operator: NotIn
values:
- fargate
- hybrid
- auto
containers:
- image: "public.ecr.aws/neuron/neuron-device-plugin:2.1.2.0"
imagePullPolicy: Always
name: k8s-neuron-device-plugin-ctr
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
- image: "public.ecr.aws/neuron/neuron-device-plugin:2.23.30.0"
imagePullPolicy: IfNotPresent
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
- mountPath: /run
name: infa-map
- mountPath: /opt/aws
name: aws-config
readOnly: true
volumes:
- name: device-plugin
hostPath:
- hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
name: device-plugin
- hostPath:
path: /run
name: infa-map
- hostPath:
path: /opt/aws
name: aws-config
40 changes: 24 additions & 16 deletions pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,19 @@ var _ = Describe("GPU instance support", func() {
amiFamily: api.NodeImageFamilyUbuntu2004,
gpuInstanceType: "g4dn.xlarge",
}),
Entry("Bottlerocket", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
expectUnsupportedErr: true,
instanceTypeName: "Inferentia",
Entry("Bottlerocket INF", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
instanceTypeName: "Inferentia",
}),
Entry("Bottlerocket", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
expectUnsupportedErr: true,
instanceTypeName: "Trainium",
Entry("Bottlerocket TRN", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
instanceTypeName: "Trainium",
}),
Entry("Bottlerocket NVIDIA", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "g4dn.xlarge",
}),
)

Expand Down Expand Up @@ -140,14 +142,20 @@ var _ = Describe("GPU instance support", func() {
gpuInstanceType: "g4dn.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
expectUnsupportedErr: true,
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf1.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
expectUnsupportedErr: true,
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "inf2.xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn1.2xlarge",
}),
Entry("Bottlerocket infra", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
gpuInstanceType: "trn2.48xlarge",
}),
Entry("Bottlerocket nvidia", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyBottlerocket,
Expand Down
5 changes: 3 additions & 2 deletions pkg/apis/eksctl.io/v1alpha5/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,12 +738,13 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool

if ng.AMIFamily != NodeImageFamilyAmazonLinux2 &&
ng.AMIFamily != NodeImageFamilyAmazonLinux2023 &&
ng.AMIFamily != NodeImageFamilyBottlerocket &&
ng.AMIFamily != "" {
// Only AL2 and AL2023 support Inferentia hosts.
// Only AL2, AL2023 and Bottlerocket support Inferentia hosts.
if instanceutils.IsInferentiaInstanceType(instanceType) {
return ErrUnsupportedInstanceTypes("Inferentia", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2))
}
// Only AL2 and AL2023 support Trainium hosts.
// Only AL2, AL2023 and Bottlerocket support Trainium hosts.
if instanceutils.IsTrainiumInstanceType(instanceType) {
return ErrUnsupportedInstanceTypes("Trainium", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2))
}
Expand Down
1 change: 1 addition & 0 deletions pkg/cfn/builder/managed_nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes
api.NodeImageFamilyBottlerocket: {
X86x64: ekstypes.AMITypesBottlerocketX8664,
X86Nvidia: ekstypes.AMITypesBottlerocketX8664Nvidia,
X86Neuron: ekstypes.AMITypesBottlerocketX8664,
ARM: ekstypes.AMITypesBottlerocketArm64,
ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia,
},
Expand Down
44 changes: 44 additions & 0 deletions pkg/cfn/builder/managed_nodegroup_ami_type_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,50 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
expectedAMIType: "BOTTLEROCKET_x86_64_NVIDIA",
}),

Entry("Bottlerocket x86 Neuron Inferentia 1 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "inf1.xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Inferentia 2 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "inf2.xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Trainium 1 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "trn1.2xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("Bottlerocket x86 Neuron Trainium 2 Accelerated instance type", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Name: "test",
AMIFamily: api.NodeImageFamilyBottlerocket,
InstanceType: "trn2.48xlarge",
},
},
expectedAMIType: "BOTTLEROCKET_x86_64",
}),

Entry("non-native Ubuntu", amiTypeEntry{
nodeGroup: &api.ManagedNodeGroup{
NodeGroupBase: &api.NodeGroupBase{
Expand Down
6 changes: 4 additions & 2 deletions pkg/utils/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@ func IsNvidiaInstanceType(instanceType string) bool {

// IsInferentiaInstanceType reports whether instanceType belongs to an AWS
// Inferentia family (inf1 or inf2), i.e. an instance type that requires AWS
// Neuron.
//
// Matching is done on the leading family prefix only, so a name carrying a
// leading qualifier (for example "ml.inf2.xlarge") is not matched.
func IsInferentiaInstanceType(instanceType string) bool {
	return strings.HasPrefix(instanceType, "inf1") ||
		strings.HasPrefix(instanceType, "inf2")
}

// IsTrainiumInstanceType reports whether instanceType belongs to an AWS
// Trainium family (trn1 or trn2), i.e. an instance type that requires AWS
// Neuron.
//
// Matching is done on the leading prefix only: "trn1n.32xlarge" matches
// because it starts with "trn1", while a name carrying a leading qualifier
// (for example "ml.trn1.2xlarge") does not.
func IsTrainiumInstanceType(instanceType string) bool {
	return strings.HasPrefix(instanceType, "trn1") ||
		strings.HasPrefix(instanceType, "trn2")
}

// GetSmallestInstanceType returns the smallest instance type in instanceTypes.
Expand Down
Loading