Skip to content

Commit 25d3594

Browse files
author
Jun Peng
committed
Validate nvidia device plugin functionality in e2e tests
1 parent 8de2e74 commit 25d3594

File tree

9 files changed

+198
-24
lines changed

9 files changed

+198
-24
lines changed

cmd/e2e-test/node/create.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
2424
osystem "github.com/aws/eks-hybrid/test/e2e/os"
2525
"github.com/aws/eks-hybrid/test/e2e/peered"
26+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
2627
"github.com/aws/eks-hybrid/test/e2e/s3"
2728
)
2829

@@ -198,7 +199,7 @@ func (c *create) Run(log *zap.Logger, opts *cli.GlobalOptions) error {
198199
network := peered.Network{
199200
EC2: ec2Client,
200201
Logger: logger,
201-
K8s: peered.K8s{
202+
K8s: peeredtypes.K8s{
202203
Interface: k8s,
203204
Dynamic: k8sDynamic,
204205
},

test/e2e/addon/nvidiadeviceplugin.go

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,24 @@ package addon
22

33
import (
44
"context"
5+
_ "embed"
56
"fmt"
7+
"strings"
68
"time"
79

810
"github.com/aws/aws-sdk-go-v2/service/eks"
911
"github.com/go-logr/logr"
1012
"k8s.io/apimachinery/pkg/util/wait"
11-
clientgo "k8s.io/client-go/kubernetes"
1213
"k8s.io/client-go/rest"
1314

1415
"github.com/aws/eks-hybrid/test/e2e/commands"
1516
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
17+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1618
)
1719

1820
type NvidiaDevicePluginTest struct {
1921
Cluster string
20-
K8S clientgo.Interface
22+
K8S peeredtypes.K8s
2123
EKSClient *eks.Client
2224
K8SConfig *rest.Config
2325
Logger logr.Logger
@@ -30,8 +32,15 @@ const (
3032
nodeWaitTimeout = 5 * time.Minute
3133
nvidiaDriverWaitTimeout = 20 * time.Minute
3234
nvidiaDriverWaitInterval = 1 * time.Minute
35+
testPodName = "gpu-pod"
3336
)
3437

38+
//go:embed testdata/nvidia-device-plugin-v0.17.1.yaml
39+
var devicePluginYaml []byte
40+
41+
//go:embed testdata/gpu-pod.yaml
42+
var gpuPodYaml []byte
43+
3544
// WaitForNvidiaDriverReady checks if nvidia-smi command succeeds on the node
3645
func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) error {
3746
node, err := kubernetes.WaitForNode(ctx, n.K8S, n.NodeName, n.Logger)
@@ -57,3 +66,63 @@ func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) e
5766

5867
return nil
5968
}
69+
70+
func (n *NvidiaDevicePluginTest) Create(ctx context.Context) error {
71+
objs, err := kubernetes.YamlToUnstructured(devicePluginYaml)
72+
if err != nil {
73+
return fmt.Errorf("failed to read device plugin yaml file: %w", err)
74+
}
75+
76+
n.Logger.Info("Applying device plugin yaml")
77+
78+
if err := kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, objs); err != nil {
79+
return fmt.Errorf("failed to deploy device plugin: %w", err)
80+
}
81+
return nil
82+
}
83+
84+
func (n *NvidiaDevicePluginTest) Validate(ctx context.Context) error {
85+
objs, err := kubernetes.YamlToUnstructured(gpuPodYaml)
86+
if err != nil {
87+
return fmt.Errorf("failed to read gpu yaml file: %w", err)
88+
}
89+
90+
n.Logger.Info("Applying gpu pod yaml")
91+
92+
if err := kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, objs); err != nil {
93+
return fmt.Errorf("failed to deploy gpu pod: %w", err)
94+
}
95+
96+
if err := kubernetes.WaitForPodToBeCompleted(ctx, n.K8S, testPodName, namespace); err != nil {
97+
return fmt.Errorf("failed to wait for gpu pod to be completed: %w", err)
98+
}
99+
100+
logs, err := kubernetes.FetchLogs(ctx, n.K8S, testPodName, namespace)
101+
if err != nil {
102+
return fmt.Errorf("failed to fetch logs for gpu pod: %w", err)
103+
}
104+
105+
if !strings.Contains(logs, "Test PASSED") {
106+
return fmt.Errorf("gpu pod test failed: %s", logs)
107+
}
108+
109+
if err := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, objs); err != nil {
110+
return fmt.Errorf("failed to delete gpu pod: %w", err)
111+
}
112+
113+
return nil
114+
}
115+
116+
func (n *NvidiaDevicePluginTest) Delete(ctx context.Context) error {
117+
objs, err := kubernetes.YamlToUnstructured(devicePluginYaml)
118+
if err != nil {
119+
return fmt.Errorf("failed to read device plugin yaml file: %w", err)
120+
}
121+
122+
n.Logger.Info("Deleting device plugin yaml")
123+
if err := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, objs); err != nil {
124+
return fmt.Errorf("failed to delete device plugin: %w", err)
125+
}
126+
127+
return nil
128+
}
test/e2e/addon/testdata/gpu-pod.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
namespace: default
5+
name: gpu-pod
6+
spec:
7+
restartPolicy: Never
8+
containers:
9+
- name: cuda-container
10+
image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
11+
resources:
12+
limits:
13+
nvidia.com/gpu: 1 # requesting 1 GPU
14+
runtimeClassName: nvidia
15+
tolerations:
16+
- key: nvidia.com/gpu
17+
operator: Exists
18+
effect: NoSchedule
test/e2e/addon/testdata/nvidia-device-plugin-v0.17.1.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
---
2+
apiVersion: node.k8s.io/v1
3+
kind: RuntimeClass
4+
metadata:
5+
name: nvidia
6+
handler: nvidia
7+
8+
---
9+
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
10+
#
11+
# Licensed under the Apache License, Version 2.0 (the "License");
12+
# you may not use this file except in compliance with the License.
13+
# You may obtain a copy of the License at
14+
#
15+
# http://www.apache.org/licenses/LICENSE-2.0
16+
#
17+
# Unless required by applicable law or agreed to in writing, software
18+
# distributed under the License is distributed on an "AS IS" BASIS,
19+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20+
# See the License for the specific language governing permissions and
21+
# limitations under the License.
22+
23+
apiVersion: apps/v1
24+
kind: DaemonSet
25+
metadata:
26+
name: nvidia-device-plugin-daemonset
27+
namespace: kube-system
28+
spec:
29+
selector:
30+
matchLabels:
31+
name: nvidia-device-plugin-ds
32+
updateStrategy:
33+
type: RollingUpdate
34+
template:
35+
metadata:
36+
labels:
37+
name: nvidia-device-plugin-ds
38+
spec:
39+
tolerations:
40+
- key: nvidia.com/gpu
41+
operator: Exists
42+
effect: NoSchedule
43+
# Mark this pod as a critical add-on; when enabled, the critical add-on
44+
# scheduler reserves resources for critical add-on pods so that they can
45+
# be rescheduled after a failure.
46+
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
47+
priorityClassName: "system-node-critical"
48+
runtimeClassName: nvidia
49+
containers:
50+
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
51+
name: nvidia-device-plugin-ctr
52+
env:
53+
- name: FAIL_ON_INIT_ERROR
54+
value: "false"
55+
securityContext:
56+
allowPrivilegeEscalation: false
57+
capabilities:
58+
drop: ["ALL"]
59+
volumeMounts:
60+
- name: device-plugin
61+
mountPath: /var/lib/kubelet/device-plugins
62+
volumes:
63+
- name: device-plugin
64+
hostPath:
65+
path: /var/lib/kubelet/device-plugins

test/e2e/kubernetes/pod.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ func WaitForPodsToBeRunning(ctx context.Context, k8s kubernetes.Interface, listO
132132
return nil
133133
}
134134

135+
// WaitForPodToBeCompleted waits until the pod is in Completed phase.
136+
func WaitForPodToBeCompleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
137+
_, err := ik8s.GetAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), name, func(pod *corev1.Pod) bool {
138+
return pod != nil && pod.Status.Phase == corev1.PodSucceeded
139+
})
140+
if err != nil {
141+
return fmt.Errorf("waiting for pod %s in namespace %s to be completed: %w", name, namespace, err)
142+
}
143+
144+
return nil
145+
}
146+
135147
func waitForPodToBeDeleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
136148
_, err := ik8s.ListAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), func(pods *corev1.PodList) bool {
137149
return len(pods.Items) == 0

test/e2e/peered/network.go

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,17 @@ import (
88
ec2sdk "github.com/aws/aws-sdk-go-v2/service/ec2"
99
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
1010
"github.com/go-logr/logr"
11-
"k8s.io/apimachinery/pkg/runtime/schema"
12-
"k8s.io/client-go/dynamic"
13-
clientgo "k8s.io/client-go/kubernetes"
1411

1512
"github.com/aws/eks-hybrid/test/e2e/cni"
1613
"github.com/aws/eks-hybrid/test/e2e/ec2"
1714
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
15+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1816
)
1917

2018
type Network struct {
2119
EC2 *ec2sdk.Client
2220
Logger logr.Logger
23-
K8s K8s
21+
K8s peeredtypes.K8s
2422

2523
Cluster *HybridCluster
2624
}
@@ -71,17 +69,3 @@ func (n *Network) addRoutesForCIDRs(ctx context.Context, instance ec2.Instance,
7169

7270
return nil
7371
}
74-
75-
var (
76-
_ clientgo.Interface = K8s{}
77-
_ dynamic.Interface = K8s{}
78-
)
79-
80-
type K8s struct {
81-
clientgo.Interface
82-
Dynamic dynamic.Interface
83-
}
84-
85-
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
86-
return k.Dynamic.Resource(resource)
87-
}

test/e2e/peered/types/types.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package types
2+
3+
import (
4+
"k8s.io/apimachinery/pkg/runtime/schema"
5+
"k8s.io/client-go/dynamic"
6+
clientgo "k8s.io/client-go/kubernetes"
7+
)
8+
9+
var (
10+
_ clientgo.Interface = K8s{}
11+
_ dynamic.Interface = K8s{}
12+
)
13+
14+
type K8s struct {
15+
clientgo.Interface
16+
Dynamic dynamic.Interface
17+
}
18+
19+
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
20+
return k.Dynamic.Resource(resource)
21+
}

test/e2e/suite/addons/addons_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,11 @@ var _ = Describe("Hybrid Nodes", func() {
221221

222222
// wait for nvidia drivers to be installed
223223
addonEc2Test.Logger.Info("Checking NVIDIA drivers on node")
224-
devicePluginTest := addonEc2Test.NewNvidiaDevicePluginTest(nodeName)
224+
devicePluginTest := addonEc2Test.NewNvidiaDevicePluginTest(testNode.PeerdNode().Name)
225225
Expect(devicePluginTest.WaitForNvidiaDriverReady(ctx)).NotTo(HaveOccurred(), "NVIDIA drivers should be ready")
226+
Expect(devicePluginTest.Create(ctx)).To(Succeed(), "nvidia device plugin should have created successfully")
227+
Expect(devicePluginTest.Validate(ctx)).To(Succeed(), "nvidia device plugin should have been validated successfully")
228+
Expect(devicePluginTest.Delete(ctx)).To(Succeed(), "should clean up nvidia device plugin")
226229

227230
// clean up node
228231
addonEc2Test.Logger.Info("Resetting hybrid node...")

test/e2e/suite/peered_vpc.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/aws/eks-hybrid/test/e2e/nodeadm"
3434
osystem "github.com/aws/eks-hybrid/test/e2e/os"
3535
"github.com/aws/eks-hybrid/test/e2e/peered"
36+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
3637
"github.com/aws/eks-hybrid/test/e2e/s3"
3738
"github.com/aws/eks-hybrid/test/e2e/ssm"
3839
)
@@ -58,7 +59,7 @@ type PeeredVPCTest struct {
5859
ec2Client *ec2v2.Client
5960
SSMClient *ssmv2.Client
6061
cfnClient *cloudformation.Client
61-
k8sClient peered.K8s
62+
k8sClient peeredtypes.K8s
6263
K8sClientConfig *rest.Config
6364
s3Client *s3v2.Client
6465
iamClient *iam.Client
@@ -142,7 +143,7 @@ func BuildPeeredVPCTestForSuite(ctx context.Context, suite *SuiteConfiguration)
142143
return nil, err
143144
}
144145

145-
test.k8sClient = peered.K8s{
146+
test.k8sClient = peeredtypes.K8s{
146147
Interface: k8s,
147148
Dynamic: dynamicK8s,
148149
}

0 commit comments

Comments
 (0)