Skip to content

Commit 7ac7952

Browse files
author
Jun Peng
committed
Validate nvidia device plugin functionality in e2e tests
1 parent ac0bd07 commit 7ac7952

File tree

9 files changed

+197
-23
lines changed

9 files changed

+197
-23
lines changed

cmd/e2e-test/node/create.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
2525
osystem "github.com/aws/eks-hybrid/test/e2e/os"
2626
"github.com/aws/eks-hybrid/test/e2e/peered"
27+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
2728
"github.com/aws/eks-hybrid/test/e2e/s3"
2829
)
2930

@@ -206,7 +207,7 @@ func (c *create) Run(log *zap.Logger, opts *cli.GlobalOptions) error {
206207
network := peered.Network{
207208
EC2: ec2Client,
208209
Logger: logger,
209-
K8s: peered.K8s{
210+
K8s: peeredtypes.K8s{
210211
Interface: k8s,
211212
Dynamic: k8sDynamic,
212213
},

test/e2e/addon/nvidiadeviceplugin.go

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,24 @@ package addon
22

33
import (
44
"context"
5+
_ "embed"
56
"fmt"
7+
"strings"
68
"time"
79

810
"github.com/aws/aws-sdk-go-v2/service/eks"
911
"github.com/go-logr/logr"
1012
"k8s.io/apimachinery/pkg/util/wait"
11-
clientgo "k8s.io/client-go/kubernetes"
1213
"k8s.io/client-go/rest"
1314

1415
"github.com/aws/eks-hybrid/test/e2e/commands"
1516
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
17+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1618
)
1719

1820
type NvidiaDevicePluginTest struct {
1921
Cluster string
20-
K8S clientgo.Interface
22+
K8S peeredtypes.K8s
2123
EKSClient *eks.Client
2224
K8SConfig *rest.Config
2325
Logger logr.Logger
@@ -31,8 +33,15 @@ const (
3133
nodeWaitTimeout = 5 * time.Minute
3234
nvidiaDriverWaitTimeout = 20 * time.Minute
3335
nvidiaDriverWaitInterval = 1 * time.Minute
36+
testPodName = "gpu-pod"
3437
)
3538

39+
//go:embed testdata/nvidia-device-plugin-v0.17.1.yaml
40+
var devicePluginYaml []byte
41+
42+
//go:embed testdata/gpu-pod.yaml
43+
var gpuPodYaml []byte
44+
3645
// WaitForNvidiaDriverReady checks if nvidia-smi command succeeds on the node
3746
func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) error {
3847
node, err := kubernetes.WaitForNode(ctx, n.K8S, n.NodeName, n.Logger)
@@ -58,3 +67,63 @@ func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) e
5867

5968
return nil
6069
}
70+
71+
func (n *NvidiaDevicePluginTest) Create(ctx context.Context) error {
72+
objs, err := kubernetes.YamlToUnstructured(devicePluginYaml)
73+
if err != nil {
74+
return fmt.Errorf("failed to read device plugin yaml file: %w", err)
75+
}
76+
77+
n.Logger.Info("Applying device plugin yaml")
78+
79+
if err := kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, objs); err != nil {
80+
return fmt.Errorf("failed to deploy device plugin: %w", err)
81+
}
82+
return nil
83+
}
84+
85+
func (n *NvidiaDevicePluginTest) Validate(ctx context.Context) error {
86+
objs, err := kubernetes.YamlToUnstructured(gpuPodYaml)
87+
if err != nil {
88+
return fmt.Errorf("failed to read gpu yaml file: %w", err)
89+
}
90+
91+
n.Logger.Info("Applying gpu pod yaml")
92+
93+
if err := kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, objs); err != nil {
94+
return fmt.Errorf("failed to deploy gpu pod: %w", err)
95+
}
96+
97+
if err := kubernetes.WaitForPodToBeCompleted(ctx, n.K8S, testPodName, namespace); err != nil {
98+
return fmt.Errorf("failed to wait for gpu pod to be completed: %w", err)
99+
}
100+
101+
logs, err := kubernetes.FetchLogs(ctx, n.K8S, testPodName, namespace)
102+
if err != nil {
103+
return fmt.Errorf("failed to fetch logs for gpu pod: %w", err)
104+
}
105+
106+
if !strings.Contains(logs, "Test PASSED") {
107+
return fmt.Errorf("gpu pod test failed: %s", logs)
108+
}
109+
110+
if err := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, objs); err != nil {
111+
return fmt.Errorf("failed to delete gpu pod: %w", err)
112+
}
113+
114+
return nil
115+
}
116+
117+
func (n *NvidiaDevicePluginTest) Delete(ctx context.Context) error {
118+
objs, err := kubernetes.YamlToUnstructured(devicePluginYaml)
119+
if err != nil {
120+
return fmt.Errorf("failed to read device plugin yaml file: %w", err)
121+
}
122+
123+
n.Logger.Info("Deleting device plugin yaml")
124+
if err := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, objs); err != nil {
125+
return fmt.Errorf("failed to delete device plugin: %w", err)
126+
}
127+
128+
return nil
129+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
namespace: default
5+
name: gpu-pod
6+
spec:
7+
restartPolicy: Never
8+
containers:
9+
- name: cuda-container
10+
image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
11+
resources:
12+
limits:
13+
nvidia.com/gpu: 1 # requesting 1 GPU
14+
runtimeClassName: nvidia
15+
tolerations:
16+
- key: nvidia.com/gpu
17+
operator: Exists
18+
effect: NoSchedule
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
---
2+
apiVersion: node.k8s.io/v1
3+
kind: RuntimeClass
4+
metadata:
5+
name: nvidia
6+
handler: nvidia
7+
8+
---
9+
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
10+
#
11+
# Licensed under the Apache License, Version 2.0 (the "License");
12+
# you may not use this file except in compliance with the License.
13+
# You may obtain a copy of the License at
14+
#
15+
# http://www.apache.org/licenses/LICENSE-2.0
16+
#
17+
# Unless required by applicable law or agreed to in writing, software
18+
# distributed under the License is distributed on an "AS IS" BASIS,
19+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20+
# See the License for the specific language governing permissions and
21+
# limitations under the License.
22+
23+
apiVersion: apps/v1
24+
kind: DaemonSet
25+
metadata:
26+
name: nvidia-device-plugin-daemonset
27+
namespace: kube-system
28+
spec:
29+
selector:
30+
matchLabels:
31+
name: nvidia-device-plugin-ds
32+
updateStrategy:
33+
type: RollingUpdate
34+
template:
35+
metadata:
36+
labels:
37+
name: nvidia-device-plugin-ds
38+
spec:
39+
tolerations:
40+
- key: nvidia.com/gpu
41+
operator: Exists
42+
effect: NoSchedule
43+
# Mark this pod as a critical add-on; when enabled, the critical add-on
44+
# scheduler reserves resources for critical add-on pods so that they can
45+
# be rescheduled after a failure.
46+
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
47+
priorityClassName: "system-node-critical"
48+
runtimeClassName: nvidia
49+
containers:
50+
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
51+
name: nvidia-device-plugin-ctr
52+
env:
53+
- name: FAIL_ON_INIT_ERROR
54+
value: "false"
55+
securityContext:
56+
allowPrivilegeEscalation: false
57+
capabilities:
58+
drop: ["ALL"]
59+
volumeMounts:
60+
- name: device-plugin
61+
mountPath: /var/lib/kubelet/device-plugins
62+
volumes:
63+
- name: device-plugin
64+
hostPath:
65+
path: /var/lib/kubelet/device-plugins

test/e2e/kubernetes/pod.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ func WaitForPodsToBeRunning(ctx context.Context, k8s kubernetes.Interface, listO
132132
return nil
133133
}
134134

135+
// WaitForPodToBeCompleted waits until the pod is in Completed phase.
136+
func WaitForPodToBeCompleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
137+
_, err := ik8s.GetAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), name, func(pod *corev1.Pod) bool {
138+
return pod != nil && pod.Status.Phase == corev1.PodSucceeded
139+
})
140+
if err != nil {
141+
return fmt.Errorf("waiting for pod %s in namespace %s to be completed: %w", name, namespace, err)
142+
}
143+
144+
return nil
145+
}
146+
135147
func waitForPodToBeDeleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
136148
_, err := ik8s.ListAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), func(pods *corev1.PodList) bool {
137149
return len(pods.Items) == 0

test/e2e/peered/network.go

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,17 @@ import (
99
ec2sdk "github.com/aws/aws-sdk-go-v2/service/ec2"
1010
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
1111
"github.com/go-logr/logr"
12-
"k8s.io/apimachinery/pkg/runtime/schema"
13-
"k8s.io/client-go/dynamic"
14-
clientgo "k8s.io/client-go/kubernetes"
1512

1613
"github.com/aws/eks-hybrid/test/e2e/cni"
1714
"github.com/aws/eks-hybrid/test/e2e/ec2"
1815
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
16+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1917
)
2018

2119
type Network struct {
2220
EC2 *ec2sdk.Client
2321
Logger logr.Logger
24-
K8s K8s
22+
K8s peeredtypes.K8s
2523

2624
Cluster *HybridCluster
2725
}
@@ -85,17 +83,3 @@ func (n *Network) addRoutesForCIDRs(ctx context.Context, instance *PeeredInstanc
8583

8684
return nil
8785
}
88-
89-
var (
90-
_ clientgo.Interface = K8s{}
91-
_ dynamic.Interface = K8s{}
92-
)
93-
94-
type K8s struct {
95-
clientgo.Interface
96-
Dynamic dynamic.Interface
97-
}
98-
99-
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
100-
return k.Dynamic.Resource(resource)
101-
}

test/e2e/peered/types/types.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package types
2+
3+
import (
4+
"k8s.io/apimachinery/pkg/runtime/schema"
5+
"k8s.io/client-go/dynamic"
6+
clientgo "k8s.io/client-go/kubernetes"
7+
)
8+
9+
var (
10+
_ clientgo.Interface = K8s{}
11+
_ dynamic.Interface = K8s{}
12+
)
13+
14+
type K8s struct {
15+
clientgo.Interface
16+
Dynamic dynamic.Interface
17+
}
18+
19+
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
20+
return k.Dynamic.Resource(resource)
21+
}

test/e2e/suite/addons/addons_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,9 @@ var _ = Describe("Hybrid Nodes", func() {
259259
addonEc2Test.Logger.Info("Checking NVIDIA drivers on pre-created GPU node", "nodeName", standardLinuxGPUNodeName)
260260
devicePluginTest := addonEc2Test.NewNvidiaDevicePluginTest(standardLinuxGPUNodeName)
261261
Expect(devicePluginTest.WaitForNvidiaDriverReady(ctx)).NotTo(HaveOccurred(), "NVIDIA drivers should be ready")
262+
Expect(devicePluginTest.Create(ctx)).To(Succeed(), "nvidia device plugin should have created successfully")
263+
Expect(devicePluginTest.Validate(ctx)).To(Succeed(), "nvidia device plugin should have been validated successfully")
264+
Expect(devicePluginTest.Delete(ctx)).To(Succeed(), "should clean up nvidia device plugin")
262265
}, Label("nvidia-device-plugin"))
263266

264267
It("runs cert manager tests", func(ctx context.Context) {

test/e2e/suite/peered_vpc.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/aws/eks-hybrid/test/e2e/nodeadm"
3535
osystem "github.com/aws/eks-hybrid/test/e2e/os"
3636
"github.com/aws/eks-hybrid/test/e2e/peered"
37+
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
3738
"github.com/aws/eks-hybrid/test/e2e/s3"
3839
"github.com/aws/eks-hybrid/test/e2e/ssm"
3940
)
@@ -59,7 +60,7 @@ type PeeredVPCTest struct {
5960
ec2Client *ec2v2.Client
6061
SSMClient *ssmv2.Client
6162
cfnClient *cloudformation.Client
62-
k8sClient peered.K8s
63+
k8sClient peeredtypes.K8s
6364
K8sClientConfig *rest.Config
6465
s3Client *s3v2.Client
6566
iamClient *iam.Client
@@ -145,7 +146,7 @@ func BuildPeeredVPCTestForSuite(ctx context.Context, suite *SuiteConfiguration)
145146
return nil, err
146147
}
147148

148-
test.k8sClient = peered.K8s{
149+
test.k8sClient = peeredtypes.K8s{
149150
Interface: k8s,
150151
Dynamic: dynamicK8s,
151152
}

0 commit comments

Comments
 (0)