@@ -2,22 +2,24 @@ package addon
22
33import (
44 "context"
5+ _ "embed"
56 "fmt"
7+ "strings"
68 "time"
79
810 "github.com/aws/aws-sdk-go-v2/service/eks"
911 "github.com/go-logr/logr"
1012 "k8s.io/apimachinery/pkg/util/wait"
11- clientgo "k8s.io/client-go/kubernetes"
1213 "k8s.io/client-go/rest"
1314
1415 "github.com/aws/eks-hybrid/test/e2e/commands"
1516 "github.com/aws/eks-hybrid/test/e2e/kubernetes"
17+ peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1618)
1719
1820type NvidiaDevicePluginTest struct {
1921 Cluster string
20- K8S clientgo. Interface
22+ K8S peeredtypes. K8s
2123 EKSClient * eks.Client
2224 K8SConfig * rest.Config
2325 Logger logr.Logger
@@ -30,8 +32,15 @@ const (
3032 nodeWaitTimeout = 5 * time .Minute
3133 nvidiaDriverWaitTimeout = 20 * time .Minute
3234 nvidiaDriverWaitInterval = 1 * time .Minute
35+ testPodName = "gpu-pod"
3336)
3437
38+ //go:embed testdata/nvidia-device-plugin-v0.17.1.yaml
39+ var devicePluginYaml []byte
40+
41+ //go:embed testdata/gpu-pod.yaml
42+ var gpuPodYaml []byte
43+
// WaitForNvidiaDriverReady checks if the nvidia-smi command succeeds on the node.
3645func (n * NvidiaDevicePluginTest ) WaitForNvidiaDriverReady (ctx context.Context ) error {
3746 node , err := kubernetes .WaitForNode (ctx , n .K8S , n .NodeName , n .Logger )
@@ -57,3 +66,63 @@ func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) e
5766
5867 return nil
5968}
69+
70+ func (n * NvidiaDevicePluginTest ) Create (ctx context.Context ) error {
71+ objs , err := kubernetes .YamlToUnstructured (devicePluginYaml )
72+ if err != nil {
73+ return fmt .Errorf ("failed to read device plugin yaml file: %w" , err )
74+ }
75+
76+ n .Logger .Info ("Applying device plugin yaml" )
77+
78+ if err := kubernetes .UpsertManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
79+ return fmt .Errorf ("failed to deploy device plugin: %w" , err )
80+ }
81+ return nil
82+ }
83+
84+ func (n * NvidiaDevicePluginTest ) Validate (ctx context.Context ) error {
85+ objs , err := kubernetes .YamlToUnstructured (gpuPodYaml )
86+ if err != nil {
87+ return fmt .Errorf ("failed to read gpu yaml file: %w" , err )
88+ }
89+
90+ n .Logger .Info ("Applying gpu pod yaml" )
91+
92+ if err := kubernetes .UpsertManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
93+ return fmt .Errorf ("failed to deploy gpu pod: %w" , err )
94+ }
95+
96+ if err := kubernetes .WaitForPodToBeCompleted (ctx , n .K8S , testPodName , namespace ); err != nil {
97+ return fmt .Errorf ("failed to wait for gpu pod to be completed: %w" , err )
98+ }
99+
100+ logs , err := kubernetes .FetchLogs (ctx , n .K8S , testPodName , namespace )
101+ if err != nil {
102+ return fmt .Errorf ("failed to fetch logs for gpu pod: %w" , err )
103+ }
104+
105+ if ! strings .Contains (logs , "Test PASSED" ) {
106+ return fmt .Errorf ("gpu pod test failed: %s" , logs )
107+ }
108+
109+ if err := kubernetes .DeleteManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
110+ return fmt .Errorf ("failed to delete gpu pod: %w" , err )
111+ }
112+
113+ return nil
114+ }
115+
116+ func (n * NvidiaDevicePluginTest ) Delete (ctx context.Context ) error {
117+ objs , err := kubernetes .YamlToUnstructured (devicePluginYaml )
118+ if err != nil {
119+ return fmt .Errorf ("failed to read device plugin yaml file: %w" , err )
120+ }
121+
122+ n .Logger .Info ("Deleting device plugin yaml" )
123+ if err := kubernetes .DeleteManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
124+ return fmt .Errorf ("failed to delete device plugin: %w" , err )
125+ }
126+
127+ return nil
128+ }
0 commit comments