@@ -2,22 +2,24 @@ package addon
22
33import (
44 "context"
5+ _ "embed"
56 "fmt"
7+ "strings"
68 "time"
79
810 "github.com/aws/aws-sdk-go-v2/service/eks"
911 "github.com/go-logr/logr"
1012 "k8s.io/apimachinery/pkg/util/wait"
11- clientgo "k8s.io/client-go/kubernetes"
1213 "k8s.io/client-go/rest"
1314
1415 "github.com/aws/eks-hybrid/test/e2e/commands"
1516 "github.com/aws/eks-hybrid/test/e2e/kubernetes"
17+ peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
1618)
1719
1820type NvidiaDevicePluginTest struct {
1921 Cluster string
20- K8S clientgo. Interface
22+ K8S peeredtypes. K8s
2123 EKSClient * eks.Client
2224 K8SConfig * rest.Config
2325 Logger logr.Logger
@@ -31,8 +33,15 @@ const (
3133 nodeWaitTimeout = 5 * time .Minute
3234 nvidiaDriverWaitTimeout = 20 * time .Minute
3335 nvidiaDriverWaitInterval = 1 * time .Minute
36+ testPodName = "gpu-pod"
3437)
3538
// devicePluginYaml holds the contents of
// testdata/nvidia-device-plugin-v0.17.1.yaml, embedded into the binary at
// build time. Create and Delete parse it into Kubernetes objects.
39+ //go:embed testdata/nvidia-device-plugin-v0.17.1.yaml
40+ var devicePluginYaml []byte
41+
// gpuPodYaml holds the contents of testdata/gpu-pod.yaml, embedded into the
// binary at build time. Validate parses and applies it as the GPU test pod.
42+ //go:embed testdata/gpu-pod.yaml
43+ var gpuPodYaml []byte
44+
3645// WaitForNvidiaDriverReady checks if nvidia-smi command succeeds on the node
3746func (n * NvidiaDevicePluginTest ) WaitForNvidiaDriverReady (ctx context.Context ) error {
3847 node , err := kubernetes .WaitForNode (ctx , n .K8S , n .NodeName , n .Logger )
@@ -58,3 +67,63 @@ func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) e
5867
5968 return nil
6069}
70+
71+ func (n * NvidiaDevicePluginTest ) Create (ctx context.Context ) error {
72+ objs , err := kubernetes .YamlToUnstructured (devicePluginYaml )
73+ if err != nil {
74+ return fmt .Errorf ("failed to read device plugin yaml file: %w" , err )
75+ }
76+
77+ n .Logger .Info ("Applying device plugin yaml" )
78+
79+ if err := kubernetes .UpsertManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
80+ return fmt .Errorf ("failed to deploy device plugin: %w" , err )
81+ }
82+ return nil
83+ }
84+
85+ func (n * NvidiaDevicePluginTest ) Validate (ctx context.Context ) error {
86+ objs , err := kubernetes .YamlToUnstructured (gpuPodYaml )
87+ if err != nil {
88+ return fmt .Errorf ("failed to read gpu yaml file: %w" , err )
89+ }
90+
91+ n .Logger .Info ("Applying gpu pod yaml" )
92+
93+ if err := kubernetes .UpsertManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
94+ return fmt .Errorf ("failed to deploy gpu pod: %w" , err )
95+ }
96+
97+ if err := kubernetes .WaitForPodToBeCompleted (ctx , n .K8S , testPodName , namespace ); err != nil {
98+ return fmt .Errorf ("failed to wait for gpu pod to be completed: %w" , err )
99+ }
100+
101+ logs , err := kubernetes .FetchLogs (ctx , n .K8S , testPodName , namespace )
102+ if err != nil {
103+ return fmt .Errorf ("failed to fetch logs for gpu pod: %w" , err )
104+ }
105+
106+ if ! strings .Contains (logs , "Test PASSED" ) {
107+ return fmt .Errorf ("gpu pod test failed: %s" , logs )
108+ }
109+
110+ if err := kubernetes .DeleteManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
111+ return fmt .Errorf ("failed to delete gpu pod: %w" , err )
112+ }
113+
114+ return nil
115+ }
116+
117+ func (n * NvidiaDevicePluginTest ) Delete (ctx context.Context ) error {
118+ objs , err := kubernetes .YamlToUnstructured (devicePluginYaml )
119+ if err != nil {
120+ return fmt .Errorf ("failed to read device plugin yaml file: %w" , err )
121+ }
122+
123+ n .Logger .Info ("Deleting device plugin yaml" )
124+ if err := kubernetes .DeleteManifestsWithRetries (ctx , n .K8S , objs ); err != nil {
125+ return fmt .Errorf ("failed to delete device plugin: %w" , err )
126+ }
127+
128+ return nil
129+ }
0 commit comments