Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmd/e2e-test/node/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
osystem "github.com/aws/eks-hybrid/test/e2e/os"
"github.com/aws/eks-hybrid/test/e2e/peered"
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
"github.com/aws/eks-hybrid/test/e2e/s3"
)

Expand Down Expand Up @@ -206,7 +207,7 @@ func (c *create) Run(log *zap.Logger, opts *cli.GlobalOptions) error {
network := peered.Network{
EC2: ec2Client,
Logger: logger,
K8s: peered.K8s{
K8s: peeredtypes.K8s{
Interface: k8s,
Dynamic: k8sDynamic,
},
Expand Down
73 changes: 71 additions & 2 deletions test/e2e/addon/nvidiadeviceplugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,24 @@ package addon

import (
"context"
_ "embed"
"fmt"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/service/eks"
"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/util/wait"
clientgo "k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/aws/eks-hybrid/test/e2e/commands"
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
)

type NvidiaDevicePluginTest struct {
Cluster string
K8S clientgo.Interface
K8S peeredtypes.K8s
EKSClient *eks.Client
K8SConfig *rest.Config
Logger logr.Logger
Expand All @@ -31,8 +33,15 @@ const (
nodeWaitTimeout = 5 * time.Minute
nvidiaDriverWaitTimeout = 20 * time.Minute
nvidiaDriverWaitInterval = 1 * time.Minute
testPodName = "gpu-pod"
)

// devicePluginYaml holds the contents of
// testdata/nvidia-device-plugin-v0.17.1.yaml, embedded at build time.
//
//go:embed testdata/nvidia-device-plugin-v0.17.1.yaml
var devicePluginYaml []byte

// gpuPodYaml holds the contents of testdata/gpu-pod.yaml, embedded at build
// time.
//
//go:embed testdata/gpu-pod.yaml
var gpuPodYaml []byte

// WaitForNvidiaDriverReady checks if nvidia-smi command succeeds on the node
func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) error {
node, err := kubernetes.WaitForNode(ctx, n.K8S, n.NodeName, n.Logger)
Expand All @@ -58,3 +67,63 @@ func (n *NvidiaDevicePluginTest) WaitForNvidiaDriverReady(ctx context.Context) e

return nil
}

// Create parses the embedded device plugin manifest (devicePluginYaml) and
// upserts the resulting objects into the cluster through n.K8S.
//
// It returns an error if the manifest cannot be parsed or if the upsert fails.
func (n *NvidiaDevicePluginTest) Create(ctx context.Context) error {
	manifests, parseErr := kubernetes.YamlToUnstructured(devicePluginYaml)
	if parseErr != nil {
		return fmt.Errorf("failed to read device plugin yaml file: %w", parseErr)
	}

	n.Logger.Info("Applying device plugin yaml")

	applyErr := kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, manifests)
	if applyErr != nil {
		return fmt.Errorf("failed to deploy device plugin: %w", applyErr)
	}

	return nil
}

// Validate runs the GPU sample pod and verifies that it passes.
//
// It applies the embedded gpu-pod manifest (gpuPodYaml), waits for the pod
// named testPodName to complete, and checks that its logs contain
// "Test PASSED". Once the manifest has been applied, the pod is deleted on
// every exit path so that a failed validation does not leave it behind in the
// cluster.
//
// It returns an error if the manifest cannot be parsed or applied, if the pod
// does not complete, if its logs cannot be fetched or lack the success marker,
// or, when validation itself succeeded, if the pod cannot be deleted.
func (n *NvidiaDevicePluginTest) Validate(ctx context.Context) (err error) {
	objs, err := kubernetes.YamlToUnstructured(gpuPodYaml)
	if err != nil {
		return fmt.Errorf("failed to read gpu yaml file: %w", err)
	}

	n.Logger.Info("Applying gpu pod yaml")

	if err = kubernetes.UpsertManifestsWithRetries(ctx, n.K8S, objs); err != nil {
		return fmt.Errorf("failed to deploy gpu pod: %w", err)
	}

	// From here on the pod may exist, so always attempt to remove it. The
	// deferred function reads and, on the success path, sets the named result.
	defer func() {
		deleteErr := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, objs)
		if deleteErr == nil {
			return
		}
		if err != nil {
			// Keep the validation failure as the returned error; only log the
			// secondary cleanup failure.
			n.Logger.Error(deleteErr, "Failed to delete gpu pod after validation failure")
			return
		}
		err = fmt.Errorf("failed to delete gpu pod: %w", deleteErr)
	}()

	if err = kubernetes.WaitForPodToBeCompleted(ctx, n.K8S, testPodName, namespace); err != nil {
		return fmt.Errorf("failed to wait for gpu pod to be completed: %w", err)
	}

	logs, err := kubernetes.FetchLogs(ctx, n.K8S, testPodName, namespace)
	if err != nil {
		return fmt.Errorf("failed to fetch logs for gpu pod: %w", err)
	}

	if !strings.Contains(logs, "Test PASSED") {
		return fmt.Errorf("gpu pod test failed: %s", logs)
	}

	return nil
}

// Delete parses the embedded device plugin manifest (devicePluginYaml) and
// deletes the objects it describes from the cluster through n.K8S.
//
// It returns an error if the manifest cannot be parsed or if the deletion
// fails.
func (n *NvidiaDevicePluginTest) Delete(ctx context.Context) error {
	manifests, parseErr := kubernetes.YamlToUnstructured(devicePluginYaml)
	if parseErr != nil {
		return fmt.Errorf("failed to read device plugin yaml file: %w", parseErr)
	}

	n.Logger.Info("Deleting device plugin yaml")

	deleteErr := kubernetes.DeleteManifestsWithRetries(ctx, n.K8S, manifests)
	if deleteErr != nil {
		return fmt.Errorf("failed to delete device plugin: %w", deleteErr)
	}

	return nil
}
18 changes: 18 additions & 0 deletions test/e2e/addon/testdata/gpu-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# GPU smoke-test pod for the NVIDIA device plugin e2e test. The test
# (NvidiaDevicePluginTest.Validate) waits for this pod to reach the Succeeded
# phase and then checks its logs for "Test PASSED".
apiVersion: v1
kind: Pod
metadata:
namespace: default
name: gpu-pod
spec:
# The test waits for a single successful run, so the container is never restarted.
restartPolicy: Never
containers:
- name: cuda-container
image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
resources:
limits:
nvidia.com/gpu: 1 # requesting 1 GPU
# Uses the "nvidia" RuntimeClass defined in the device plugin manifest.
runtimeClassName: nvidia
# Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
65 changes: 65 additions & 0 deletions test/e2e/addon/testdata/nvidia-device-plugin-v0.17.1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
---
# RuntimeClass "nvidia", mapped to the "nvidia" handler. The DaemonSet in this
# file and the gpu-pod test manifest both select it via runtimeClassName.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia
handler: nvidia

---
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
spec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
runtimeClassName: nvidia
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
name: nvidia-device-plugin-ctr
env:
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
12 changes: 12 additions & 0 deletions test/e2e/kubernetes/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,18 @@ func WaitForPodsToBeRunning(ctx context.Context, k8s kubernetes.Interface, listO
return nil
}

// WaitForPodToBeCompleted waits, for up to nodePodWaitTimeout, until the named
// pod reports the Succeeded phase (corev1.PodSucceeded).
//
// It returns a wrapped error if the wait ends without the pod succeeding.
//
// NOTE(review): a pod that ends in the Failed phase never satisfies the
// predicate, so presumably the call keeps waiting until the timeout elapses;
// confirm against ik8s.GetAndWait.
func WaitForPodToBeCompleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
	succeeded := func(pod *corev1.Pod) bool {
		if pod == nil {
			return false
		}
		return pod.Status.Phase == corev1.PodSucceeded
	}

	if _, err := ik8s.GetAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), name, succeeded); err != nil {
		return fmt.Errorf("waiting for pod %s in namespace %s to be completed: %w", name, namespace, err)
	}

	return nil
}

func waitForPodToBeDeleted(ctx context.Context, k8s kubernetes.Interface, name, namespace string) error {
_, err := ik8s.ListAndWait(ctx, nodePodWaitTimeout, k8s.CoreV1().Pods(namespace), func(pods *corev1.PodList) bool {
return len(pods.Items) == 0
Expand Down
20 changes: 2 additions & 18 deletions test/e2e/peered/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,17 @@ import (
ec2sdk "github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
clientgo "k8s.io/client-go/kubernetes"

"github.com/aws/eks-hybrid/test/e2e/cni"
"github.com/aws/eks-hybrid/test/e2e/ec2"
"github.com/aws/eks-hybrid/test/e2e/kubernetes"
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
)

type Network struct {
EC2 *ec2sdk.Client
Logger logr.Logger
K8s K8s
K8s peeredtypes.K8s

Cluster *HybridCluster
}
Expand Down Expand Up @@ -85,17 +83,3 @@ func (n *Network) addRoutesForCIDRs(ctx context.Context, instance *PeeredInstanc

return nil
}

// Compile-time checks that K8s satisfies both the typed clientset interface
// and the dynamic client interface.
var (
_ clientgo.Interface = K8s{}
_ dynamic.Interface = K8s{}
)

// K8s bundles a typed Kubernetes clientset with a dynamic client so that a
// single value can be passed where either interface is required.
type K8s struct {
// Interface is the embedded typed clientset; its methods are promoted to K8s.
clientgo.Interface
// Dynamic is the dynamic client that Resource delegates to.
Dynamic dynamic.Interface
}

// Resource implements dynamic.Interface by delegating to the Dynamic client.
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
return k.Dynamic.Resource(resource)
}
21 changes: 21 additions & 0 deletions test/e2e/peered/types/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package types

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
clientgo "k8s.io/client-go/kubernetes"
)

// Compile-time checks that K8s satisfies both the typed clientset interface
// and the dynamic client interface.
var (
_ clientgo.Interface = K8s{}
_ dynamic.Interface = K8s{}
)

// K8s bundles a typed Kubernetes clientset with a dynamic client so that a
// single value can be passed where either interface is required.
type K8s struct {
// Interface is the embedded typed clientset; its methods are promoted to K8s.
clientgo.Interface
// Dynamic is the dynamic client that Resource delegates to.
Dynamic dynamic.Interface
}

// Resource implements dynamic.Interface by forwarding the request for the
// given group/version/resource to the wrapped Dynamic client.
func (k K8s) Resource(resource schema.GroupVersionResource) dynamic.NamespaceableResourceInterface {
	dyn := k.Dynamic
	return dyn.Resource(resource)
}
3 changes: 3 additions & 0 deletions test/e2e/suite/addons/addons_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ var _ = Describe("Hybrid Nodes", func() {
addonEc2Test.Logger.Info("Checking NVIDIA drivers on pre-created GPU node", "nodeName", standardLinuxGPUNodeName)
devicePluginTest := addonEc2Test.NewNvidiaDevicePluginTest(standardLinuxGPUNodeName)
Expect(devicePluginTest.WaitForNvidiaDriverReady(ctx)).NotTo(HaveOccurred(), "NVIDIA drivers should be ready")
Expect(devicePluginTest.Create(ctx)).To(Succeed(), "nvidia device plugin should have created successfully")
Expect(devicePluginTest.Validate(ctx)).To(Succeed(), "nvidia device plugin should have been validated successfully")
Expect(devicePluginTest.Delete(ctx)).To(Succeed(), "should clean up nvidia device plugin")
}, Label("nvidia-device-plugin"))

It("runs cert manager tests", func(ctx context.Context) {
Expand Down
5 changes: 3 additions & 2 deletions test/e2e/suite/peered_vpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/aws/eks-hybrid/test/e2e/nodeadm"
osystem "github.com/aws/eks-hybrid/test/e2e/os"
"github.com/aws/eks-hybrid/test/e2e/peered"
peeredtypes "github.com/aws/eks-hybrid/test/e2e/peered/types"
"github.com/aws/eks-hybrid/test/e2e/s3"
"github.com/aws/eks-hybrid/test/e2e/ssm"
)
Expand All @@ -59,7 +60,7 @@ type PeeredVPCTest struct {
ec2Client *ec2v2.Client
SSMClient *ssmv2.Client
cfnClient *cloudformation.Client
k8sClient peered.K8s
k8sClient peeredtypes.K8s
K8sClientConfig *rest.Config
s3Client *s3v2.Client
iamClient *iam.Client
Expand Down Expand Up @@ -145,7 +146,7 @@ func BuildPeeredVPCTestForSuite(ctx context.Context, suite *SuiteConfiguration)
return nil, err
}

test.k8sClient = peered.K8s{
test.k8sClient = peeredtypes.K8s{
Interface: k8s,
Dynamic: dynamicK8s,
}
Expand Down
Loading