@@ -22,6 +22,7 @@ import (
2222 "github.com/aws/eks-hybrid/test/e2e/addon"
2323 "github.com/aws/eks-hybrid/test/e2e/cfn"
2424 "github.com/aws/eks-hybrid/test/e2e/cleanup"
25+ e2eCommands "github.com/aws/eks-hybrid/test/e2e/commands"
2526 "github.com/aws/eks-hybrid/test/e2e/constants"
2627 e2eErrors "github.com/aws/eks-hybrid/test/e2e/errors"
2728 "github.com/aws/eks-hybrid/test/e2e/os"
@@ -352,12 +353,26 @@ func (s *stack) setupJumpbox(ctx context.Context, clusterName string) error {
352353 }
353354
354355 command := "/root/download-private-key.sh"
355- output , err := e2eSSM .RunCommand (ctx , s .ssmClient , * jumpbox .InstanceId , command , s .logger )
356- if err != nil {
357- return fmt .Errorf ("jumpbox getting private key from ssm: %w" , err )
356+ var output e2eCommands.RemoteCommandOutput
357+
358+ // Retry private key download to handle IAM role credential propagation timing.
359+ // This is distinct from the SSM retry inside RunCommand: here it is not the SSM API call that fails,
360+ // but the command it executes on the jumpbox, because the IAM role credentials are not yet
361+ // available through IMDS even though the instance profile is already attached.
362+ for range 3 {
363+ output , err = e2eSSM .RunCommand (ctx , s .ssmClient , * jumpbox .InstanceId , command , s .logger )
364+ if err != nil {
365+ return fmt .Errorf ("jumpbox getting private key from ssm: %w" , err )
366+ }
367+ if output .Status == "Success" {
368+ break
369+ }
370+ s .logger .Info ("Private key download failed, retrying in 10 seconds" )
371+ time .Sleep (10 * time .Second )
358372 }
373+
359374 if output .Status != "Success" {
360- return fmt .Errorf ("jumpbox getting private key from ssm" )
375+ return fmt .Errorf ("jumpbox getting private key from ssm after retries " )
361376 }
362377
363378 return nil
0 commit comments