Skip to content

Commit ac0bd07

Browse files
authored
Merge pull request #655 from achen1210/retry-ssm-get-parameter
Fix IAM credential propagation race condition in jumpbox private key download
2 parents d93e1db + 224d2ee commit ac0bd07

File tree

1 file changed

+19
-4
lines changed

1 file changed

+19
-4
lines changed

test/e2e/cluster/stack.go

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/aws/eks-hybrid/test/e2e/addon"
2323
"github.com/aws/eks-hybrid/test/e2e/cfn"
2424
"github.com/aws/eks-hybrid/test/e2e/cleanup"
25+
e2eCommands "github.com/aws/eks-hybrid/test/e2e/commands"
2526
"github.com/aws/eks-hybrid/test/e2e/constants"
2627
e2eErrors "github.com/aws/eks-hybrid/test/e2e/errors"
2728
"github.com/aws/eks-hybrid/test/e2e/os"
@@ -352,12 +353,26 @@ func (s *stack) setupJumpbox(ctx context.Context, clusterName string) error {
352353
}
353354

354355
command := "/root/download-private-key.sh"
355-
output, err := e2eSSM.RunCommand(ctx, s.ssmClient, *jumpbox.InstanceId, command, s.logger)
356-
if err != nil {
357-
return fmt.Errorf("jumpbox getting private key from ssm: %w", err)
356+
var output e2eCommands.RemoteCommandOutput
357+
358+
// Retry private key download to handle IAM role credential propagation timing.
359+
// This is different from the retry for SSM call inside RunCommand since it is not the SSM API call that fails,
360+
// but the command execution on the jumpbox done by that SSM API call that fails due to IAM role credentials
361+
// not being available through IMDS yet, even though the instance profile is attached.
362+
for range 3 {
363+
output, err = e2eSSM.RunCommand(ctx, s.ssmClient, *jumpbox.InstanceId, command, s.logger)
364+
if err != nil {
365+
return fmt.Errorf("jumpbox getting private key from ssm: %w", err)
366+
}
367+
if output.Status == "Success" {
368+
break
369+
}
370+
s.logger.Info("Private key download failed, retrying in 10 seconds")
371+
time.Sleep(10 * time.Second)
358372
}
373+
359374
if output.Status != "Success" {
360-
return fmt.Errorf("jumpbox getting private key from ssm")
375+
return fmt.Errorf("jumpbox getting private key from ssm after retries")
361376
}
362377

363378
return nil

0 commit comments

Comments
 (0)