Skip to content

Commit 070dd54

Browse files
committed
fix: synchronize tf init to avoid plugin race condition
1 parent 7b3ad90 commit 070dd54

File tree

6 files changed

+138
-26
lines changed

6 files changed

+138
-26
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ GOLANGCI_LINT ?= $(LOCALBIN)/golangci-lint
115115
## Tool Versions
116116
KUSTOMIZE_VERSION ?= v3.8.7
117117
CONTROLLER_TOOLS_VERSION ?= v0.16.4
118-
GOLANGCI_LINT_VERSION ?= v1.60.3
118+
GOLANGCI_LINT_VERSION ?= v2.5.0
119119

120120
KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh"
121121
.PHONY: kustomize
@@ -135,7 +135,7 @@ $(ENVTEST): $(LOCALBIN)
135135
.PHONY: golangci-lint
136136
golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary
137137
$(GOLANGCI_LINT): $(LOCALBIN)
138-
test -s $(LOCALBIN)/golangci-lint || GOBIN=$(LOCALBIN) go install github.com/golangci/golangci-lint/cmd/golangci-lint@$(GOLANGCI_LINT_VERSION)
138+
test -s $(LOCALBIN)/golangci-lint || GOBIN=$(LOCALBIN) go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@$(GOLANGCI_LINT_VERSION)
139139

140140
# Development environment targets
141141
setup-dev-env: create-kind init-tilt

controllers/controlplane/kopscontrolplane_controller.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,9 @@ func (r *KopsControlPlaneReconciler) PrepareCustomCloudResources(ctx context.Con
184184
if err != nil {
185185
return err
186186
}
187-
defer karpenterResourcesContent.Close()
187+
defer func() {
188+
_ = karpenterResourcesContent.Close()
189+
}()
188190

189191
// This is needed because the apply will fail if the file is empty
190192
placeholder := corev1.ConfigMap{

pkg/utils/kops_utils.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,9 @@ func GetUserDataFromTerraformFile(clusterName, igName, terraformOutputDir string
367367
if err != nil {
368368
return "", err
369369
}
370-
defer userDataFile.Close()
370+
defer func() {
371+
_ = userDataFile.Close()
372+
}()
371373
userData, err := io.ReadAll(userDataFile)
372374
if err != nil {
373375
return "", err

pkg/utils/kops_utils_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,10 @@ func TestParseSpotinstFeatureflags(t *testing.T) {
125125

126126
for _, tc := range testCases {
127127
t.Run(tc.description, func(t *testing.T) {
128-
os.Unsetenv("SPOTINST_TOKEN")
129-
os.Unsetenv("SPOTINST_ACCOUNT")
128+
_ = os.Unsetenv("SPOTINST_TOKEN")
129+
_ = os.Unsetenv("SPOTINST_ACCOUNT")
130130
for key, value := range tc.environmentVariables {
131-
os.Setenv(key, value)
131+
_ = os.Setenv(key, value)
132132
}
133133

134134
err := ParseSpotinstFeatureflags(tc.input)

pkg/utils/terraform_utils.go

Lines changed: 124 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import (
88
"path/filepath"
99
"regexp"
1010
"strings"
11+
"sync"
12+
"syscall"
1113
"text/template"
14+
"time"
1215

1316
"github.com/aws/aws-sdk-go-v2/aws"
1417
"github.com/hashicorp/terraform-exec/tfexec"
@@ -25,6 +28,40 @@ type Template struct {
2528
//go:embed templates/*.tpl
2629
var templates embed.FS
2730

31+
var tfPluginMux sync.Mutex
32+
33+
func lockPluginCache(pluginCacheDir string) (*os.File, error) {
34+
if err := os.MkdirAll(pluginCacheDir, 0755); err != nil {
35+
return nil, err
36+
}
37+
38+
lockPath := filepath.Join(pluginCacheDir, ".terraform-plugin.lock")
39+
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644)
40+
if err != nil {
41+
return nil, fmt.Errorf("failed to open lock file: %w", err)
42+
}
43+
44+
if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_EX); err != nil {
45+
_ = lockFile.Close()
46+
return nil, fmt.Errorf("failed to acquire exclusive lock: %w", err)
47+
}
48+
49+
return lockFile, nil
50+
}
51+
52+
func unlockPluginCache(lockFile *os.File) error {
53+
if lockFile == nil {
54+
return nil
55+
}
56+
57+
if err := syscall.Flock(int(lockFile.Fd()), syscall.LOCK_UN); err != nil {
58+
_ = lockFile.Close()
59+
return fmt.Errorf("failed to unlock plugin cache: %w", err)
60+
}
61+
62+
return lockFile.Close()
63+
}
64+
2865
// CreateTerraformFilesFromTemplate populates a Terraform template and creates files in the state
2966
func CreateTerraformFilesFromTemplate(terraformTemplateFilePath string, TerraformOutputFileName string, terraformOutputDir string, templateData any) error {
3067
template := Template{
@@ -43,7 +80,9 @@ func CreateAdditionalTerraformFiles(tfFiles ...Template) error {
4380
if err != nil {
4481
return err
4582
}
46-
defer file.Close()
83+
defer func() {
84+
_ = file.Close()
85+
}()
4786

4887
t := template.New(filepath.Base(tfFile.TemplateFilename)).Funcs(template.FuncMap{
4988
"stringReplace": strings.Replace,
@@ -96,23 +135,52 @@ func initTerraform(ctx context.Context, workingDir, terraformExecPath string, cr
96135
return nil, err
97136
}
98137

138+
pluginCacheDir := fmt.Sprintf("%s/plugin-cache", filepath.Dir(terraformExecPath))
139+
99140
env := map[string]string{
100141
"AWS_ACCESS_KEY_ID": credentials.AccessKeyID,
101142
"AWS_SECRET_ACCESS_KEY": credentials.SecretAccessKey,
102143
"SPOTINST_TOKEN": os.Getenv("SPOTINST_TOKEN"),
103144
"SPOTINST_ACCOUNT": os.Getenv("SPOTINST_ACCOUNT"),
104-
"TF_PLUGIN_CACHE_DIR": fmt.Sprintf("%s/plugin-cache", filepath.Dir(terraformExecPath)),
145+
"TF_PLUGIN_CACHE_DIR": pluginCacheDir,
105146
}
106147

107-
// this overrides all ENVVARs that are passed to Terraform
108148
err = tf.SetEnv(env)
109149
if err != nil {
110150
return nil, err
111151
}
112152

113-
err = tf.Init(ctx, tfexec.Upgrade(true))
153+
tfPluginMux.Lock()
154+
defer tfPluginMux.Unlock()
155+
156+
lockFile, err := lockPluginCache(pluginCacheDir)
114157
if err != nil {
115-
return nil, err
158+
return nil, fmt.Errorf("failed to acquire plugin cache lock: %w", err)
159+
}
160+
defer func() {
161+
time.Sleep(500 * time.Millisecond)
162+
_ = unlockPluginCache(lockFile)
163+
}()
164+
165+
var initErr error
166+
maxRetries := 3
167+
for i := 0; i < maxRetries; i++ {
168+
initErr = tf.Init(ctx, tfexec.Upgrade(true))
169+
if initErr == nil {
170+
break
171+
}
172+
173+
if strings.Contains(initErr.Error(), "text file busy") && i < maxRetries-1 {
174+
waitTime := time.Duration(i+1) * 2 * time.Second
175+
time.Sleep(waitTime)
176+
continue
177+
}
178+
179+
break
180+
}
181+
182+
if initErr != nil {
183+
return nil, initErr
116184
}
117185

118186
return tf, nil
@@ -127,12 +195,24 @@ func ApplyTerraform(ctx context.Context, workingDir, terraformExecPath string, c
127195
return err
128196
}
129197

130-
err = tf.Apply(ctx)
131-
if err != nil {
132-
return err
198+
var applyErr error
199+
maxRetries := 5
200+
for i := 0; i < maxRetries; i++ {
201+
applyErr = tf.Apply(ctx)
202+
if applyErr == nil {
203+
return nil
204+
}
205+
206+
if strings.Contains(applyErr.Error(), "text file busy") && i < maxRetries-1 {
207+
waitTime := time.Duration(i+1) * time.Second
208+
time.Sleep(waitTime)
209+
continue
210+
}
211+
212+
break
133213
}
134214

135-
return nil
215+
return applyErr
136216
}
137217

138218
// PlanTerraform just plans the already created terraform files, writing the plan to plan.out in the working directory
@@ -150,12 +230,24 @@ func PlanTerraform(ctx context.Context, workingDir, terraformExecPath string, cr
150230
return err
151231
}
152232

153-
_, err = tf.Plan(ctx, tfexec.Out(workingDir+"/plan.out"))
154-
if err != nil {
155-
return err
233+
var planErr error
234+
maxRetries := 5
235+
for i := 0; i < maxRetries; i++ {
236+
_, planErr = tf.Plan(ctx, tfexec.Out(workingDir+"/plan.out"))
237+
if planErr == nil {
238+
return nil
239+
}
240+
241+
if strings.Contains(planErr.Error(), "text file busy") && i < maxRetries-1 {
242+
waitTime := time.Duration(i+1) * time.Second
243+
time.Sleep(waitTime)
244+
continue
245+
}
246+
247+
break
156248
}
157249

158-
return nil
250+
return planErr
159251
}
160252

161253
func DestroyTerraform(ctx context.Context, workingDir, terraformExecPath string, credentials aws.Credentials) error {
@@ -164,20 +256,34 @@ func DestroyTerraform(ctx context.Context, workingDir, terraformExecPath string,
164256
return err
165257
}
166258

167-
err = tf.Destroy(ctx)
168-
if err != nil {
169-
return err
259+
var destroyErr error
260+
maxRetries := 5
261+
for i := 0; i < maxRetries; i++ {
262+
destroyErr = tf.Destroy(ctx)
263+
if destroyErr == nil {
264+
return nil
265+
}
266+
267+
if strings.Contains(destroyErr.Error(), "text file busy") && i < maxRetries-1 {
268+
waitTime := time.Duration(i+1) * time.Second
269+
time.Sleep(waitTime)
270+
continue
271+
}
272+
273+
break
170274
}
171275

172-
return nil
276+
return destroyErr
173277
}
174278

175279
func CleanupTerraformDirectory(dir string) error {
176280
d, err := os.Open(dir)
177281
if err != nil {
178282
return err
179283
}
180-
defer d.Close()
284+
defer func() {
285+
_ = d.Close()
286+
}()
181287
names, err := d.Readdirnames(-1)
182288
if err != nil {
183289
return err

pkg/utils/terraform_utils_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,9 @@ func TestModifyTerraformProviderVersion(t *testing.T) {
222222
t.Run(tc.description, func(t *testing.T) {
223223
tmpDir, err := os.MkdirTemp("", "test_terraform_provider")
224224
g.Expect(err).NotTo(HaveOccurred())
225-
defer os.RemoveAll(tmpDir)
225+
defer func() {
226+
_ = os.RemoveAll(tmpDir)
227+
}()
226228

227229
kubernetesFile := fmt.Sprintf("%s/kubernetes.tf", tmpDir)
228230

0 commit comments

Comments
 (0)