Skip to content

Commit c475ca4

Browse files
test/util: retry connection failures in dev
Our connection strategy from the test runner to the service cluster can be unreliable, as it's fairly naive - we just run `oc port-forward`. Let's retry connection failures to ameliorate issues stemming from this. Signed-off-by: Steve Kuznetsov <[email protected]>
1 parent 1a9fbc9 commit c475ca4

File tree

2 files changed

+105
-1
lines changed

2 files changed

+105
-1
lines changed

test/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ require (
290290
github.com/go-logr/logr v1.4.3
291291
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
292292
github.com/google/go-cmp v0.7.0
293-
golang.org/x/net v0.47.0 // indirect
293+
golang.org/x/net v0.47.0
294294
golang.org/x/sys v0.38.0 // indirect
295295
golang.org/x/text v0.31.0 // indirect
296296
gopkg.in/yaml.v3 v3.0.1 // indirect

test/util/framework/per_invocation_framework.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@
1515
package framework
1616

1717
import (
18+
"bytes"
1819
"context"
20+
"crypto/tls"
21+
"errors"
1922
"fmt"
2023
"hash/fnv"
24+
"io"
25+
"net"
2126
"net/http"
2227
"os"
2328
"path"
@@ -27,7 +32,12 @@ import (
2732
"sync"
2833
"time"
2934

35+
"github.com/onsi/ginkgo/v2"
3036
"github.com/onsi/ginkgo/v2/types"
37+
"golang.org/x/net/http2"
38+
39+
"k8s.io/apimachinery/pkg/util/sets"
40+
"k8s.io/apimachinery/pkg/util/wait"
3141

3242
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
3343
azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
@@ -52,6 +62,7 @@ type perBinaryInvocationTestContext struct {
5262
subscriptionID string
5363
azureCredentials azcore.TokenCredential
5464
identityPoolState *leasedIdentityPoolState
65+
defaultTransport *http.Transport
5566
}
5667

5768
type CleanupFunc func(ctx context.Context) error
@@ -83,6 +94,7 @@ func invocationContext() *perBinaryInvocationTestContext {
8394
isDevelopmentEnvironment: IsDevelopmentEnvironment(),
8495
skipCleanup: skipCleanup(),
8596
pooledIdentities: pooledIdentities(),
97+
defaultTransport: defaultHTTPTransport(),
8698
}
8799
})
88100
return invocationContextInstance
@@ -138,6 +150,15 @@ func (p *armSystemDataPolicy) Do(req *policy.Request) (*http.Response, error) {
138150
}
139151

140152
func (tc *perBinaryInvocationTestContext) getClientFactoryOptions() *azcorearm.ClientOptions {
153+
if tc.isDevelopmentEnvironment {
154+
return &azcorearm.ClientOptions{
155+
ClientOptions: azcore.ClientOptions{
156+
Transport: &proxiedConnectionTransporter{
157+
delegate: tc.defaultTransport,
158+
},
159+
},
160+
}
161+
}
141162
return nil
142163
}
143164

@@ -154,6 +175,9 @@ func (tc *perBinaryInvocationTestContext) getHCPClientFactoryOptions() *azcorear
154175
},
155176
},
156177
},
178+
Transport: &proxiedConnectionTransporter{
179+
delegate: tc.defaultTransport,
180+
},
157181
InsecureAllowCredentialWithHTTP: true,
158182
PerCallPolicies: []policy.Policy{
159183
&armSystemDataPolicy{},
@@ -164,6 +188,86 @@ func (tc *perBinaryInvocationTestContext) getHCPClientFactoryOptions() *azcorear
164188
return nil
165189
}
166190

191+
// default transport taken judiciously from azcore library to mimick their behavior when no transporter is provided
192+
func defaultHTTPTransport() *http.Transport {
193+
dialer := &net.Dialer{
194+
Timeout: 30 * time.Second,
195+
KeepAlive: 30 * time.Second,
196+
}
197+
defaultTransport := &http.Transport{
198+
Proxy: http.ProxyFromEnvironment,
199+
DialContext: dialer.DialContext,
200+
ForceAttemptHTTP2: true,
201+
MaxIdleConns: 100,
202+
MaxIdleConnsPerHost: 10,
203+
IdleConnTimeout: 90 * time.Second,
204+
TLSHandshakeTimeout: 10 * time.Second,
205+
ExpectContinueTimeout: 1 * time.Second,
206+
TLSClientConfig: &tls.Config{
207+
MinVersion: tls.VersionTLS12,
208+
Renegotiation: tls.RenegotiateFreelyAsClient,
209+
},
210+
}
211+
// TODO: evaluate removing this once https://github.com/golang/go/issues/59690 has been fixed
212+
if http2Transport, err := http2.ConfigureTransports(defaultTransport); err == nil {
213+
// if the connection has been idle for 10 seconds, send a ping frame for a health check
214+
http2Transport.ReadIdleTimeout = 10 * time.Second
215+
// if there's no response to the ping within the timeout, the connection will be closed
216+
http2Transport.PingTimeout = 5 * time.Second
217+
}
218+
return defaultTransport
219+
}
220+
221+
// proxiedConnectionTransporter retries connections done across the proxy path to a local RP,
222+
// in order to paper over transient errors in the proxied connection
223+
type proxiedConnectionTransporter struct {
224+
delegate *http.Transport
225+
}
226+
227+
func (t *proxiedConnectionTransporter) Do(req *http.Request) (*http.Response, error) {
228+
retryCtx, cancel := context.WithTimeoutCause(req.Context(), 2*time.Minute, errors.New("proxy transport retry timeout"))
229+
defer cancel()
230+
231+
var body []byte
232+
if req != nil && req.Body != nil {
233+
b, err := io.ReadAll(req.Body)
234+
if err != nil {
235+
return nil, err
236+
}
237+
if err := req.Body.Close(); err != nil {
238+
return nil, err
239+
}
240+
body = b
241+
}
242+
243+
var response *http.Response
244+
responseErr := wait.ExponentialBackoffWithContext(retryCtx, wait.Backoff{
245+
Duration: 800 * time.Millisecond,
246+
Factor: 2,
247+
Jitter: 0.1,
248+
Steps: 10,
249+
Cap: 20 * time.Second,
250+
}, func(ctx context.Context) (done bool, err error) {
251+
thisReq := req.Clone(ctx)
252+
thisReq.Body = io.NopCloser(bytes.NewReader(body))
253+
resp, err := t.delegate.RoundTrip(thisReq)
254+
response = resp
255+
if err != nil {
256+
if sets.NewString(
257+
"connect: connection refused",
258+
"connect: connection reset by peer",
259+
"proxy error from localhost",
260+
).Has(err.Error()) {
261+
ginkgo.GinkgoLogr.Info("Re-trying request.", "err", err)
262+
return false, nil
263+
}
264+
return true, err
265+
}
266+
return true, nil
267+
})
268+
return response, responseErr
269+
}
270+
167271
func (tc *perBinaryInvocationTestContext) getSubscriptionID(ctx context.Context, subscriptionClient *armsubscriptions.Client) (string, error) {
168272
tc.contextLock.RLock()
169273
if len(tc.subscriptionID) > 0 {

0 commit comments

Comments
 (0)