From ed05c99f05a65bd24a620aadbb3580cefdb56491 Mon Sep 17 00:00:00 2001 From: Scott Cotton Date: Thu, 19 Dec 2024 14:45:58 +0100 Subject: [PATCH] add check to access agent readiness endpoint This is intended as a minimal change to fix https://github.com/signadot/signadot/issues/5172 Tested with warp: it causes restarts and we are able to connect, whereas previously it hung. Still to consider: - do we need to make this visible in status? - do we need to make check period configurable? add check to agent-metrics endpoints in tp monitor --- internal/locald/rootmanager/tp_monitor.go | 39 +++++++++++++++++------ internal/locald/sandboxmanager/sdk.go | 2 +- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/internal/locald/rootmanager/tp_monitor.go b/internal/locald/rootmanager/tp_monitor.go index 4e5750e..8be52ae 100644 --- a/internal/locald/rootmanager/tp_monitor.go +++ b/internal/locald/rootmanager/tp_monitor.go @@ -2,6 +2,7 @@ package rootmanager import ( "fmt" + "net/http" "time" "log/slog" @@ -128,16 +129,34 @@ func (mon *tpMonitor) checkTunnelProxyAccess(ctx context.Context) bool { restartSvcs = true } } + if !restartSvcs { + // the grpc check for connecting to the tunnel proxy does not suffice + // because it has built-in retries and may re-use a connection while + // we are unable to establish a new connection. 
So, we also check + // the agent-metrics endpoint + cli := &http.Client{ + Transport: &http.Transport{}, + Timeout: 10 * time.Second, + } + resp, err := cli.Get("http://agent-metrics.signadot.svc:9090/metrics") + if err != nil { + mon.log.Error("unable to reach agent-metrics, restarting services", "error", err) + restartSvcs = true + } else { + resp.Body.Close() + } + } + if !restartSvcs { + mon.starting = false + return true + } - if restartSvcs { - // Restart localnet - mon.root.stopLocalnetService() - mon.root.runLocalnetService(ctx, mon.tpLocalAddr, mon.ipMap) + // Restart localnet + mon.root.stopLocalnetService() + mon.root.runLocalnetService(ctx, mon.tpLocalAddr, mon.ipMap) - // Restart etc hosts - mon.root.stopEtcHostsService() - mon.root.runEtcHostsService(ctx, mon.tpLocalAddr, mon.ipMap) - } - mon.starting = false - return true + // Restart etc hosts + mon.root.stopEtcHostsService() + mon.root.runEtcHostsService(ctx, mon.tpLocalAddr, mon.ipMap) + return false } diff --git a/internal/locald/sandboxmanager/sdk.go b/internal/locald/sandboxmanager/sdk.go index 2530386..ac37cfa 100644 --- a/internal/locald/sandboxmanager/sdk.go +++ b/internal/locald/sandboxmanager/sdk.go @@ -17,7 +17,7 @@ import ( var ( ErrSandboxManagerUnavailable = errors.New( - "sandboxmanager is not running, start it with \"signadot local connect\"") + `sandboxmanager is not running, start it with "signadot local connect"`) ) func GetStatus() (*sbmapi.StatusResponse, error) {