Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/health check #16

Merged
merged 10 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"args": [
"run",
"--config",
"config.yaml"
".vscode/config/local.config.yaml"
]
},
{
Expand Down
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
- [Configuration](#configuration)
- [Startup](#startup)
- [Runtime](#runtime)
- [Check: Health](#check-health)
- [API](#api)
- [Code of Conduct](#code-of-conduct)
- [Working Language](#working-language)
Expand Down Expand Up @@ -84,6 +85,26 @@ checks:
enabled: true
```

### Check: Health

Available configuration options:

- `checks.health.enabled` (boolean): Currently not used.
- `checks.health.targets` (list of strings): List of targets to send health probes to. Each target needs to be a valid URL. A target can be another `sparrow` instance; in that case use its health endpoint, e.g. `https://sparrow-dns.telekom.de/checks/health`. The remote `sparrow` instance needs the `healthEndpoint` enabled.
- `checks.health.healthEndpoint` (boolean): Needs to be activated when the `sparrow` should expose its own health endpoint. Mandatory if another `sparrow` instance wants to perform a health check against this instance.

Example configuration:

```YAML
checks:
health:
enabled: true
targets:
- "https://gitlab.devops.telekom.de"
healthEndpoint: false
```


### API

The `sparrow` exposes an API that provides access to the check results. Each check will register its own endpoint at `/v1/metrics/{check-name}`. The API definition will be exposed at `/openapi`.
Expand Down
7 changes: 7 additions & 0 deletions cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
// NewCmdRun creates a new run command
func NewCmdRun() *cobra.Command {
flagMapping := config.RunFlagsNameMapping{
ApiListeningAddress: "apiListeningAddress",
LoaderType: "loaderType",
LoaderInterval: "loaderInterval",
LoaderHttpUrl: "loaderHttpUrl",
Expand All @@ -29,6 +30,8 @@ func NewCmdRun() *cobra.Command {
Run: run(&flagMapping),
}

cmd.PersistentFlags().String(flagMapping.ApiListeningAddress, ":8080", "api: The address the server is listening on")

cmd.PersistentFlags().StringP(flagMapping.LoaderType, "l", "http", "defines the loader type that will load the checks configuration during the runtime")
cmd.PersistentFlags().Int(flagMapping.LoaderInterval, 300, "defines the interval the loader reloads the configuration in seconds")
cmd.PersistentFlags().String(flagMapping.LoaderHttpUrl, "", "http loader: The url where to get the remote configuration")
Expand All @@ -37,6 +40,8 @@ func NewCmdRun() *cobra.Command {
cmd.PersistentFlags().Int(flagMapping.LoaderHttpRetryCount, 3, "http loader: Amount of retries trying to load the configuration")
cmd.PersistentFlags().Int(flagMapping.LoaderHttpRetryDelay, 1, "http loader: The initial delay between retries in seconds")

viper.BindPFlag(flagMapping.ApiListeningAddress, cmd.PersistentFlags().Lookup(flagMapping.ApiListeningAddress))

viper.BindPFlag(flagMapping.LoaderType, cmd.PersistentFlags().Lookup(flagMapping.LoaderType))
viper.BindPFlag(flagMapping.LoaderInterval, cmd.PersistentFlags().Lookup(flagMapping.LoaderInterval))
viper.BindPFlag(flagMapping.LoaderHttpUrl, cmd.PersistentFlags().Lookup(flagMapping.LoaderHttpUrl))
Expand All @@ -56,6 +61,8 @@ func run(fm *config.RunFlagsNameMapping) func(cmd *cobra.Command, args []string)

cfg := config.NewConfig()

cfg.SetApiListeningAddress(viper.GetString(fm.ApiListeningAddress))

cfg.SetLoaderType(viper.GetString(fm.LoaderType))
cfg.SetLoaderInterval(viper.GetInt(fm.LoaderInterval))
cfg.SetLoaderHttpUrl(viper.GetString(fm.LoaderHttpUrl))
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/getkin/kin-openapi v0.120.0
github.com/go-chi/chi/v5 v5.0.10
github.com/jarcoal/httpmock v1.3.1
github.com/mitchellh/mapstructure v1.5.0
github.com/spf13/cobra v1.8.0
github.com/spf13/viper v1.17.0
github.com/stretchr/testify v1.8.4
Expand All @@ -24,7 +25,6 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/perimeterx/marshmallow v1.1.5 // indirect
Expand Down
5 changes: 3 additions & 2 deletions pkg/checks/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ import (
// The key is the name of the Check
// The name needs to map the configuration item key
var RegisteredChecks = map[string]func() Check{
"rtt": GetRoundtripCheck,
"rtt": GetRoundtripCheck,
"health": GetHealthCheck,
}

//go:generate moq -out checks_moq.go . Check
type Check interface {
// Run is called once per check interval
// this should error if there is a problem running the check
// Returns an error and a result. Returning a non nil error will cause a shutdown of the system
Run(ctx context.Context) (Result, error)
Run(ctx context.Context) error
// Startup is called once when the check is registered
// In the Run() method, the check should send results to the cResult channel
// this will cause sparrow to update its data store with the results
Expand Down
204 changes: 204 additions & 0 deletions pkg/checks/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
package checks

import (
"context"
"fmt"
"net/http"
"sync"
"time"

"github.com/caas-team/sparrow/internal/helper"
"github.com/caas-team/sparrow/internal/logger"
"github.com/caas-team/sparrow/pkg/api"
"github.com/getkin/kin-openapi/openapi3"
"github.com/mitchellh/mapstructure"
)

// Health is a check that measures the availability of an endpoint.
// It probes every configured target with an HTTP GET request and can
// optionally expose a health endpoint of its own.
type Health struct {
// route is the path used when the health handler is added to or removed from the router
route string
// config is the configuration applied through SetConfig
config HealthConfig
// c is the channel results are sent to; it is assigned in Startup and written to in Run
c chan<- Result
// done carries the stop signal that Shutdown sends and Run waits for
done chan bool
}

// HealthConfig is the configuration of the health check.
//
// NOTE(review): SetConfig fills this struct via mapstructure while the fields
// only carry `json` tags; presumably the keys are matched by field name —
// confirm against the mapstructure documentation.
type HealthConfig struct {
// Enabled is decoded from the configuration but not read anywhere in this file
Enabled bool `json:"enabled,omitempty"`
// Targets lists the URLs that Check probes with an HTTP GET request
Targets []string `json:"targets,omitempty"`
// HealthEndpoint decides whether RegisterHandler adds the health handler
HealthEndpoint bool `json:"healthEndpoint,omitempty"`
}

// healthData is the result payload of a single health check run.
// It is the data that will be stored in the database.
type healthData struct {
// Targets holds one entry per probed target
Targets []Target `json:"targets"`
}

// Target is the health result of a single probed target.
type Target struct {
// Target is the URL that was probed
Target string `json:"target"`
// Status is set by Check to either "healthy" or "unhealthy"
Status string `json:"status"`
}

// GetHealthCheck constructs a new, not yet configured health check.
//
// The done channel is created here because a nil channel can never be sent to
// or received from: without it the stop signal would block its sender forever
// and Run would never observe it. The buffer of one lets a single stop signal
// be deposited without waiting for Run to be ready to receive it.
func GetHealthCheck() Check {
	return &Health{
		route: "health",
		done:  make(chan bool, 1),
	}
}

// Run starts the health check loop.
//
// After each delay of one minute it probes all configured targets via Check
// and publishes the outcome on the result channel handed over in Startup.
//
// It returns ctx.Err() once the context is cancelled and nil when a stop
// signal arrives on the done channel. Both conditions are also honoured while
// the result is being published, so the loop cannot hang on a result channel
// that nobody is draining any more.
func (h *Health) Run(ctx context.Context) error {
	ctx, cancel := logger.NewContextWithLogger(ctx, "health")
	defer cancel()
	log := logger.FromContext(ctx)

	// Pause between two consecutive health check runs.
	const delay = time.Minute

	for {
		log.Info("Next health check will run after delay", "delay", delay.String())
		select {
		case <-ctx.Done():
			log.Debug("Context closed. Stopping health check")
			return ctx.Err()
		case <-h.done:
			log.Debug("Soft shut down")
			return nil
		case <-time.After(delay):
			log.Info("Start health check run")
			data := h.Check(ctx)

			log.Debug("Saving health check data to database")
			// Publishing may block until the consumer is ready; keep
			// reacting to cancellation and shutdown in the meantime.
			select {
			case h.c <- Result{Timestamp: time.Now(), Data: data}:
			case <-ctx.Done():
				log.Debug("Context closed. Stopping health check")
				return ctx.Err()
			case <-h.done:
				log.Debug("Soft shut down")
				return nil
			}

			log.Info("Successfully finished health check run")
		}
	}
}

// Startup is called once when the health check is registered.
// It stores cResult as the channel that Run publishes its results to
// and always returns nil.
func (h *Health) Startup(ctx context.Context, cResult chan<- Result) error {
h.c = cResult
return nil
}

// Shutdown is called once when the check is unregistered or sparrow shuts down.
//
// It hands a stop signal to Run via the done channel. If the signal cannot be
// delivered before ctx is cancelled, ctx.Err() is returned instead of blocking
// indefinitely.
//
// The health route is deliberately not touched here: it lives on the router
// passed to RegisterHandler and is removed through DeregisterHandler.
// Registering a handler on the global http.DefaultServeMux would not affect
// that router and panics when the same pattern is registered a second time.
func (h *Health) Shutdown(ctx context.Context) error {
	// Prefer delivering the signal, even if ctx is already cancelled.
	select {
	case h.done <- true:
		return nil
	default:
	}

	select {
	case h.done <- true:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// SetConfig decodes the given generic configuration into a HealthConfig and
// applies it to the health check.
// ErrInvalidConfig is returned when the configuration cannot be decoded;
// the previously applied configuration is kept in that case.
func (h *Health) SetConfig(ctx context.Context, config any) error {
	decoded := HealthConfig{}

	err := mapstructure.Decode(config, &decoded)
	if err != nil {
		return ErrInvalidConfig
	}

	h.config = decoded
	return nil
}

// Schema provides the OpenAPI schema of the data that is produced
// by the health check.
func (h *Health) Schema() (*openapi3.SchemaRef, error) {
	// The zero value is sufficient: it only serves as a sample of the data type.
	var sample healthData
	return OpenapiFromPerfData[healthData](sample)
}

// RegisterHandler adds the health endpoint to the given router.
// Nothing is registered unless the health endpoint is enabled in the config.
func (h *Health) RegisterHandler(ctx context.Context, router *api.RoutingTree) {
	if !h.config.HealthEndpoint {
		return
	}

	// The endpoint answers every request with a plain "ok" body.
	respondOk := func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte("ok"))
	}
	router.Add(http.MethodGet, h.route, respondOk)
}

// DeregisterHandler dynamically deletes the server handler.
// It removes the GET handler for the health route from the given router,
// regardless of whether the health endpoint is enabled in the config.
func (h *Health) DeregisterHandler(ctx context.Context, router *api.RoutingTree) {
router.Remove(http.MethodGet, h.route)
}

// Check performs a health check using a retry function
// to get the health status for all targets.
//
// Every target is probed in its own goroutine. A target is reported as
// "healthy" when the probe succeeds within the retries and as "unhealthy"
// otherwise. The returned entries are in the same order as the configured
// targets, independent of the order in which the probes finish. Without any
// configured targets an empty healthData is returned.
func (h *Health) Check(ctx context.Context) healthData {
	log := logger.FromContext(ctx)
	if len(h.config.Targets) == 0 {
		log.Debug("No targets defined")
		return healthData{}
	}
	log.Debug("Getting health status for each target in separate routine", "amount", len(h.config.Targets))

	// One pre-allocated slot per target: every goroutine writes only to its
	// own index, which keeps the result order stable and needs no locking.
	results := make([]Target, len(h.config.Targets))
	var wg sync.WaitGroup

	for i, target := range h.config.Targets {
		// Per-iteration copies for the closures below.
		i, target := i, target
		wg.Add(1)
		l := log.With("target", target)

		// NOTE(review): a delay of one microsecond between retries looks
		// unintentionally short — confirm the intended unit.
		getHealthRetry := helper.Retry(func(ctx context.Context) error {
			return getHealth(ctx, target)
		}, helper.RetryConfig{
			Count: 3,
			Delay: time.Microsecond,
		})

		go func() {
			defer wg.Done()

			status := "healthy"

			l.Debug("Starting retry routine to get health of target")
			if err := getHealthRetry(ctx); err != nil {
				status = "unhealthy"
			}

			l.Debug("Successfully got health status of target", "status", status)
			results[i] = Target{Target: target, Status: status}
		}()
	}

	log.Debug("Waiting for all routines to finish")
	wg.Wait()

	log.Info("Successfully got health status from all targets")
	return healthData{Targets: results}
}

// getHealth sends an HTTP GET request to the given url.
// It returns nil if the response status code is 200 and an error if the
// request could not be created, the request failed or any other status
// code was returned.
func getHealth(ctx context.Context, url string) error {
	log := logger.FromContext(ctx).With("url", url)

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		log.Error("Could not create http GET request", "error", err.Error())
		return err
	}

	// Every probe is limited to five seconds.
	httpClient := &http.Client{Timeout: 5 * time.Second}

	response, err := httpClient.Do(request)
	if err != nil {
		log.Error("Http get request failed", "error", err.Error())
		return err
	}
	defer response.Body.Close()

	if response.StatusCode == http.StatusOK {
		return nil
	}

	log.Error("Http get request failed", "status", response.Status)
	return fmt.Errorf("request failed, status is %s", response.Status)
}
Loading