Skip to content

Commit

Permalink
Soft-fail on node password verification if the secret cannot be created
Browse files Browse the repository at this point in the history
Allows nodes to join the cluster during a webhook outage. This also
enhances auditability by creating Kubernetes events for the deferred
verification.

Signed-off-by: Brad Davidson <[email protected]>

Signed-off-by: Deshi Xiao <[email protected]>
  • Loading branch information
xiaods committed Sep 1, 2023
1 parent 7126e40 commit 8ce4446
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 41 deletions.
2 changes: 2 additions & 0 deletions pkg/daemons/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/xiaods/k8e/pkg/util"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apiserver/pkg/authentication/authenticator"
"k8s.io/client-go/tools/record"
utilsnet "k8s.io/utils/net"
)

Expand Down Expand Up @@ -314,6 +315,7 @@ type ControlRuntime struct {
ClientETCDKey string

Core *core.Factory
Event record.EventRecorder
EtcdConfig endpoint.ETCDConfig
}

Expand Down
68 changes: 44 additions & 24 deletions pkg/nodepassword/nodepassword.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,38 @@ import (
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
)

var (
// Hasher provides the algorithm for generating and verifying hashes
Hasher = hash.NewSCrypt()
Hasher = hash.NewSCrypt()
ErrVerifyFailed = errVerifyFailed()
)

type passwordError struct {
node string
err error
}

func (e *passwordError) Error() string {
return fmt.Sprintf("unable to verify password for node %s: %v", e.node, e.err)
}

func (e *passwordError) Is(target error) bool {
switch target {
case ErrVerifyFailed:
return true
}
return false
}

func (e *passwordError) Unwrap() error {
return e.err
}

func errVerifyFailed() error { return &passwordError{} }

func getSecretName(nodeName string) string {
return strings.ToLower(nodeName + ".node-password." + version.Program)
}
Expand All @@ -30,39 +55,34 @@ func verifyHash(secretClient coreclient.SecretClient, nodeName, pass string) err
name := getSecretName(nodeName)
secret, err := secretClient.Get(metav1.NamespaceSystem, name, metav1.GetOptions{})
if err != nil {
return err
return &passwordError{node: nodeName, err: err}
}
if hash, ok := secret.Data["hash"]; ok {
if err := Hasher.VerifyHash(string(hash), pass); err != nil {
return errors.Wrapf(err, "unable to verify hash for node '%s'", nodeName)
return &passwordError{node: nodeName, err: err}
}
return nil
}
return fmt.Errorf("unable to locate hash data for node secret '%s'", name)
return &passwordError{node: nodeName, err: errors.New("password hash not found in node secret")}
}

// Ensure will verify a node-password secret if it exists, otherwise it will create one
func Ensure(secretClient coreclient.SecretClient, nodeName, pass string) error {
if err := verifyHash(secretClient, nodeName, pass); !apierrors.IsNotFound(err) {
return err
}

hash, err := Hasher.CreateHash(pass)
if err != nil {
return errors.Wrapf(err, "unable to create hash for node '%s'", nodeName)
}

immutable := true
_, err = secretClient.Create(&v1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: getSecretName(nodeName),
Namespace: metav1.NamespaceSystem,
},
Immutable: &immutable,
Data: map[string][]byte{"hash": []byte(hash)},
})
if apierrors.IsAlreadyExists(err) {
return verifyHash(secretClient, nodeName, pass)
err := verifyHash(secretClient, nodeName, pass)
if apierrors.IsNotFound(err) {
var hash string
hash, err = Hasher.CreateHash(pass)
if err != nil {
return &passwordError{node: nodeName, err: err}
}
_, err = secretClient.Create(&v1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: getSecretName(nodeName),
Namespace: metav1.NamespaceSystem,
},
Immutable: pointer.Bool(true),
Data: map[string][]byte{"hash": []byte(hash)},
})
}
return err
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/nodepassword/nodepassword_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package nodepassword

import (
"errors"
"fmt"
"log"
"os"
Expand Down Expand Up @@ -89,6 +90,13 @@ func TestMigrateFileNodes(t *testing.T) {
assertNotEqual(t, Ensure(secretClient, newNode, "wrong-password"), nil)
}

func Test_PasswordError(t *testing.T) {
err := &passwordError{node: "test", err: fmt.Errorf("inner error")}
assertEqual(t, errors.Is(err, ErrVerifyFailed), true)
assertEqual(t, errors.Is(err, fmt.Errorf("different error")), false)
assertNotEqual(t, errors.Unwrap(err), nil)
}

// --------------------------

// mock secret client interface
Expand Down
4 changes: 4 additions & 0 deletions pkg/server/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ import (
"github.com/xiaods/k8e/pkg/generated/controllers/k8e.cattle.io"
"github.com/xiaods/k8e/pkg/util"
"github.com/xiaods/k8e/pkg/version"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/record"
)

type Context struct {
Expand All @@ -29,6 +31,7 @@ type Context struct {
Auth *rbac.Factory
Core *core.Factory
K8s kubernetes.Interface
Event record.EventRecorder
}

func (c *Context) Start(ctx context.Context) error {
Expand Down Expand Up @@ -58,6 +61,7 @@ func NewContext(ctx context.Context, cfg string) (*Context, error) {
Apps: apps.NewFactoryFromConfigOrDie(restConfig),
Batch: batch.NewFactoryFromConfigOrDie(restConfig),
Core: core.NewFactoryFromConfigOrDie(restConfig),
Event: util.BuildControllerEventRecorder(k8s, version.Program+"-supervisor", metav1.NamespaceAll),
}, nil
}

Expand Down
54 changes: 37 additions & 17 deletions pkg/server/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,13 @@ import (
"github.com/xiaods/k8e/pkg/nodepassword"
"github.com/xiaods/k8e/pkg/util"
"github.com/xiaods/k8e/pkg/version"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/json"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/authentication/user"
"k8s.io/apiserver/pkg/endpoints/handlers/responsewriters"
"k8s.io/apiserver/pkg/endpoints/request"
Expand Down Expand Up @@ -455,12 +458,23 @@ func passwordBootstrap(ctx context.Context, config *Config) nodePassBootstrapper
}
}

// verify that the node exists, if using Node Identity auth
if err := verifyNode(ctx, nodeClient, node); err != nil {
return "", http.StatusUnauthorized, err
}

// verify that the node password secret matches, or create it if it does not
if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil {
return "", http.StatusForbidden, err
// if the verification failed, reject the request
if errors.Is(err, nodepassword.ErrVerifyFailed) {
return "", http.StatusForbidden, err
}
// If verification failed due to an error creating the node password secret, allow
// the request, but retry verification until the outage is resolved. This behavior
// allows nodes to join the cluster during outages caused by validating webhooks
// blocking secret creation - if the outage requires new nodes to join in order to
// run the webhook pods, we must fail open here to resolve the outage.
return verifyRemotePassword(ctx, config, &mu, deferredNodes, node)
}

return node.Name, http.StatusOK, nil
Expand All @@ -487,7 +501,7 @@ func verifyLocalPassword(ctx context.Context, config *Config, mu *sync.Mutex, de
}

if err := nodepassword.Hasher.VerifyHash(passHash, node.Password); err != nil {
return "", http.StatusForbidden, errors.Wrapf(err, "unable to verify local password for node '%s'", node.Name)
return "", http.StatusForbidden, errors.Wrap(err, "unable to verify local node password")
}

mu.Lock()
Expand All @@ -496,7 +510,7 @@ func verifyLocalPassword(ctx context.Context, config *Config, mu *sync.Mutex, de
if _, ok := deferredNodes[node.Name]; !ok {
deferredNodes[node.Name] = true
go ensureSecret(ctx, config, node)
logrus.Debugf("Password verified locally for node '%s'", node.Name)
logrus.Infof("Password verified locally for node %s", node.Name)
}

return node.Name, http.StatusOK, nil
Expand All @@ -509,7 +523,7 @@ func verifyRemotePassword(ctx context.Context, config *Config, mu *sync.Mutex, d
if _, ok := deferredNodes[node.Name]; !ok {
deferredNodes[node.Name] = true
go ensureSecret(ctx, config, node)
logrus.Debugf("Password verification deferred for node '%s'", node.Name)
logrus.Infof("Password verification deferred for node %s", node.Name)
}

return node.Name, http.StatusOK, nil
Expand All @@ -526,19 +540,25 @@ func verifyNode(ctx context.Context, nodeClient coreclient.NodeClient, node *nod

func ensureSecret(ctx context.Context, config *Config, node *nodeInfo) {
runtime := config.ControlConfig.Runtime
for {
select {
case <-ctx.Done():
return
case <-time.After(1 * time.Second):
if runtime.Core != nil {
logrus.Debugf("Runtime core has become available, ensuring password secret for node '%s'", node.Name)
secretClient := runtime.Core.Core().V1().Secret()
if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil {
logrus.Warnf("Error ensuring node password secret for pre-validated node '%s': %v", node.Name, err)
}
return
wait.PollImmediateUntilWithContext(ctx, time.Second*5, func(ctx context.Context) (bool, error) {
if runtime.Core != nil {
secretClient := runtime.Core.Core().V1().Secret()
// This is consistent with events attached to the node generated by the kubelet
// https://github.com/kubernetes/kubernetes/blob/612130dd2f4188db839ea5c2dea07a96b0ad8d1c/pkg/kubelet/kubelet.go#L479-L485
nodeRef := &corev1.ObjectReference{
Kind: "Node",
Name: node.Name,
UID: types.UID(node.Name),
Namespace: "",
}
if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil {
runtime.Event.Eventf(nodeRef, corev1.EventTypeWarning, "NodePasswordValidationFailed", "Deferred node password secret validation failed: %v", err)
// Return true to stop polling if the password verification failed; only retry on secret creation errors.
return errors.Is(err, nodepassword.ErrVerifyFailed), nil
}
runtime.Event.Event(nodeRef, corev1.EventTypeNormal, "NodePasswordValidationComplete", "Deferred node password secret validation complete")
return true, nil
}
}
return false, nil
})
}
1 change: 1 addition & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func runControllers(ctx context.Context, config *Config) error {
controlConfig.Runtime.NodePasswdFile); err != nil {
logrus.Warn(errors.Wrap(err, "error migrating node-password file"))
}
controlConfig.Runtime.Event = sc.Event
controlConfig.Runtime.Core = sc.Core

for name, cb := range controlConfig.Runtime.ClusterControllerStarts {
Expand Down

0 comments on commit 8ce4446

Please sign in to comment.