diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index b6dc4a4f..2e785da5 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -16,6 +16,7 @@ import ( "github.com/xiaods/k8e/pkg/util" utilnet "k8s.io/apimachinery/pkg/util/net" "k8s.io/apiserver/pkg/authentication/authenticator" + "k8s.io/client-go/tools/record" utilsnet "k8s.io/utils/net" ) @@ -314,6 +315,7 @@ type ControlRuntime struct { ClientETCDKey string Core *core.Factory + Event record.EventRecorder EtcdConfig endpoint.ETCDConfig } diff --git a/pkg/nodepassword/nodepassword.go b/pkg/nodepassword/nodepassword.go index 810adad5..58902a61 100644 --- a/pkg/nodepassword/nodepassword.go +++ b/pkg/nodepassword/nodepassword.go @@ -15,13 +15,38 @@ import ( v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) var ( // Hasher provides the algorithm for generating and verifying hashes - Hasher = hash.NewSCrypt() + Hasher = hash.NewSCrypt() + ErrVerifyFailed = errVerifyFailed() ) +type passwordError struct { + node string + err error +} + +func (e *passwordError) Error() string { + return fmt.Sprintf("unable to verify password for node %s: %v", e.node, e.err) +} + +func (e *passwordError) Is(target error) bool { + switch target { + case ErrVerifyFailed: + return true + } + return false +} + +func (e *passwordError) Unwrap() error { + return e.err +} + +func errVerifyFailed() error { return &passwordError{} } + func getSecretName(nodeName string) string { return strings.ToLower(nodeName + ".node-password." + version.Program) } @@ -30,39 +55,34 @@ func verifyHash(secretClient coreclient.SecretClient, nodeName, pass string) err name := getSecretName(nodeName) secret, err := secretClient.Get(metav1.NamespaceSystem, name, metav1.GetOptions{}) if err != nil { - return err + return &passwordError{node: nodeName, err: err} } if hash, ok := secret.Data["hash"]; ok { if err := Hasher.VerifyHash(string(hash), pass); err != nil { - return errors.Wrapf(err, "unable to verify hash for node '%s'", nodeName) + return &passwordError{node: nodeName, err: err} } return nil } - return fmt.Errorf("unable to locate hash data for node secret '%s'", name) + return &passwordError{node: nodeName, err: errors.New("password hash not found in node secret")} } // Ensure will verify a node-password secret if it exists, otherwise it will create one func Ensure(secretClient coreclient.SecretClient, nodeName, pass string) error { - if err := verifyHash(secretClient, nodeName, pass); !apierrors.IsNotFound(err) { - return err - } - - hash, err := Hasher.CreateHash(pass) - if err != nil { - return errors.Wrapf(err, "unable to create hash for node '%s'", nodeName) - } - - immutable := true - _, err = secretClient.Create(&v1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: getSecretName(nodeName), - Namespace: metav1.NamespaceSystem, - }, - Immutable: &immutable, - Data: map[string][]byte{"hash": []byte(hash)}, - }) - if apierrors.IsAlreadyExists(err) { - return verifyHash(secretClient, nodeName, pass) + err := verifyHash(secretClient, nodeName, pass) + if apierrors.IsNotFound(err) { + var hash string + hash, err = Hasher.CreateHash(pass) + if err != nil { + return &passwordError{node: nodeName, err: err} + } + _, err = secretClient.Create(&v1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: getSecretName(nodeName), + Namespace: metav1.NamespaceSystem, + }, + Immutable: pointer.Bool(true), + Data: map[string][]byte{"hash": []byte(hash)}, + }) } return err } diff --git a/pkg/nodepassword/nodepassword_test.go b/pkg/nodepassword/nodepassword_test.go index b855a787..86a49cda 100644 --- a/pkg/nodepassword/nodepassword_test.go +++ b/pkg/nodepassword/nodepassword_test.go @@ -1,6 +1,7 @@ package nodepassword import ( + "errors" "fmt" "log" "os" @@ -89,6 +90,13 @@ func TestMigrateFileNodes(t *testing.T) { assertNotEqual(t, Ensure(secretClient, newNode, "wrong-password"), nil) } +func Test_PasswordError(t *testing.T) { + err := &passwordError{node: "test", err: fmt.Errorf("inner error")} + assertEqual(t, errors.Is(err, ErrVerifyFailed), true) + assertEqual(t, errors.Is(err, fmt.Errorf("different error")), false) + assertNotEqual(t, errors.Unwrap(err), nil) +} + // -------------------------- // mock secret client interface diff --git a/pkg/server/context.go b/pkg/server/context.go index 27ebe0df..656bf91a 100644 --- a/pkg/server/context.go +++ b/pkg/server/context.go @@ -16,9 +16,11 @@ import ( "github.com/xiaods/k8e/pkg/generated/controllers/k8e.cattle.io" "github.com/xiaods/k8e/pkg/util" "github.com/xiaods/k8e/pkg/version" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/record" ) type Context struct { @@ -29,6 +31,7 @@ type Context struct { Auth *rbac.Factory Core *core.Factory K8s kubernetes.Interface + Event record.EventRecorder } func (c *Context) Start(ctx context.Context) error { @@ -58,6 +61,7 @@ func NewContext(ctx context.Context, cfg string) (*Context, error) { Apps: apps.NewFactoryFromConfigOrDie(restConfig), Batch: batch.NewFactoryFromConfigOrDie(restConfig), Core: core.NewFactoryFromConfigOrDie(restConfig), + Event: util.BuildControllerEventRecorder(k8s, version.Program+"-supervisor", metav1.NamespaceAll), }, nil } diff --git a/pkg/server/router.go b/pkg/server/router.go index 6e7f4500..5bbebd33 100644 --- a/pkg/server/router.go +++ b/pkg/server/router.go @@ -26,10 +26,13 @@ import ( "github.com/xiaods/k8e/pkg/nodepassword" "github.com/xiaods/k8e/pkg/util" "github.com/xiaods/k8e/pkg/version" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/json" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/authentication/user" "k8s.io/apiserver/pkg/endpoints/handlers/responsewriters" "k8s.io/apiserver/pkg/endpoints/request" @@ -455,12 +458,23 @@ func passwordBootstrap(ctx context.Context, config *Config) nodePassBootstrapper } } + // verify that the node exists, if using Node Identity auth if err := verifyNode(ctx, nodeClient, node); err != nil { return "", http.StatusUnauthorized, err } + // verify that the node password secret matches, or create it if it does not if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil { - return "", http.StatusForbidden, err + // if the verification failed, reject the request + if errors.Is(err, nodepassword.ErrVerifyFailed) { + return "", http.StatusForbidden, err + } + // If verification failed due to an error creating the node password secret, allow + // the request, but retry verification until the outage is resolved. This behavior + // allows nodes to join the cluster during outages caused by validating webhooks + // blocking secret creation - if the outage requires new nodes to join in order to + // run the webhook pods, we must fail open here to resolve the outage. + return verifyRemotePassword(ctx, config, &mu, deferredNodes, node) } return node.Name, http.StatusOK, nil @@ -487,7 +501,7 @@ func verifyLocalPassword(ctx context.Context, config *Config, mu *sync.Mutex, de } if err := nodepassword.Hasher.VerifyHash(passHash, node.Password); err != nil { - return "", http.StatusForbidden, errors.Wrapf(err, "unable to verify local password for node '%s'", node.Name) + return "", http.StatusForbidden, errors.Wrap(err, "unable to verify local node password") } mu.Lock() @@ -496,7 +510,7 @@ func verifyLocalPassword(ctx context.Context, config *Config, mu *sync.Mutex, de if _, ok := deferredNodes[node.Name]; !ok { deferredNodes[node.Name] = true go ensureSecret(ctx, config, node) - logrus.Debugf("Password verified locally for node '%s'", node.Name) + logrus.Infof("Password verified locally for node %s", node.Name) } return node.Name, http.StatusOK, nil @@ -509,7 +523,7 @@ func verifyRemotePassword(ctx context.Context, config *Config, mu *sync.Mutex, d if _, ok := deferredNodes[node.Name]; !ok { deferredNodes[node.Name] = true go ensureSecret(ctx, config, node) - logrus.Debugf("Password verification deferred for node '%s'", node.Name) + logrus.Infof("Password verification deferred for node %s", node.Name) } return node.Name, http.StatusOK, nil @@ -526,19 +540,25 @@ func verifyNode(ctx context.Context, nodeClient coreclient.NodeClient, node *nod func ensureSecret(ctx context.Context, config *Config, node *nodeInfo) { runtime := config.ControlConfig.Runtime - for { - select { - case <-ctx.Done(): - return - case <-time.After(1 * time.Second): - if runtime.Core != nil { - logrus.Debugf("Runtime core has become available, ensuring password secret for node '%s'", node.Name) - secretClient := runtime.Core.Core().V1().Secret() - if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil { - logrus.Warnf("Error ensuring node password secret for pre-validated node '%s': %v", node.Name, err) - } - return + wait.PollImmediateUntilWithContext(ctx, time.Second*5, func(ctx context.Context) (bool, error) { + if runtime.Core != nil { + secretClient := runtime.Core.Core().V1().Secret() + // This is consistent with events attached to the node generated by the kubelet + // https://github.com/kubernetes/kubernetes/blob/612130dd2f4188db839ea5c2dea07a96b0ad8d1c/pkg/kubelet/kubelet.go#L479-L485 + nodeRef := &corev1.ObjectReference{ + Kind: "Node", + Name: node.Name, + UID: types.UID(node.Name), + Namespace: "", + } + if err := nodepassword.Ensure(secretClient, node.Name, node.Password); err != nil { + runtime.Event.Eventf(nodeRef, corev1.EventTypeWarning, "NodePasswordValidationFailed", "Deferred node password secret validation failed: %v", err) + // Return true to stop polling if the password verification failed; only retry on secret creation errors. + return errors.Is(err, nodepassword.ErrVerifyFailed), nil } + runtime.Event.Event(nodeRef, corev1.EventTypeNormal, "NodePasswordValidationComplete", "Deferred node password secret validation complete") + return true, nil } - } + return false, nil + }) } diff --git a/pkg/server/server.go b/pkg/server/server.go index d4e0948b..5adc24a6 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -120,6 +120,7 @@ func runControllers(ctx context.Context, config *Config) error { controlConfig.Runtime.NodePasswdFile); err != nil { logrus.Warn(errors.Wrap(err, "error migrating node-password file")) } + controlConfig.Runtime.Event = sc.Event controlConfig.Runtime.Core = sc.Core for name, cb := range controlConfig.Runtime.ClusterControllerStarts {