diff --git a/frontend/pkg/frontend/frontend.go b/frontend/pkg/frontend/frontend.go index 03b6d2535..24d2bbd48 100644 --- a/frontend/pkg/frontend/frontend.go +++ b/frontend/pkg/frontend/frontend.go @@ -16,6 +16,7 @@ import ( "strconv" "strings" "sync/atomic" + "time" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" "golang.org/x/sync/errgroup" @@ -408,6 +409,14 @@ func (f *Frontend) ArmResourceCreateOrUpdate(writer http.ResponseWriter, request doc = database.NewResourceDocument(resourceID) } + // CheckForProvisioningStateConflict does not log conflict errors + // but does log unexpected errors like database failures. + cloudError := f.CheckForProvisioningStateConflict(ctx, operationRequest, doc) + if cloudError != nil { + arm.WriteCloudError(writer, cloudError) + return + } + body, err := BodyFromContext(ctx) if err != nil { f.logger.Error(err.Error()) @@ -420,7 +429,7 @@ func (f *Frontend) ArmResourceCreateOrUpdate(writer http.ResponseWriter, request return } - cloudError := versionedRequestCluster.ValidateStatic(versionedCurrentCluster, updating, request.Method) + cloudError = versionedRequestCluster.ValidateStatic(versionedCurrentCluster, updating, request.Method) if cloudError != nil { f.logger.Error(cloudError.Error()) arm.WriteCloudError(writer, cloudError) @@ -563,6 +572,16 @@ func (f *Frontend) ArmResourceDelete(writer http.ResponseWriter, request *http.R return } + operationRequest := database.OperationRequestDelete + + // CheckForProvisioningStateConflict does not log conflict errors + // but does log unexpected errors like database failures. + cloudError := f.CheckForProvisioningStateConflict(ctx, operationRequest, doc) + if cloudError != nil { + arm.WriteCloudError(writer, cloudError) + return + } + err = f.clusterServiceClient.DeleteCSCluster(ctx, doc.InternalID) if err != nil { f.logger.Error(fmt.Sprintf("failed to delete cluster %s: %v", resourceID, err)) @@ -570,7 +589,27 @@ func (f *Frontend) ArmResourceDelete(writer http.ResponseWriter, request *http.R return } - operationDoc, err := f.StartOperation(writer, request, doc, database.OperationRequestDelete) + // Deletion is underway; mark any active operation as canceled. + if doc.ActiveOperationID != "" { + updated, err := f.dbClient.UpdateOperationDoc(ctx, doc.ActiveOperationID, func(updateDoc *database.OperationDocument) bool { + if updateDoc.Status != arm.ProvisioningStateCanceled { + updateDoc.LastTransitionTime = time.Now() + updateDoc.Status = arm.ProvisioningStateCanceled + return true + } + return false + }) + if err != nil { + f.logger.Error(err.Error()) + arm.WriteInternalServerError(writer) + return + } + if updated { + f.logger.Info(fmt.Sprintf("canceled operation '%s'", doc.ActiveOperationID)) + } + } + + operationDoc, err := f.StartOperation(writer, request, doc, operationRequest) if err != nil { f.logger.Error(fmt.Sprintf("failed to write operation document: %v", err)) arm.WriteInternalServerError(writer) diff --git a/frontend/pkg/frontend/helpers.go b/frontend/pkg/frontend/helpers.go index 033dce36d..2c988c8fd 100644 --- a/frontend/pkg/frontend/helpers.go +++ b/frontend/pkg/frontend/helpers.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "net/http" + "strings" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" @@ -17,6 +18,57 @@ import ( "github.com/Azure/ARO-HCP/internal/database" ) +// CheckForProvisioningStateConflict returns a "409 Conflict" error response if the +// provisioning state of the resource is non-terminal, or any of its parent resources +// within the same provider namespace are in a "Deleting" state. +func (f *Frontend) CheckForProvisioningStateConflict(ctx context.Context, operationRequest database.OperationRequest, doc *database.ResourceDocument) *arm.CloudError { + switch operationRequest { + case database.OperationRequestCreate: + // Resource must already exist for there to be a conflict. + case database.OperationRequestDelete: + if doc.ProvisioningState == arm.ProvisioningStateDeleting { + return arm.NewCloudError( + http.StatusConflict, + arm.CloudErrorCodeConflict, + doc.Key.String(), + "Resource is already deleting") + } + case database.OperationRequestUpdate: + if !doc.ProvisioningState.IsTerminal() { + return arm.NewCloudError( + http.StatusConflict, + arm.CloudErrorCodeConflict, + doc.Key.String(), + "Cannot update resource while resource is %s", + strings.ToLower(string(doc.ProvisioningState))) + } + } + + parent := doc.Key.GetParent() + + // ResourceType casing is preserved for parents in the same namespace. + for parent.ResourceType.Namespace == doc.Key.ResourceType.Namespace { + parentDoc, err := f.dbClient.GetResourceDoc(ctx, parent) + if err != nil { + f.logger.Error(err.Error()) + return arm.NewInternalServerError() + } + + if parentDoc.ProvisioningState == arm.ProvisioningStateDeleting { + return arm.NewCloudError( + http.StatusConflict, + arm.CloudErrorCodeConflict, + doc.Key.String(), + "Cannot %s resource while parent resource is deleting", + strings.ToLower(string(operationRequest))) + } + + parent = parent.GetParent() + } + + return nil +} + func (f *Frontend) MarshalResource(ctx context.Context, resourceID *arm.ResourceID, versionedInterface api.Version) ([]byte, *arm.CloudError) { var responseBody []byte diff --git a/frontend/pkg/frontend/helpers_test.go b/frontend/pkg/frontend/helpers_test.go new file mode 100644 index 000000000..1cda3e16d --- /dev/null +++ b/frontend/pkg/frontend/helpers_test.go @@ -0,0 +1,223 @@ +package frontend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "testing" + + "github.com/Azure/ARO-HCP/internal/api/arm" + "github.com/Azure/ARO-HCP/internal/database" +) + +func TestCheckForProvisioningStateConflict(t *testing.T) { + const clusterResourceID = "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/testGroup/providers/Microsoft.RedHatOpenShift/hcpOpenShiftClusters/testCluster" + const nodePoolResourceID = clusterResourceID + "/nodePools/testNodePool" + + tests := []struct { + name string + resourceID string + operationRequest database.OperationRequest + directConflicts map[arm.ProvisioningState]bool + parentConflicts map[arm.ProvisioningState]bool + }{ + { + name: "Create cluster", + resourceID: clusterResourceID, + operationRequest: database.OperationRequestCreate, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: false, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + }, + { + name: "Delete cluster", + resourceID: clusterResourceID, + operationRequest: database.OperationRequestDelete, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + }, + { + name: "Update cluster", + resourceID: clusterResourceID, + operationRequest: database.OperationRequestUpdate, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: true, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: true, + arm.ProvisioningStateUpdating: true, + }, + }, + { + name: "Create node pool", + resourceID: nodePoolResourceID, + operationRequest: database.OperationRequestCreate, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: false, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + parentConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + }, + { + name: "Delete node pool", + resourceID: nodePoolResourceID, + operationRequest: database.OperationRequestDelete, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + parentConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + }, + { + name: "Update node pool", + resourceID: nodePoolResourceID, + operationRequest: database.OperationRequestUpdate, + directConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: true, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: true, + arm.ProvisioningStateUpdating: true, + }, + parentConflicts: map[arm.ProvisioningState]bool{ + arm.ProvisioningStateSucceeded: false, + arm.ProvisioningStateFailed: false, + arm.ProvisioningStateCanceled: false, + arm.ProvisioningStateAccepted: false, + arm.ProvisioningStateDeleting: true, + arm.ProvisioningStateProvisioning: false, + arm.ProvisioningStateUpdating: false, + }, + }, + } + + for _, tt := range tests { + var name string + + resourceID, err := arm.ParseResourceID(tt.resourceID) + if err != nil { + t.Fatal(err) + } + + for directState, directConflict := range tt.directConflicts { + name = fmt.Sprintf("%s (directState=%s)", tt.name, directState) + t.Run(name, func(t *testing.T) { + ctx := context.Background() + + frontend := &Frontend{ + logger: slog.Default(), + dbClient: database.NewCache(), + } + + doc := database.NewResourceDocument(resourceID) + doc.ProvisioningState = directState + + parentResourceID := resourceID.GetParent() + if parentResourceID.ResourceType.Namespace == resourceID.ResourceType.Namespace { + parentDoc := database.NewResourceDocument(parentResourceID) + // Hold the provisioning state to something benign. + parentDoc.ProvisioningState = arm.ProvisioningStateSucceeded + _ = frontend.dbClient.CreateResourceDoc(ctx, parentDoc) + } + + cloudError := frontend.CheckForProvisioningStateConflict(ctx, tt.operationRequest, doc) + + if cloudError == nil { + if directConflict { + t.Errorf("Expected %d %s but got no error", http.StatusConflict, http.StatusText(http.StatusConflict)) + } + } else { + if !directConflict || cloudError.StatusCode != http.StatusConflict { + t.Errorf("Got unexpected error: %d %s", cloudError.StatusCode, http.StatusText(cloudError.StatusCode)) + } + } + }) + } + + for parentState, parentConflict := range tt.parentConflicts { + name = fmt.Sprintf("%s (parentState=%s)", tt.name, parentState) + t.Run(name, func(t *testing.T) { + ctx := context.Background() + + frontend := &Frontend{ + logger: slog.Default(), + dbClient: database.NewCache(), + } + + doc := database.NewResourceDocument(resourceID) + // Hold the provisioning state to something benign. + doc.ProvisioningState = arm.ProvisioningStateSucceeded + + parentResourceID := resourceID.GetParent() + if parentResourceID.ResourceType.Namespace == resourceID.ResourceType.Namespace { + parentDoc := database.NewResourceDocument(parentResourceID) + parentDoc.ProvisioningState = parentState + _ = frontend.dbClient.CreateResourceDoc(ctx, parentDoc) + } else { + t.Fatalf("Parent resource type namespace (%s) differs from child namespace (%s)", + parentResourceID.ResourceType.Namespace, + resourceID.ResourceType.Namespace) + } + + cloudError := frontend.CheckForProvisioningStateConflict(ctx, tt.operationRequest, doc) + + if cloudError == nil { + if parentConflict { + t.Errorf("Expected %d %s but got no error", http.StatusConflict, http.StatusText(http.StatusConflict)) + } + } else { + if !parentConflict || cloudError.StatusCode != http.StatusConflict { + t.Errorf("Got unexpected error: %d %s", cloudError.StatusCode, http.StatusText(cloudError.StatusCode)) + } + } + }) + } + } +} diff --git a/frontend/pkg/frontend/node_pool.go b/frontend/pkg/frontend/node_pool.go index 281a12ba4..7d2d3a6db 100644 --- a/frontend/pkg/frontend/node_pool.go +++ b/frontend/pkg/frontend/node_pool.go @@ -9,6 +9,7 @@ import ( "fmt" "maps" "net/http" + "time" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" @@ -123,6 +124,14 @@ func (f *Frontend) CreateOrUpdateNodePool(writer http.ResponseWriter, request *h doc = database.NewResourceDocument(resourceID) } + // CheckForProvisioningStateConflict does not log conflict errors + // but does log unexpected errors like database failures. + cloudError := f.CheckForProvisioningStateConflict(ctx, operationRequest, doc) + if cloudError != nil { + arm.WriteCloudError(writer, cloudError) + return + } + body, err := BodyFromContext(ctx) if err != nil { f.logger.Error(err.Error()) @@ -135,7 +144,7 @@ func (f *Frontend) CreateOrUpdateNodePool(writer http.ResponseWriter, request *h return } - cloudError := versionedRequestNodePool.ValidateStatic(versionedCurrentNodePool, updating, request.Method) + cloudError = versionedRequestNodePool.ValidateStatic(versionedCurrentNodePool, updating, request.Method) if cloudError != nil { f.logger.Error(cloudError.Error()) arm.WriteCloudError(writer, cloudError) @@ -281,6 +290,16 @@ func (f *Frontend) DeleteNodePool(writer http.ResponseWriter, request *http.Requ return } + operationRequest := database.OperationRequestDelete + + // CheckForProvisioningStateConflict does not log conflict errors + // but does log unexpected errors like database failures. + cloudError := f.CheckForProvisioningStateConflict(ctx, operationRequest, doc) + if cloudError != nil { + arm.WriteCloudError(writer, cloudError) + return + } + err = f.clusterServiceClient.DeleteCSNodePool(ctx, doc.InternalID) if err != nil { f.logger.Error(fmt.Sprintf("failed to delete node pool %s: %v", resourceID, err)) @@ -288,7 +307,27 @@ func (f *Frontend) DeleteNodePool(writer http.ResponseWriter, request *http.Requ return } - operationDoc, err := f.StartOperation(writer, request, doc, database.OperationRequestDelete) + // Deletion is underway; mark any active operation as canceled. + if doc.ActiveOperationID != "" { + updated, err := f.dbClient.UpdateOperationDoc(ctx, doc.ActiveOperationID, func(updateDoc *database.OperationDocument) bool { + if updateDoc.Status != arm.ProvisioningStateCanceled { + updateDoc.LastTransitionTime = time.Now() + updateDoc.Status = arm.ProvisioningStateCanceled + return true + } + return false + }) + if err != nil { + f.logger.Error(err.Error()) + arm.WriteInternalServerError(writer) + return + } + if updated { + f.logger.Info(fmt.Sprintf("canceled operation '%s'", doc.ActiveOperationID)) + } + } + + operationDoc, err := f.StartOperation(writer, request, doc, operationRequest) if err != nil { f.logger.Error(fmt.Sprintf("failed to write operation document: %v", err)) arm.WriteInternalServerError(writer) diff --git a/internal/database/document.go b/internal/database/document.go index f330ecf79..32c5570ed 100644 --- a/internal/database/document.go +++ b/internal/database/document.go @@ -97,7 +97,7 @@ type OperationDocument struct { func NewOperationDocument(request OperationRequest) *OperationDocument { now := time.Now().UTC() - return &OperationDocument{ + doc := &OperationDocument{ BaseDocument: newBaseDocument(), PartitionKey: operationsPartitionKey, Request: request, @@ -105,6 +105,14 @@ func NewOperationDocument(request OperationRequest) *OperationDocument { LastTransitionTime: now, Status: arm.ProvisioningStateAccepted, } + + // When deleting, set Status directly to ProvisioningStateDeleting + // so any further deletion requests are rejected with 409 Conflict. + if request == OperationRequestDelete { + doc.Status = arm.ProvisioningStateDeleting + } + + return doc } // ToStatus converts an OperationDocument to the ARM operation status format.