Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,12 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
servicingData.ActualFirmwareSettings = hfs.Status.Settings
servicingData.TargetFirmwareSettings = hfs.Spec.Settings
}

// Track if HFS spec has actual settings - check independently since getHostFirmwareSettings
// returns nil when no changes even if object exists
hfsExists := &metal3api.HostFirmwareSettings{}
hfsExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfsExists)
servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0)
}

if liveFirmwareUpdatesAllowed {
Expand All @@ -1451,16 +1457,52 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(prov provisioner.Provisioner
servicingData.TargetFirmwareComponents = hfc.Spec.Updates
}
}

// Track if HFC spec has actual updates - check independently since getHostFirmwareComponents
// returns nil when no changes even if object exists
hfcExists := &metal3api.HostFirmwareComponents{}
hfcExistsErr := r.Get(info.ctx, info.request.NamespacedName, hfcExists)
servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0)
}

hasChanges := fwDirty || hfsDirty || hfcDirty

info.log.Info("servicing data flags:",
"hasSettingsSpec", servicingData.HasFirmwareSettingsSpec,
"hasComponentsSpec", servicingData.HasFirmwareComponentsSpec,
"fwDirty", fwDirty, "hfsDirty", hfsDirty, "hfcDirty", hfcDirty,
"hasChanges", hasChanges, "note", "hasSpec flags check if spec.settings/spec.updates have content")

// Even if settings are clean, we need to check the result of the current servicing.
if !hasChanges && info.host.Status.OperationalStatus != metal3api.OperationalStatusServicing && info.host.Status.ErrorType != metal3api.ServicingError {
// If nothing is going on, return control to the power management.
return nil
}

// If we're in a servicing error state and updates were removed from spec,
// let the provisioner handle the transition back to active
if info.host.Status.ErrorType == metal3api.ServicingError && !hasChanges {
info.log.Info("updates removed from spec while in servicing error state, attempting recovery")
provResult, _, err := prov.Service(servicingData, false, false)
if err != nil {
return actionError{fmt.Errorf("failed to recover from servicing error: %w", err)}
}
if provResult.ErrorMessage != "" {
info.log.Info("failed to recover from servicing error", "error", provResult.ErrorMessage)
return actionError{fmt.Errorf("failed to recover from servicing error: %s", provResult.ErrorMessage)}
}
if provResult.Dirty {
info.log.Info("abort operation in progress, checking back later")
return actionContinue{provResult.RequeueAfter}
}
// If abort completed and no error, we've successfully recovered
info.log.Info("successfully recovered from servicing error")
info.host.Status.ErrorType = ""
info.host.Status.ErrorMessage = ""
info.host.Status.OperationalStatus = metal3api.OperationalStatusOK
return actionComplete{}
}

// FIXME(janders/dtantsur): this implementation may lead to a scenario where if we never actually
// succeed before leaving this state (e.g. by deprovisioning) we lose the signal that the
// update didn't actually happen. This is deemed an acceptable risk for the moment since it is only
Expand Down
46 changes: 45 additions & 1 deletion pkg/provisioner/ironic/servicing.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@ func (p *ironicProvisioner) startServicing(bmcAccess bmc.AccessDetails, ironicNo
return
}

func (p *ironicProvisioner) abortServicing(ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) {
// Clear maintenance flag first if it's set
if ironicNode.Maintenance {
p.log.Info("clearing maintenance flag before aborting servicing")
result, err = p.setMaintenanceFlag(ironicNode, false, "")
return result, started, err
}

// Set started to let the controller know about the change
p.log.Info("aborting servicing due to removal of spec.updates/spec.settings")
started, result, err = p.tryChangeNodeProvisionState(
ironicNode,
nodes.ProvisionStateOpts{Target: nodes.TargetAbort},
)
p.log.Info("abort result", "started", started, "result", result, "error", err)
return result, started, err
}

func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared, restartOnFailure bool) (result provisioner.Result, started bool, err error) {
if !p.availableFeatures.HasServicing() {
result, err = operationFailed(fmt.Sprintf("servicing not supported: requires API version 1.87, available is 1.%d", p.availableFeatures.MaxVersion))
Expand All @@ -94,9 +112,29 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared,
return result, started, err
}

// Check if there are any pending updates
serviceSteps, err := p.buildServiceSteps(bmcAccess, data)
if err != nil {
result, err = operationFailed(err.Error())
return result, started, err
}

p.log.Info("servicing state check:",
"hasSettingsSpec", data.HasFirmwareSettingsSpec,
"hasComponentsSpec", data.HasFirmwareComponentsSpec,
"serviceStepsCount", len(serviceSteps),
"nodeState", ironicNode.ProvisionState)

switch nodes.ProvisionState(ironicNode.ProvisionState) {
case nodes.ServiceFail:
// When servicing failed, we need to clean host provisioning settings.
// When servicing failed and user actually removed the specs (not just no updates calculated),
// we need to abort the servicing operation to back out
if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec {
p.log.Info("aborting servicing because spec.updates/spec.settings was removed")
return p.abortServicing(ironicNode)
}

// When servicing failed and there are pending updates, we need to clean host provisioning settings
// If restartOnFailure is false, it means the settings aren't cleared.
if !restartOnFailure {
result, err = operationFailed(ironicNode.LastError)
Expand Down Expand Up @@ -125,6 +163,12 @@ func (p *ironicProvisioner) Service(data provisioner.ServicingData, unprepared,
p.log.Info("servicing finished on the host")
result, err = operationComplete()
case nodes.Servicing, nodes.ServiceWait:
// If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately
if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec {
p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed")
return p.abortServicing(ironicNode)
}

p.log.Info("waiting for host to become active",
"state", ironicNode.ProvisionState,
"serviceStep", ironicNode.ServiceStep)
Expand Down
38 changes: 37 additions & 1 deletion pkg/provisioner/ironic/servicing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,40 @@ func TestService(t *testing.T) {
expectedDirty: false,
expectedError: true,
},
// Service abort tests - Complete spec removal triggers abort
{
name: "serviceFail_abort_complete_spec_removal",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.ServiceFail),
UUID: nodeUUID,
}),
skipConfig: true, // Complete spec removal triggers abort
expectedStarted: true, // Abort triggers state change
expectedRequestAfter: 10, // Abort returns RequeueAfter: 10s
expectedDirty: true, // Abort returns Dirty: true
},
{
name: "servicing_abort_complete_spec_removal",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.Servicing),
UUID: nodeUUID,
}),
skipConfig: true, // Complete spec removal triggers abort
expectedStarted: true, // Abort triggers state change
expectedRequestAfter: 10, // Abort returns RequeueAfter: 10s
expectedDirty: true, // Abort returns Dirty: true
},
{
name: "serviceWait_abort_complete_spec_removal",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.ServiceWait),
UUID: nodeUUID,
}),
skipConfig: true, // Complete spec removal triggers abort
expectedStarted: true, // Abort triggers state change
expectedRequestAfter: 10, // Abort returns RequeueAfter: 10s
expectedDirty: true, // Abort returns Dirty: true
},
}

for _, tc := range cases {
Expand All @@ -164,13 +198,15 @@ func TestService(t *testing.T) {
host.Status.Provisioning.ID = nodeUUID
prepData := provisioner.ServicingData{}
if !tc.skipConfig {
host.Spec.BMC.Address = "raid-test://test.bmc/"
host.Spec.BMC.Address = "bios-test://test.bmc/"
prepData.ActualFirmwareSettings = metal3api.SettingsMap{
"Answer": "unknown",
}
prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{
"Answer": intstr.FromInt(42),
}
// Set flags to prevent abort logic from triggering
prepData.HasFirmwareSettingsSpec = true
}

publisher := func(reason, message string) {}
Expand Down
3 changes: 3 additions & 0 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ type ServicingData struct {
TargetFirmwareSettings metal3api.DesiredSettingsMap
ActualFirmwareSettings metal3api.SettingsMap
TargetFirmwareComponents []metal3api.FirmwareUpdate
// Flags to track if specs exist (vs. just no updates calculated)
HasFirmwareSettingsSpec bool
HasFirmwareComponentsSpec bool
}

type ProvisionData struct {
Expand Down