Skip to content
10 changes: 10 additions & 0 deletions helix-core/src/main/java/org/apache/helix/HelixAdmin.java
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,16 @@ void autoEnableMaintenanceMode(String clusterName, boolean enabled, String reaso
void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
Map<String, String> customFields);

/**
* Enable or disable maintenance mode via automation systems (like HelixACM). To be called by automation services.
* @param clusterName
* @param enabled
* @param reason
* @param customFields user-specified KV mappings to be stored in the ZNode
*/
void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are already 3 methods with similar names (enableMM, autoEnableMM, manuallyEnableMM), and now automationEnableMaintenanceMode.
I have a couple of questions here:

  1. Is there a reason why we are not overloading the existing method name autoEnableMM for the new method?
  2. IMO, there should only be one method, enableMM, with different triggering entities. Should we create an issue in Apache Helix as a TODO for this?

Map<String, String> customFields);

/**
* Check specific cluster is in maintenance mode or not
* @param clusterName the cluster name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -948,8 +948,14 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) {
public boolean isInMaintenanceMode(String clusterName) {
HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor);
PropertyKey.Builder keyBuilder = accessor.keyBuilder();
return accessor.getBaseDataAccessor()
.exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT);

MaintenanceSignal signal = accessor.getProperty(keyBuilder.maintenance());

if (signal == null) {
return false;
}

return signal.hasMaintenanceReasons();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be dangerous. What if the new version reads an old ZNode?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the code.

}

@Override
Expand Down Expand Up @@ -1182,6 +1188,14 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S
MaintenanceSignal.TriggeringEntity.USER);
}

/**
 * Enters or exits maintenance mode on behalf of an automation service.
 * Delegates to {@code processMaintenanceMode}, tagging the request with
 * {@code TriggeringEntity.AUTOMATION} and {@code AutoTriggerReason.NOT_APPLICABLE}.
 * @param clusterName name of the cluster to operate on
 * @param enabled value forwarded to {@code processMaintenanceMode} as its enable flag
 * @param reason reason forwarded unchanged to {@code processMaintenanceMode}
 * @param customFields custom key-value pairs forwarded unchanged to {@code processMaintenanceMode}
 */
@Override
public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
    Map<String, String> customFields) {
  // Name the two fixed arguments so the delegate call reads at a glance.
  final MaintenanceSignal.AutoTriggerReason noAutoTrigger =
      MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE;
  final MaintenanceSignal.TriggeringEntity triggeredBy =
      MaintenanceSignal.TriggeringEntity.AUTOMATION;

  processMaintenanceMode(clusterName, enabled, reason, noAutoTrigger, customFields, triggeredBy);
}

/**
* Helper method for enabling/disabling maintenance mode.
* @param clusterName
Expand All @@ -1201,23 +1215,54 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically"
: "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason);
final long currentTime = System.currentTimeMillis();

MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance());
if (!enabled) {
// Exit maintenance mode
accessor.removeProperty(keyBuilder.maintenance());
// Exit maintenance mode for this specific triggering entity

if (maintenanceSignal != null) {
// If a specific actor is exiting maintenance mode
boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity);

if (removed) {
// If there are still reasons for maintenance mode, update the ZNode
if (maintenanceSignal.getRecord().getListField("reasons") != null
&& !maintenanceSignal.getRecord().getListField("reasons").isEmpty()) {
if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) {
throw new HelixException("Failed to update maintenance signal!");
}
} else {
// If this was the last reason, remove the maintenance ZNode entirely
accessor.removeProperty(keyBuilder.maintenance());
}
}
}
} else {
// Enter maintenance mode
MaintenanceSignal maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID);
if (maintenanceSignal == null) {
// Create a new maintenance signal if it doesn't exist
maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID);
}

// First check for potential old client updates (simpleFields different than listField entries)
// This MUST happen before we modify any simpleFields to avoid overwriting important data needed for reconciliation
maintenanceSignal.reconcileMaintenanceData();

// Add the reason to the maintenance signal
if (reason != null) {
maintenanceSignal.setReason(reason);
}

maintenanceSignal.setTimestamp(currentTime);
maintenanceSignal.setTriggeringEntity(triggeringEntity);

switch (triggeringEntity) {
case CONTROLLER:
// autoEnable
maintenanceSignal.setAutoTriggerReason(internalReason);
break;
case USER:
case AUTOMATION:
case UNKNOWN:
// manuallyEnable
if (customFields != null && !customFields.isEmpty()) {
Expand All @@ -1231,8 +1276,18 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
}
break;
}
if (!accessor.createMaintenance(maintenanceSignal)) {
throw new HelixException("Failed to create maintenance signal!");

// Add this reason to the multi-actor maintenance reasons list
maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity);

if (accessor.getProperty(keyBuilder.maintenance()) == null) {
if (!accessor.createMaintenance(maintenanceSignal)) {
throw new HelixException("Failed to create maintenance signal!");
}
} else {
if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) {
throw new HelixException("Failed to update maintenance signal!");
}
}
}

Expand All @@ -1246,7 +1301,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
}
return new ControllerHistory(oldRecord)
.updateMaintenanceHistory(enabled, reason, currentTime, internalReason,
customFields, triggeringEntity);
customFields, triggeringEntity,
isInMaintenanceMode(clusterName));
} catch (IOException e) {
logger.error("Failed to update maintenance history! Exception: {}", e);
return oldRecord;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ private enum MaintenanceConfigKey {
MAINTENANCE_HISTORY,
OPERATION_TYPE,
DATE,
REASON

REASON,
IN_MAINTENANCE_AFTER_OPERATION
}

private enum ManagementModeConfigKey {
Expand Down Expand Up @@ -180,10 +180,11 @@ public ZNRecord updateManagementModeHistory(String controller, ClusterManagement
* @param internalReason
* @param customFields
* @param triggeringEntity
* @param inMaintenanceAfterOperation whether the cluster is still in maintenance mode after this operation
*/
public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long currentTime,
MaintenanceSignal.AutoTriggerReason internalReason, Map<String, String> customFields,
MaintenanceSignal.TriggeringEntity triggeringEntity) throws IOException {
MaintenanceSignal.TriggeringEntity triggeringEntity, boolean inMaintenanceAfterOperation) throws IOException {
DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:" + "mm:ss");
df.setTimeZone(TimeZone.getTimeZone("UTC"));
String dateTime = df.format(new Date(currentTime));
Expand All @@ -198,6 +199,8 @@ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long cu
String.valueOf(currentTime));
maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(),
triggeringEntity.name());
maintenanceEntry.put(MaintenanceConfigKey.IN_MAINTENANCE_AFTER_OPERATION.name(),
String.valueOf(inMaintenanceAfterOperation));
if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) {
// If auto-triggered
maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.AUTO_TRIGGER_REASON.name(),
Expand Down
Loading