diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java
index 5c2ef10f20..f0cad61bac 100644
--- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java
+++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java
@@ -422,6 +422,16 @@ void autoEnableMaintenanceMode(String clusterName, boolean enabled, String reaso
   void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
       Map<String, String> customFields);

+  /**
+   * Enable maintenance mode via automation systems (like HelixACM). To be called by automation services.
+   * @param clusterName the cluster name
+   * @param enabled true to enter maintenance mode, false to exit it
+   * @param reason reason recorded for the operation (may be null)
+   * @param customFields user-specified KV mappings to be stored in the ZNode
+   */
+  void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
+      Map<String, String> customFields);
+
   /**
    * Check specific cluster is in maintenance mode or not
    * @param clusterName the cluster name
diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java
index 0d72ac4aaa..91468ca847 100644
--- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java
+++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java
@@ -948,8 +948,11 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) {
   public boolean isInMaintenanceMode(String clusterName) {
     HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor);
     PropertyKey.Builder keyBuilder = accessor.keyBuilder();
+    // The cluster is in maintenance mode for as long as the maintenance signal ZNode exists,
+    // whatever it contains: old clients may have written it without a reason or without the
+    // "reasons" listField, so the answer must not depend on the record's content.
     return accessor.getBaseDataAccessor()
         .exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT);
   }

   @Override
@@ -1182,6 +1192,14 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S
         MaintenanceSignal.TriggeringEntity.USER);
   }

+  @Override
+  public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
+      Map<String, String> customFields) {
+    processMaintenanceMode(clusterName, enabled, reason,
+        MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields,
+        MaintenanceSignal.TriggeringEntity.AUTOMATION);
+  }
+
   /**
    * Helper method for enabling/disabling maintenance mode.
    * @param clusterName
@@ -1201,23 +1219,74 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
         triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically"
             : "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason);
     final long currentTime = System.currentTimeMillis();
+
+    MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance());
     if (!enabled) {
-      // Exit maintenance mode
-      accessor.removeProperty(keyBuilder.maintenance());
+      // Exit maintenance mode for this specific triggering entity
+
+      // Early return if no maintenance signal exists
+      if (maintenanceSignal == null) {
+        if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) {
+          logger.info("USER administrative override: no maintenance signal exists, nothing to remove");
+        } else {
+          // CONTROLLER/AUTOMATION: strict no-op
+          logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity);
+        }
+        return;
+      }
+
+      // If a specific actor is exiting maintenance mode
+      boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity);
+
+      if (removed) {
+        // If there are still reasons for maintenance mode, update the ZNode
+
+        if
(maintenanceSignal.hasMaintenanceReasons()) { + if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { + throw new HelixException("Failed to update maintenance signal!"); + } + } else { + // If this was the last reason, remove the maintenance ZNode entirely + accessor.removeProperty(keyBuilder.maintenance()); + } + } else { + // Case where triggering entity doesn't have an entry + // Note: CONTROLLER/AUTOMATION is strict no-op, USER can do administrative override + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { + // USER has special privilege to force exit maintenance mode as administrative override + logger.info("USER administrative override: forcefully exiting maintenance mode for cluster {}", clusterName); + accessor.removeProperty(keyBuilder.maintenance()); + } else { + // CONTROLLER/AUTOMATION: strict no-op if their entry not found + logger.info("Entity {} doesn't have a maintenance reason entry, exit request ignored", triggeringEntity); + } + } } else { // Enter maintenance mode - MaintenanceSignal maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); + if (maintenanceSignal == null) { + // Create a new maintenance signal if it doesn't exist + maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); + } + + // This is CRITICAL: Reconcile any legacy data BEFORE updating simpleFields + // This must happen before any simpleField updates to preserve legacy USER data + maintenanceSignal.reconcileLegacyData(); + + // Add the reason to the maintenance signal if (reason != null) { maintenanceSignal.setReason(reason); } + maintenanceSignal.setTimestamp(currentTime); maintenanceSignal.setTriggeringEntity(triggeringEntity); + switch (triggeringEntity) { case CONTROLLER: // autoEnable maintenanceSignal.setAutoTriggerReason(internalReason); break; case USER: + case AUTOMATION: case UNKNOWN: // manuallyEnable if (customFields != null && !customFields.isEmpty()) { @@ -1231,8 +1300,18 @@ private void 
processMaintenanceMode(String clusterName, final boolean enabled, } break; } - if (!accessor.createMaintenance(maintenanceSignal)) { - throw new HelixException("Failed to create maintenance signal!"); + + // Add this reason to the multi-actor maintenance reasons list + maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity); + + if (accessor.getProperty(keyBuilder.maintenance()) == null) { + if (!accessor.createMaintenance(maintenanceSignal)) { + throw new HelixException("Failed to create maintenance signal!"); + } + } else { + if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { + throw new HelixException("Failed to update maintenance signal!"); + } } } @@ -1246,7 +1325,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, } return new ControllerHistory(oldRecord) .updateMaintenanceHistory(enabled, reason, currentTime, internalReason, - customFields, triggeringEntity); + customFields, triggeringEntity, + isInMaintenanceMode(clusterName)); } catch (IOException e) { logger.error("Failed to update maintenance history! 
Exception: {}", e); return oldRecord; diff --git a/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java index 47b0958d9c..89289c3fa5 100644 --- a/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java +++ b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java @@ -56,8 +56,8 @@ private enum MaintenanceConfigKey { MAINTENANCE_HISTORY, OPERATION_TYPE, DATE, - REASON - + REASON, + IN_MAINTENANCE_AFTER_OPERATION } private enum ManagementModeConfigKey { @@ -180,10 +180,11 @@ public ZNRecord updateManagementModeHistory(String controller, ClusterManagement * @param internalReason * @param customFields * @param triggeringEntity + * @param inMaintenanceAfterOperation whether the cluster is still in maintenance mode after this operation */ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long currentTime, MaintenanceSignal.AutoTriggerReason internalReason, Map customFields, - MaintenanceSignal.TriggeringEntity triggeringEntity) throws IOException { + MaintenanceSignal.TriggeringEntity triggeringEntity, boolean inMaintenanceAfterOperation) throws IOException { DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:" + "mm:ss"); df.setTimeZone(TimeZone.getTimeZone("UTC")); String dateTime = df.format(new Date(currentTime)); @@ -198,6 +199,8 @@ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long cu String.valueOf(currentTime)); maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), triggeringEntity.name()); + maintenanceEntry.put(MaintenanceConfigKey.IN_MAINTENANCE_AFTER_OPERATION.name(), + String.valueOf(inMaintenanceAfterOperation)); if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) { // If auto-triggered maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.AUTO_TRIGGER_REASON.name(), diff --git 
a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 83e0e1c604..e286b2ec74 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -19,12 +19,24 @@ * under the License. */ +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Iterator; import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.type.TypeFactory; +import java.io.IOException; /** * A ZNode that signals that the cluster is in maintenance mode. */ public class MaintenanceSignal extends PauseSignal { + private static final Logger LOG = LoggerFactory.getLogger(MaintenanceSignal.class); /** * Pre-defined fields set by Helix Controller only. @@ -40,6 +52,7 @@ public enum MaintenanceSignalProperty { */ public enum TriggeringEntity { CONTROLLER, + AUTOMATION, // triggered by automation systems (like HelixACM) USER, // manually triggered by user UNKNOWN } @@ -57,6 +70,11 @@ public enum AutoTriggerReason { NOT_APPLICABLE // Not triggered automatically or automatically exiting maintenance mode } + /** + * Constant for the name of the reasons list field + */ + private static final String REASONS_LIST_FIELD = "reasons"; + public MaintenanceSignal(String id) { super(id); } @@ -112,4 +130,241 @@ public void setTimestamp(long timestamp) { public long getTimestamp() { return _record.getLongField(MaintenanceSignalProperty.TIMESTAMP.name(), -1); } + + /** + * Add a new maintenance reason. If the triggering entity already has a reason, it will be replaced. 
+   *
+   * @param reason The reason for maintenance (stored as-is; may be null)
+   * @param timestamp The timestamp when maintenance was triggered
+   * @param triggeringEntity The entity that triggered maintenance
+   */
+  public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity triggeringEntity) {
+    LOG.info("Adding maintenance reason for entity: {}, reason: {}, timestamp: {}",
+        triggeringEntity, reason, timestamp);
+
+    List<Map<String, String>> reasons = getMaintenanceReasons();
+    LOG.debug("Before addition: Reasons list contains {} entries", reasons.size());
+
+    // Filter out any existing reasons for this triggering entity
+    List<Map<String, String>> filteredReasons = filterReasons(reasons, null,
+        Arrays.asList(triggeringEntity));
+
+    // Always add the new reason at the end of the list, so the last element is the most
+    // recently written entry
+    Map<String, String> newEntry = new HashMap<>();
+    newEntry.put(PauseSignalProperty.REASON.name(), reason);
+    newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp));
+    newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), triggeringEntity.name());
+    filteredReasons.add(newEntry);
+
+    updateReasonsListField(filteredReasons);
+    LOG.debug("After addition: Reasons list contains {} entries", filteredReasons.size());
+  }
+
+  /**
+   * Helper method to update the ZNRecord with the current reasons list.
+   * Each reason is stored as a single JSON string in the list.
+   *
+   * @param reasons The list of reason maps to store
+   */
+  private void updateReasonsListField(List<Map<String, String>> reasons) {
+    List<String> reasonsList = new ArrayList<>();
+
+    for (Map<String, String> entry : reasons) {
+      String jsonString = convertMapToJsonString(entry);
+      // An empty string marks an entry that failed to serialize (already logged); skip it
+      if (!jsonString.isEmpty()) {
+        reasonsList.add(jsonString);
+      }
+    }
+
+    _record.setListField(REASONS_LIST_FIELD, reasonsList);
+  }
+
+  /**
+   * Shared JSON mapper for reason entries. ObjectMapper is thread-safe once configured and
+   * costly to construct, so one instance is reused instead of creating one per call.
+   */
+  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+  /**
+   * Convert a map to a JSON string.
+   *
+   * @param map The reason entry to serialize
+   * @return The JSON string, or an empty string if serialization failed
+   */
+  private String convertMapToJsonString(Map<String, String> map) {
+    try {
+      return OBJECT_MAPPER.writeValueAsString(map);
+    } catch (IOException e) {
+      LOG.warn("Failed to convert map to JSON string: {}", e.getMessage());
+      return "";
+    }
+  }
+
+  /**
+   * Get all maintenance reasons currently active.
+   *
+   * @return List of maintenance reasons as maps, in stored order; entries that cannot be
+   *         parsed or are empty are left out. Never null.
+   */
+  public List<Map<String, String>> getMaintenanceReasons() {
+    List<Map<String, String>> reasons = new ArrayList<>();
+    List<String> reasonsList = _record.getListField(REASONS_LIST_FIELD);
+
+    if (reasonsList != null && !reasonsList.isEmpty()) {
+      for (String entryStr : reasonsList) {
+        Map<String, String> entry = parseJsonStyleEntry(entryStr);
+        if (!entry.isEmpty()) {
+          reasons.add(entry);
+        }
+      }
+    }
+
+    return reasons;
+  }
+
+  /**
+   * Parse an entry string in JSON format into a map.
+   *
+   * @param entryStr The JSON string of one reason entry
+   * @return The parsed map; an empty map (never null) if the entry is null, is the JSON
+   *         literal {@code null}, or cannot be parsed
+   */
+  private Map<String, String> parseJsonStyleEntry(String entryStr) {
+    if (entryStr == null) {
+      return new HashMap<>();
+    }
+    try {
+      Map<String, String> parsed = OBJECT_MAPPER.readValue(entryStr,
+          TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, String.class));
+      // readValue yields null for the JSON literal "null"; callers rely on a non-null map
+      if (parsed != null) {
+        return parsed;
+      }
+    } catch (IOException e) {
+      LOG.warn("Failed to parse JSON entry: {}, error: {}", entryStr, e.getMessage());
+    }
+    return new HashMap<>();
+  }
+
+  /**
+   * Remove a maintenance reason by triggering entity.
+   *
+   * @param triggeringEntity The entity whose reason should be removed
+   * @return true if a reason was removed, false otherwise
+   */
+  public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) {
+    LOG.info("Removing maintenance reason for entity: {}", triggeringEntity);
+
+    List<Map<String, String>> reasons = getMaintenanceReasons();
+    List<Map<String, String>> filteredReasons = filterReasons(reasons, null,
+        Arrays.asList(triggeringEntity));
+
+    // With an exclude-only filter, entries of other (even unrecognized) actors are kept, so a
+    // size difference can only mean that this entity's own entry was dropped
+    if (filteredReasons.size() == reasons.size()) {
+      LOG.info("Entity {} doesn't have a maintenance reason entry, ignoring exit request", triggeringEntity);
+      return false;
+    }
+
+    // Update reasons list field with filtered reasons
+    updateReasonsListField(filteredReasons);
+
+    // Always set/reset the simpleFields if filteredReasons.size() != 0
+    if (!filteredReasons.isEmpty()) {
+      // Get the most recent reason (last element, since list is in chronological order)
+      Map<String, String> mostRecent = filteredReasons.get(filteredReasons.size() - 1);
+      String newReason = mostRecent.get(PauseSignalProperty.REASON.name());
+      long newTimestamp = parseTimestamp(mostRecent.get(MaintenanceSignalProperty.TIMESTAMP.name()));
+      TriggeringEntity newEntity =
+          parseTriggeringEntity(mostRecent.get(MaintenanceSignalProperty.TRIGGERED_BY.name()));
+      if (newEntity == null) {
+        // The remaining entry was written by an actor this version does not know
+        newEntity = TriggeringEntity.UNKNOWN;
+      }
+
+      LOG.info("Updated to most recent reason: {}, entity: {}, timestamp: {}",
+          newReason, newEntity, newTimestamp);
+
+      setReason(newReason);
+      setTimestamp(newTimestamp);
+      setTriggeringEntity(newEntity);
+    }
+
+    return true;
+  }
+
+  /**
+   * Parse a stored TRIGGERED_BY value.
+   *
+   * @param value The stored entity name, may be null
+   * @return The matching entity, or null if the value is null or not a known entity name
+   */
+  private static TriggeringEntity parseTriggeringEntity(String value) {
+    if (value == null) {
+      return null;
+    }
+    try {
+      return TriggeringEntity.valueOf(value);
+    } catch (IllegalArgumentException ex) {
+      return null;
+    }
+  }
+
+  /**
+   * Parse a stored TIMESTAMP value.
+   *
+   * @param value The stored timestamp string, may be null
+   * @return The timestamp, or -1 (the same default getTimestamp() uses) if it is missing or
+   *         not a number
+   */
+  private static long parseTimestamp(String value) {
+    if (value == null) {
+      return -1;
+    }
+    try {
+      return Long.parseLong(value);
+    } catch (NumberFormatException ex) {
+      LOG.warn("Invalid maintenance reason timestamp: {}", value);
+      return -1;
+    }
+  }
+
+  /**
+   * Filter maintenance reasons based on include and exclude entity lists.
+   *
+   * Entries whose TRIGGERED_BY is missing or not a known entity can match neither list: they
+   * are dropped when an include list is given and kept otherwise, so that rewriting the list
+   * for one actor never erases entries written by another (possibly newer) client.
+   *
+   * @param reasons The original list of maintenance reasons
+   * @param includeEntities List of entities to include (null means include all)
+   * @param excludeEntities List of entities to exclude (null means exclude none)
+   * @return Filtered list of maintenance reasons (maintains original order)
+   */
+  private List<Map<String, String>> filterReasons(List<Map<String, String>> reasons,
+      List<TriggeringEntity> includeEntities,
+      List<TriggeringEntity> excludeEntities) {
+    List<Map<String, String>> filtered = new ArrayList<>();
+
+    for (Map<String, String> reason : reasons) {
+      String triggeredByStr = reason.get(MaintenanceSignalProperty.TRIGGERED_BY.name());
+      TriggeringEntity entity = parseTriggeringEntity(triggeredByStr);
+
+      if (entity == null) {
+        if (triggeredByStr != null) {
+          LOG.warn("Unknown triggering entity: {}", triggeredByStr);
+        }
+        if (includeEntities == null) {
+          filtered.add(reason);
+        }
+        continue;
+      }
+
+      if ((includeEntities != null && !includeEntities.contains(entity)) ||
+          (excludeEntities != null && excludeEntities.contains(entity))) {
+        continue;
+      }
+
+      filtered.add(reason);
+    }
+
+    return filtered;
+  }
+
+  /**
+   * Check if there are any active maintenance reasons.
+   *
+   * @return true if there are any reasons for maintenance, false otherwise
+   */
+  public boolean hasMaintenanceReasons() {
+    return !getMaintenanceReasons().isEmpty();
+  }
+
+  /**
+   * Checks if there is a maintenance reason from a specific triggering entity.
+   *
+   * @param triggeringEntity The entity to check
+   * @return true if there is a maintenance reason from this entity
+   */
+  public boolean hasMaintenanceReason(TriggeringEntity triggeringEntity) {
+    List<Map<String, String>> reasons = getMaintenanceReasons();
+    List<Map<String, String>> filteredReasons = filterReasons(reasons,
+        Arrays.asList(triggeringEntity), null);
+    return !filteredReasons.isEmpty();
+  }
+
+  /**
+   * Reconcile legacy data from simpleFields into listFields.reasons if it's missing.
+   * This preserves maintenance data written by old USER clients that only set simpleFields.
+   *
+   * NOTE: Only reconciles USER data, as:
+   * - CONTROLLER is part of core Helix system and should use proper APIs
+   * - AUTOMATION is new and has no legacy clients
+   * - Only USER entities represent external legacy clients that may wipe data
+   */
+  public void reconcileLegacyData() {
+    // Check if simpleFields exist but corresponding listFields entry is missing
+    String simpleReason = getReason();
+    TriggeringEntity simpleEntity = getTriggeringEntity();
+    long simpleTimestamp = getTimestamp();
+
+    // Early return if no simple reason exists, not a USER entity, or USER already has a reason
+    // NOTE(review): a legacy USER signal written without a reason is not reconciled because of
+    // the null check below - confirm that dropping it is intended
+    List<Map<String, String>> reasons = getMaintenanceReasons();
+    if (simpleReason == null || simpleEntity != TriggeringEntity.USER ||
+        !filterReasons(reasons, Arrays.asList(TriggeringEntity.USER), null).isEmpty()) {
+      return;
+    }
+
+    // Legacy USER data exists but not in listFields - preserve it
+    Map<String, String> legacyEntry = new HashMap<>();
+    legacyEntry.put(PauseSignalProperty.REASON.name(), simpleReason);
+    legacyEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(simpleTimestamp));
+    legacyEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleEntity.name());
+
+    reasons.add(legacyEntry);
+    updateReasonsListField(reasons);
+
+    LOG.info("Reconciled legacy USER maintenance data: reason={}, timestamp={}",
+        simpleReason, simpleTimestamp);
+  }
 }
diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java
index 6654098f8b..c69c7913c4 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java
@@ -21,6 +21,7 @@

 import java.io.IOException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import javax.management.MalformedObjectNameException;
 import
javax.management.ObjectName;
@@ -40,11 +41,13 @@
 import org.apache.helix.model.ExternalView;
 import org.apache.helix.model.IdealState;
 import org.apache.helix.model.MaintenanceSignal;
+import org.apache.helix.model.PauseSignal;
 import org.apache.helix.monitoring.mbeans.MonitorDomainNames;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.Test;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;

 import static org.apache.helix.monitoring.mbeans.ClusterStatusMonitor.CLUSTER_DN_KEY;

@@ -439,4 +442,923 @@ private static Map<String, String> convertStringToMap(String value) throws IOExc
     return new ObjectMapper().readValue(value,
         TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, String.class));
   }
+
+  /**
+   * Helper method to get maintenance reason for a specific triggering entity.
+   * @param signal The maintenance signal
+   * @param triggeringEntity The entity to get reason for
+   * @return The reason string, or null if not found
+   */
+  private static String getMaintenanceReason(MaintenanceSignal signal, MaintenanceSignal.TriggeringEntity triggeringEntity) {
+    List<Map<String, String>> reasons = signal.getMaintenanceReasons();
+    for (Map<String, String> reason : reasons) {
+      String triggeredByStr = reason.get(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name());
+      // Compare by name so an entry with an unrecognized TRIGGERED_BY value is skipped
+      // instead of failing the lookup with an IllegalArgumentException
+      if (triggeringEntity != null && triggeringEntity.name().equals(triggeredByStr)) {
+        return reason.get(PauseSignal.PauseSignalProperty.REASON.name());
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Utility method to verify the most recent maintenance history entry.
+   * @param expectedOperationType Expected operation type (ENTER/EXIT)
+   * @param expectedTriggeredBy Expected triggering entity (USER/AUTOMATION/CONTROLLER)
+   * @param expectedInMaintenanceAfterOperation Expected maintenance state after operation (true/false)
+   * @param expectedReason Expected reason (optional, can be null to skip the check)
+   */
+  private void verifyMaintenanceHistory(String expectedOperationType, String expectedTriggeredBy,
+      String expectedInMaintenanceAfterOperation, String expectedReason) throws Exception {
+    ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory());
+    Map<String, String> lastEntry = convertStringToMap(
+        history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1));
+    Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), expectedOperationType);
+    Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), expectedTriggeredBy);
+    Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), expectedInMaintenanceAfterOperation);
+    if (expectedReason != null) {
+      Assert.assertEquals(lastEntry.get("REASON"), expectedReason);
+    }
+  }
+
+  /**
+   * Test basic multi-actor stacking behavior.
+   * Verifies core functionality: actor-based stacking, actor override, simpleFields most recent
+   */
+  @Test
+  public void testAutomationMaintenanceMode() throws Exception {
+    boolean result;
+    // NOTE(review): the config is read and written back unchanged - looks like a no-op; confirm intent
+    ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME);
+    _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig);
+    // Reset: both actors request exit, then wait until the maintenance ZNode is gone
+    _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null);
+    _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null);
+    result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L);
+    Assert.assertTrue(result, "Should be out of maintenance mode.");
+
+    // Step 1: USER enters MM (t1) with reason_A
+    _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_A", null);
+
+    MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance());
+    Assert.assertNotNull(signal);
+    Assert.assertEquals(signal.getMaintenanceReasons().size(), 1);
+    Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER);
+    Assert.assertEquals(signal.getReason(), "reason_A");
+    Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER));
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A");
+
+    // Verify history entry for USER entering maintenance
+    verifyMaintenanceHistory("ENTER", "USER", "true", "reason_A");
+
+    Thread.sleep(10); // Ensure different timestamps
+
+    // Step 2: AUTOMATION enters MM (t2) with reason_B - should stack with USER
+    _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", null);
+
+    signal = _dataAccessor.getProperty(_keyBuilder.maintenance());
+    Assert.assertEquals(signal.getMaintenanceReasons().size(), 2);
+    Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent
+    Assert.assertEquals(signal.getReason(), "reason_B"); // Most recent
+    Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER));
+    Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION));
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A");
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B");
+
+    // Verify history entry for AUTOMATION entering maintenance
+    verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_B");
+
+    Thread.sleep(10);
+
+    // Step 3: USER enters MM again (t3) with reason_C - should override previous USER entry
+    _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true,
+        "reason_C", null);
+
+    signal = _dataAccessor.getProperty(_keyBuilder.maintenance());
+    Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Still only 2 (USER overrode itself)
+    Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); // Most recent
+    Assert.assertEquals(signal.getReason(), "reason_C"); // Most recent
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER),
+        "reason_C"); // Updated
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION),
+        "reason_B"); // Unchanged
+
+    // Verify history entry for USER overriding previous entry
+    verifyMaintenanceHistory("ENTER", "USER", "true", "reason_C");
+
+    Thread.sleep(10);
+
+    // Step 4: AUTOMATION enters MM again (t4) with reason_D - should override previous AUTOMATION entry
+    _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true,
+        "reason_D", null);
+
+    signal = _dataAccessor.getProperty(_keyBuilder.maintenance());
+    Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Still only 2 (AUTOMATION overrode itself)
+    Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent
+    Assert.assertEquals(signal.getReason(), "reason_D"); // Most recent
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER),
+        "reason_C"); // Unchanged
+    Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION),
+        "reason_D"); // Updated
+
+    // Verify history entry for AUTOMATION overriding previous entry
+    verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_D");
+
+    // Clean exit sequence: actors exit in order
+    _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false,
+        null, null);
+
+    signal = _dataAccessor.getProperty(_keyBuilder.maintenance());
+    Assert.assertNotNull(signal);
+    Assert.assertEquals(signal.getMaintenanceReasons().size(), 1);
+    Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER));
+    Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION));
+    Assert.assertEquals(signal.getReason(), "reason_D"); // Updated to remaining reason
+
+    // Verify history entry for USER exiting maintenance (but still in maintenance due to AUTOMATION)
+    verifyMaintenanceHistory("EXIT", "USER", "true", null);
+
+    _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null,
+        null);
+    result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L);
+    Assert.assertTrue(result, "Should be completely out of maintenance mode.");
+
+    // Verify history entry for AUTOMATION exiting maintenance (completely out)
+    verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null);
+  }
+
+  /**
+   * USER administrative override after old client data loss
+   * 1. Multi-actor setup with CONTROLLER, USER, AUTOMATION
+   * 2.
Old client wipes listField data (keeps only simpleFields) + * 3. AUTOMATION tries to exit MM (no-op since its data was wiped) + * 4. USER exits MM (administrative override - forces complete exit) + */ + @Test + public void testLegacyClientCompatibility() throws Exception { + boolean result; + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + + // Wait for config to be applied + result = TestHelper.verify(() -> { + ClusterConfig currentConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + return currentConfig.getNumOfflineInstancesForAutoExit() == -1; + }, 2000L); + Assert.assertTrue(result, "Config should be applied."); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_A", null); + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_C", null); + + // Verify multi-actor setup + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_C"); + + // Simulate old client 
wiping listField data (only keeps simpleFields) + Thread.sleep(10); + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_D"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + // Old client doesn't set listField - simulates wiping all listField data + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify old client wiped listField data but simpleFields remain + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have wiped listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + Assert.assertFalse(signal.hasMaintenanceReasons(), + "Should have no listField reasons after old client wipe"); + + // AUTOMATION tries to exit MM -> should be no-op because its entry was wiped + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + // Verify maintenance signal remains the same (no-op) + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); + Assert.assertEquals(signal.getReason(), "reason_D"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + + // Verify history entry for AUTOMATION no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + + // USER tries to exit MM -> should trigger 
administrative override and delete maintenance ZNode + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + + // Verify we're completely out of maintenance mode due to USER administrative override + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); + } + + /** + * Helper method to set up a common scenario for maintenance mode tests: + * 0. CONTROLLER enters maintenance mode + * 1. USER enters maintenance mode + * 2. AUTOMATION enters maintenance mode + * 3. User B enters maintenance mode (overrides User A) + * 4. Old client enters maintenance mode (simple fields only - wipes listField data) + */ + private void setupMultiActorMaintenanceScenario() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 0: CONTROLLER puts the cluster into MM (t0) + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_Controller", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify maintenance signal with CONTROLLER reason only + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + + // Step 1: USER (UserA) puts the cluster into MM (t1) +
_gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_A", null); + + // Verify maintenance signal now holds two reasons and USER is the latest triggering entity + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + + // Step 2: AUTOMATION puts the cluster into MM (t2) + Thread.sleep(10); // Ensure different timestamps + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_B", null); + + // Verify maintenance signal now holds three reasons, including USER and AUTOMATION + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 3: USER (UserB) puts the cluster into MM (t3) - overrides UserA's entry + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_C", null); + + // Verify maintenance signal still has same number of reasons but UserB's reason replaced UserA's + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_C"); + + // Step 4: USER (Old Client) puts the cluster into MM (t4) + // Simulate old client by directly creating a MaintenanceSignal with simple fields only + // Per design doc: "Legacy clients use the dataAccessor.set() API to create Maintenance signals, + // which results in the entire ZNRecord being overwritten, including purging all existing ListField entries" + Thread.sleep(10); + ZNRecord record = new ZNRecord("maintenance"); +
record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_D"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + + // Use setProperty (not updateProperty) to simulate old client completely overwriting the ZNode + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify maintenance signal has updated simple fields but listField data was wiped by old client + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + + // Verify reasons list was wiped by old client (data loss accepted by design) + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have wiped all listField data"); + } + + /** + * Test Case A: USER administrative override after old client data loss + * 1. After old client wipes data, verify no listField reasons exist + * 2. USER tries to exit MM - should trigger administrative override and delete maintenance ZNode + * 3. 
Verify maintenance mode is completely exited + */ + @Test + public void testUserAdministrativeOverride() throws Exception { + boolean result; + // Set up the initial state with all actors having entered maintenance mode + // Note: setupMultiActorMaintenanceScenario() ends with old client wiping listField data + setupMultiActorMaintenanceScenario(); + + // Verify the old client has wiped the listField data + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have wiped listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + + // Step 6A: USER (New client) tries to exit MM + // Since USER doesn't have an entry in listFields.reasons, this should trigger administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + // Verify we're completely out of maintenance mode due to USER administrative override + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); + } + + /** + * Old client enters, new clients operate independently (no reconciliation) + * 1. Old client enters maintenance mode without updating the reasons list (data wiped) + * 2. New client enters maintenance mode and operates independently + * 3. 
All actors exit in sequence + */ + @Test + public void testOldClientDataLoss() throws Exception { + boolean result; + // Set up the initial state with all actors having entered maintenance mode + setupMultiActorMaintenanceScenario(); + + // At this point, the old client in setupMultiActorMaintenanceScenario has wiped the listField data + // Verify that the old client action resulted in data loss + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + + // simpleFields should show USER data (from old client) + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + + // But listFields.reasons should be empty (wiped by old client) + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have wiped listField data"); + Assert.assertFalse(signal.hasMaintenanceReasons(), + "Should not have maintenance reasons after old client wipe"); + + // Step 6B: AUTOMATION enters MM again after the old client wipe + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_F", null); + + // Verify signal now has two reasons: a USER entry and the new AUTOMATION entry; the CONTROLLER entry is not restored + // NOTE(review): the USER entry is presumably rebuilt from the old client's simpleFields - confirm against processMaintenanceMode + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER), + "Controller data was not lost"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "User data was lost"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify the new AUTOMATION reason + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_F"); + + // Verify history entry for AUTOMATION entering maintenance after data loss +
verifyMaintenanceHistory("ENTER", "AUTOMATION", + "true", "reason_F"); + + // Exit the only remaining actors + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + + // Verify we're out of maintenance mode + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode after AUTOMATION exit."); + } + + /** + * Test AUTOMATION re-entry after old client wipes data + * Old client wipes data, AUTOMATION re-enters independently, + * then sequence of exits with no-ops and administrative override + */ + @Test + public void testAutomationReentryAfterDataLoss() throws Exception { + boolean result; + // Setup the multi-actor scenario which ends with old client wiping data + setupMultiActorMaintenanceScenario(); + + // AUTOMATION enters MM again (works independently, no reconciliation) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_E", null); + + // Verify signal now has both USER and AUTOMATION entries. 
+ MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_E"); + + // Verify history entry for AUTOMATION entering maintenance after data loss + verifyMaintenanceHistory("ENTER", "AUTOMATION", + "true", "reason_E"); + + + // Automation exits MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + // USER exits MM + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + + // Verify we're completely out of maintenance mode due to USER administrative override + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be out of maintenance mode."); + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); + } + + /** + * Test old client force exit by deleting entire maintenance znode + */ + @Test + public void testOldClientDeletesEntireZnode() throws Exception { + // Setup: Multiple actors enter maintenance mode + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + // Verify we have both actors + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + + // Case D: USER (old client) exits MM by deleting entire znode + 
_dataAccessor.removeProperty(_keyBuilder.maintenance()); + + // Verify we're completely out of maintenance mode (all data lost) + boolean result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode after old client deletes znode."); + } + + /** + * Test that simpleFields always reflect the most recently added reason + */ + @Test + public void testSimpleFieldsReflectMostRecent() throws Exception { + // Entry 1: USER at t1 + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_first", null); + + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getReason(), "user_first"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + + // Verify history entry for USER entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_first"); + + Thread.sleep(10); + + // Entry 2: AUTOMATION at t2 (should become most recent) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_second", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getReason(), "automation_second"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_second"); + + Thread.sleep(10); + + // Entry 3: USER at t3 (should become most recent) + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_third", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getReason(), "user_third"); + Assert.assertEquals(signal.getTriggeringEntity(), 
MaintenanceSignal.TriggeringEntity.USER); + + // Verify history entry for USER overriding previous entry + verifyMaintenanceHistory("ENTER", "USER", "true", "user_third"); + + Thread.sleep(10); + + // Entry 4: AUTOMATION again at t4 (should become most recent, overriding its previous entry) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_fourth", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getReason(), "automation_fourth"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + // Should still have 2 actors (AUTOMATION entry was overwritten) + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + + // Verify history entry for AUTOMATION overriding previous entry + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_fourth"); + + // Clean up: USER and AUTOMATION both exit + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // NOTE(review): the boolean returned by the TestHelper.verify call above is not asserted + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null); + } + + /** + * Test edge cases around empty maintenance mode + */ + @Test + public void testEmptyStateEdgeCases() throws Exception { + boolean result; + // Test 1: Try to exit when no maintenance mode exists + + // AUTOMATION tries to exit when no MM exists -> should be no-op + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNull(_dataAccessor.getProperty(_keyBuilder.maintenance())); + + // USER tries to exit when no MM exists -> should be no-op (nothing to override) +
_gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNull(_dataAccessor.getProperty(_keyBuilder.maintenance())); + + // Test 2: Create maintenance mode with only simpleFields (old client style) + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Old client reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Should have no listField reasons"); + Assert.assertFalse(signal.hasMaintenanceReasons(), "Should report no maintenance reasons"); + + // AUTOMATION tries to exit -> should be no-op + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNotNull(_dataAccessor.getProperty(_keyBuilder.maintenance()), + "Should still exist after no-op"); + + // USER tries to exit -> should be administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); + } + + /** + * Test mixed entry/exit scenarios to stress test maintenance mode stacking + * Verifies complex sequences of actors entering and exiting in different orders + */ + 
@Test + public void testMixedEntryExitScenarios() throws Exception { + boolean result; + // Phase 1: Multiple actors enter + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason_1", null); + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason_1", null); + + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertEquals(signal.getReason(), "automation_reason_1"); // Most recent + + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_reason_1"); + + // Phase 2: One actor exits, then re-enters with different reason + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); // USER exits + Thread.sleep(10); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify history entry for USER exiting maintenance (but still in maintenance due to AUTOMATION) + verifyMaintenanceHistory("EXIT", "USER", "true", null); + + // USER re-enters with new reason + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason_2", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertEquals(signal.getReason(), "user_reason_2"); // Most recent + + // Verify history entry for USER re-entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_reason_2"); + + // Phase 3: Clean exit + 
_gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // NOTE(review): the boolean returned by the TestHelper.verify call above is not asserted + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null); + } + + /** + * Test basic multi-actor stacking behavior including CONTROLLER entity + * Triggers CONTROLLER maintenance mode directly through the autoEnableMaintenanceMode API + */ + @Test + public void testMultiActorStackingWithController() throws Exception { + boolean result; // NOTE(review): never assigned or read in this method + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode using API + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Test controller maintenance", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify CONTROLLER entered maintenance mode automatically + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + + // Verify history entry for CONTROLLER entering maintenance + verifyMaintenanceHistory("ENTER", "CONTROLLER", "true", "Test controller maintenance"); + + // Step 2: USER enters MM - should stack with CONTROLLER + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + signal =
_dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "user_reason"); // Most recent + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Verify history entry for USER entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_reason"); + + // Step 3: AUTOMATION enters MM - should stack with both + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + Assert.assertEquals(signal.getReason(), "automation_reason"); // Most recent + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_reason"); + + // Step 4: USER exits - should remain in maintenance with CONTROLLER and AUTOMATION + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + 
Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify history entry for USER exiting maintenance (but still in maintenance due to others) + verifyMaintenanceHistory("EXIT", "USER", "true", null); + + // Step 5: AUTOMATION exits - should remain in maintenance with only CONTROLLER + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify history entry for AUTOMATION exiting maintenance (but still in maintenance due to CONTROLLER) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + + // Step 6: Exit CONTROLLER maintenance mode + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Wait for maintenance mode to be completely off (NOTE(review): the boolean returned by TestHelper.verify is not asserted here) + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify history entry for CONTROLLER exiting maintenance (completely out) + verifyMaintenanceHistory("EXIT", "CONTROLLER", "false", null); + } + + /** + * Test old client wipes data including CONTROLLER entry + * Verifies CONTROLLER no-op behavior after data loss + */ + @Test + public void testControllerNoOpAfterOldClientWipe() throws Exception { + boolean result; + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME);
+ clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode using API + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Test controller maintenance", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify CONTROLLER entered maintenance mode + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + + // Add USER and AUTOMATION to create multi-actor scenario + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Simulate old client wiping all data + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Old client reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + // Old client doesn't set listField - simulates 
wiping all listField data + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify old client wiped listField data + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have wiped listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + + // Try CONTROLLER exit - should be no-op since its entry was wiped + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify maintenance signal remains the same (no-op) + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal, "Should still be in maintenance after CONTROLLER no-op"); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); + Assert.assertEquals(signal.getReason(), "Old client reason"); + + // Verify history entry for CONTROLLER no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "CONTROLLER", "true", null); + + // Try AUTOMATION exit - should also be no-op since its entry was wiped + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + // Verify maintenance signal remains the same (no-op) + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); + Assert.assertEquals(signal.getReason(), "Old client reason"); + + // Verify history entry for AUTOMATION no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + + // USER tries to exit -> should be administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, 
null); + + // Verify completely out of maintenance mode + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); + } + + /** + * Test CONTROLLER override behavior - same entity overwrites previous entry + */ + @Test + public void testControllerOverrideBehavior() throws Exception { + boolean result; + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode with specific auto-trigger reason + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Initial controller reason", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify CONTROLLER entered with auto-trigger reason + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getAutoTriggerReason(), + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); + + // Manually trigger CONTROLLER entry again with different reason (to test override) + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "manual_controller_reason", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getTriggeringEntity(), 
MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getAutoTriggerReason(), + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + Assert.assertEquals(signal.getMaintenanceReasons().size(), + 1); // Should still be only 1 (CONTROLLER overrode itself) + + // Add another actor to verify stacking still works + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Cleanup + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, + null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test reconciliation of legacy USER data when new client adds reason + * Verifies the critical design requirement to preserve old client data + */ + @Test + public void testReconciliationOfLegacyUserData() throws Exception { + boolean result; + // Step 1: Old client sets simpleFields only (no listFields) + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "legacy_user_reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + _dataAccessor.setProperty(_keyBuilder.maintenance(), new 
MaintenanceSignal(record)); + + // Verify old client state (no listFields reasons) + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, + "Old client should have no listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "legacy_user_reason"); + + // Step 2: New client adds AUTOMATION reason - should trigger reconciliation + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + // Step 3: Verify both reasons are preserved after reconciliation + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2, + "Should have both reconciled USER and new AUTOMATION reasons"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "USER reason should be preserved"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "AUTOMATION reason should be added"); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), + "legacy_user_reason"); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), + "automation_reason"); + + // Cleanup + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, + false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, + false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + } } diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index 5a1a8a5bcb..4507c71060 100644 --- 
a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -364,6 +364,12 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S } + @Override + public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason, + Map customFields) { + + } + @Override public boolean isInMaintenanceMode(String clusterName) { return false; diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java index 1e752f3feb..5d1b7a8545 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java @@ -317,6 +317,8 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, // Try to parse the content string. If parseable, use it as a KV mapping. 
Otherwise, treat it // as a REASON String Map customFieldsMap = null; + // Default to USER triggering entity + boolean isAutomationTriggered = false; try { // Try to parse content customFieldsMap = @@ -328,13 +330,29 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, if ("reason".equalsIgnoreCase(entry.getKey())) { content = entry.getValue(); } + if ("isAutomation".equalsIgnoreCase(entry.getKey())) { + isAutomationTriggered = Boolean.parseBoolean(entry.getValue()); + } } } catch (Exception e) { // NOP } - helixAdmin - .manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, - content, customFieldsMap); + + if (customFieldsMap != null) { + customFieldsMap.entrySet().removeIf(entry -> + "isAutomation".equalsIgnoreCase(entry.getKey()) || + "reason".equalsIgnoreCase(entry.getKey())); + } + + if (isAutomationTriggered) { + helixAdmin + .automationEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, + content, customFieldsMap); + } else { + helixAdmin + .manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, + content, customFieldsMap); + } break; case enableWagedRebalanceForAllResources: // Enable WAGED rebalance for all resources in the cluster