diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java index c8e0d6ef32..42390d5ff2 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java @@ -87,6 +87,8 @@ public class ClusterStatusMonitor implements ClusterStatusMonitorMBean { private AtomicLong _rebalanceFailureCount = new AtomicLong(0L); private AtomicLong _continuousResourceRebalanceFailureCount = new AtomicLong(0L); private AtomicLong _continuousTaskRebalanceFailureCount = new AtomicLong(0L); + private AtomicLong _leaderFailureCount = new AtomicLong(0L); + private AtomicLong _resetLeaderFailureCount = new AtomicLong(0L); private final ConcurrentHashMap _resourceMonitorMap = new ConcurrentHashMap<>(); @@ -660,6 +662,8 @@ public void reset() { _rebalanceFailureCount.set(0L); _continuousResourceRebalanceFailureCount.set(0L); _continuousTaskRebalanceFailureCount.set(0L); + _leaderFailureCount.set(0L); + _resetLeaderFailureCount.set(0L); } catch (Exception e) { LOG.error("Fail to reset ClusterStatusMonitor, cluster: " + _clusterName, e); } @@ -1137,4 +1141,28 @@ public long getNumOfResourcesRebalanceThrottledGauge() { } return total; } + + @Override + public long getLeaderFailureCounter() { + return _leaderFailureCount.get(); + } + + @Override + public long getResetLeaderFailureCounter() { + return _resetLeaderFailureCount.get(); + } + + /** + * Report a leadership failure for distributed controller + */ + public void reportLeaderFailure() { + _leaderFailureCount.incrementAndGet(); + } + + /** + * Report when controller is still leader during reset + */ + public void reportResetLeaderFailure() { + _resetLeaderFailureCount.incrementAndGet(); + } } diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitorMBean.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitorMBean.java index 433818e457..2feb6b41e4 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitorMBean.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitorMBean.java @@ -147,4 +147,14 @@ public interface ClusterStatusMonitorMBean extends SensorNameProvider { * state partition is larger than configured threshold (default is 1). */ long getNumOfResourcesRebalanceThrottledGauge(); + + /** + * @return The number of leadership failures for distributed controllers + */ + long getLeaderFailureCounter(); + + /** + * @return The number of times controller was still leader during reset + */ + long getResetLeaderFailureCounter(); } diff --git a/helix-core/src/main/java/org/apache/helix/participant/DistClusterControllerStateModel.java b/helix-core/src/main/java/org/apache/helix/participant/DistClusterControllerStateModel.java index 42f0e4b416..b3e85dc311 100644 --- a/helix-core/src/main/java/org/apache/helix/participant/DistClusterControllerStateModel.java +++ b/helix-core/src/main/java/org/apache/helix/participant/DistClusterControllerStateModel.java @@ -29,6 +29,7 @@ import org.apache.helix.NotificationContext; import org.apache.helix.controller.pipeline.Pipeline; import org.apache.helix.model.Message; +import org.apache.helix.monitoring.mbeans.ClusterStatusMonitor; import org.apache.helix.participant.statemachine.StateModelInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,6 +40,7 @@ public class DistClusterControllerStateModel extends AbstractHelixLeaderStandbyS private static Logger logger = LoggerFactory.getLogger(DistClusterControllerStateModel.class); protected Optional _controllerOpt = Optional.empty(); private final Set _enabledPipelineTypes; + private ClusterStatusMonitor _clusterStatusMonitor; public DistClusterControllerStateModel(String zkAddr) { this(zkAddr, Sets.newHashSet(Pipeline.Type.DEFAULT, Pipeline.Type.TASK)); @@ -70,7 +72,15 @@ public void onBecomeLeaderFromStandby(Message message, NotificationContext conte newController.connect(); newController.startTimerTasks(); _controllerOpt = Optional.of(newController); - logStateTransition("STANDBY", "LEADER", clusterName, controllerName); + if (!newController.isLeader()) { + logger.error("Controller Leader session is not the same as the current session for {}. " + + "This should not happen. Controller: {}", clusterName ,controllerName); + + // Publish metrics through ClusterStatusMonitor when not a leader + getClusterStatusMonitor(clusterName).reportLeaderFailure(); + } else { + logStateTransition("STANDBY", "LEADER", clusterName, controllerName); + } } else { logger.error("controller already exists:" + _controllerOpt.get().getInstanceName() + " for " + clusterName); @@ -114,11 +124,33 @@ public String getStateModeInstanceDescription(String partitionName, String insta public void reset() { synchronized (_controllerOpt) { if (_controllerOpt.isPresent()) { + String clusterName = _controllerOpt.get().getClusterName(); logger.info("Disconnecting controller: " + _controllerOpt.get().getInstanceName() + " for " + _controllerOpt.get().getClusterName()); _controllerOpt.get().disconnect(); + if(_controllerOpt.get().isLeader()) { + logger.error("Controller is still leader after disconnecting: {} for {}", + _controllerOpt.get().getInstanceName(), _controllerOpt.get().getClusterName()); + + // Publish metrics when controller is still leader during reset + getClusterStatusMonitor(clusterName).reportResetLeaderFailure(); + } _controllerOpt = Optional.empty(); } + + // Clean up cluster status monitor + if (_clusterStatusMonitor != null) { + _clusterStatusMonitor.reset(); + _clusterStatusMonitor = null; + } + } + } + + private ClusterStatusMonitor getClusterStatusMonitor(String clusterName) { + if (_clusterStatusMonitor == null) { + _clusterStatusMonitor = new ClusterStatusMonitor(clusterName); + _clusterStatusMonitor.active(); } + return _clusterStatusMonitor; } -} +} \ No newline at end of file diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestClusterStatusMonitor.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestClusterStatusMonitor.java index b0837869a9..77ce61b9c1 100644 --- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestClusterStatusMonitor.java +++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestClusterStatusMonitor.java @@ -610,4 +610,75 @@ private void verifyMessageMetrics(ClusterStatusMonitor monitor, Map