Skip to content

Commit 6033d49

Browse files
authored
IGNITE-26152 Fixed node stop hang during snapshot creation. (#12257)
1 parent d1a2e13 commit 6033d49

File tree

10 files changed

+64
-35
lines changed

10 files changed

+64
-35
lines changed

modules/core/src/main/java/org/apache/ignite/internal/processors/authentication/IgniteAuthenticationProcessor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ private void onLocalJoin() {
828828
}
829829

830830
/** {@inheritDoc} */
831-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
831+
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut, @Nullable Throwable err) {
832832
activateFut.onDone();
833833
}
834834

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2456,7 +2456,7 @@ private String exchangeTimingsLogMessage(String header, List<String> timings) {
24562456
}
24572457

24582458
for (PartitionsExchangeAware comp : cctx.exchange().exchangeAwareComponents())
2459-
comp.onDoneBeforeTopologyUnlock(this);
2459+
comp.onDoneBeforeTopologyUnlock(this, err);
24602460

24612461
// Create and destroy caches and cache proxies.
24622462
cctx.cache().onExchangeDone(this, err);

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/PartitionsExchangeAware.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.ignite.internal.processors.cache.distributed.dht.preloader;
1919

2020
import org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager;
21+
import org.jetbrains.annotations.Nullable;
2122

2223
/**
2324
* Interface which allows to subscribe a component for partition map exchange events
@@ -53,8 +54,9 @@ public default void onInitAfterTopologyLock(GridDhtPartitionsExchangeFuture fut)
5354
* Guarantees that no updates were performed on local node since exchange process started.
5455
*
5556
* @param fut Partition map exchange future.
57+
* @param err Optional error, e.g. node stopping.
5658
*/
57-
public default void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
59+
public default void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut, @Nullable Throwable err) {
5860
// No-op.
5961
}
6062

modules/core/src/main/java/org/apache/ignite/internal/processors/cache/persistence/snapshot/IgniteSnapshotManager.java

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -650,42 +650,37 @@ public IgniteSnapshotManager(GridKernalContext ctx) {
650650
}
651651

652652
/** {@inheritDoc} */
653-
@Override protected void stop0(boolean cancel) {
653+
@Override protected void onKernalStop0(boolean cancel) {
654654
busyLock.block();
655655

656-
try {
657-
snpRmtMgr.stop();
656+
snpRmtMgr.stop();
658657

659-
IgniteCheckedException stopErr = new NodeStoppingException("Node is stopping.");
658+
IgniteCheckedException stopErr = new NodeStoppingException("Node is stopping.");
660659

661-
restoreCacheGrpProc.interrupt(stopErr);
662-
checkSnpProc.interrupt(stopErr);
660+
restoreCacheGrpProc.interrupt(stopErr);
661+
checkSnpProc.interrupt(stopErr);
663662

664-
// Try stop all snapshot processing if not yet.
665-
for (AbstractSnapshotFutureTask<?> sctx : locSnpTasks.values())
666-
sctx.acceptException(new NodeStoppingException(SNP_NODE_STOPPING_ERR_MSG));
663+
// Try stop all snapshot processing if not yet.
664+
for (AbstractSnapshotFutureTask<?> sctx : locSnpTasks.values())
665+
sctx.acceptException(new NodeStoppingException(SNP_NODE_STOPPING_ERR_MSG));
667666

668-
locSnpTasks.clear();
667+
locSnpTasks.clear();
669668

670-
synchronized (snpOpMux) {
671-
if (clusterSnpFut != null) {
672-
clusterSnpFut.onDone(new NodeStoppingException(SNP_NODE_STOPPING_ERR_MSG));
669+
synchronized (snpOpMux) {
670+
if (clusterSnpFut != null) {
671+
clusterSnpFut.onDone(new NodeStoppingException(SNP_NODE_STOPPING_ERR_MSG));
673672

674-
clusterSnpFut = null;
675-
}
673+
clusterSnpFut = null;
676674
}
675+
}
677676

678-
cctx.kernalContext().io().removeMessageListener(DFLT_INITIAL_SNAPSHOT_TOPIC);
679-
cctx.kernalContext().io().removeTransmissionHandler(DFLT_INITIAL_SNAPSHOT_TOPIC);
677+
cctx.kernalContext().io().removeMessageListener(DFLT_INITIAL_SNAPSHOT_TOPIC);
678+
cctx.kernalContext().io().removeTransmissionHandler(DFLT_INITIAL_SNAPSHOT_TOPIC);
680679

681-
if (discoLsnr != null)
682-
cctx.kernalContext().event().removeDiscoveryEventListener(discoLsnr);
680+
if (discoLsnr != null)
681+
cctx.kernalContext().event().removeDiscoveryEventListener(discoLsnr);
683682

684-
cctx.exchange().unregisterExchangeAwareComponent(this);
685-
}
686-
finally {
687-
busyLock.unblock();
688-
}
683+
cctx.exchange().unregisterExchangeAwareComponent(this);
689684
}
690685

691686
/** {@inheritDoc} */
@@ -2221,7 +2216,7 @@ public static boolean isSnapshotOperation(DiscoveryEvent evt) {
22212216
}
22222217

22232218
/** {@inheritDoc} */
2224-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
2219+
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut, @Nullable Throwable err) {
22252220
if (clusterSnpReq == null || cctx.kernalContext().clientNode() || !isSnapshotOperation(fut.firstEvent()))
22262221
return;
22272222

@@ -2234,6 +2229,11 @@ public static boolean isSnapshotOperation(DiscoveryEvent evt) {
22342229

22352230
if (task == null)
22362231
return;
2232+
else if (err != null) {
2233+
task.onDone(err);
2234+
2235+
return;
2236+
}
22372237

22382238
if (task.start()) {
22392239
cctx.database().forceNewCheckpoint(String.format("Start snapshot operation: %s", snpReq.snapshotName()), lsnr -> {});

modules/core/src/main/java/org/apache/ignite/internal/processors/platform/PlatformContextImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@ private void writeCommonEventData(BinaryWriterEx writer, EventAdapter evt) {
679679
}
680680

681681
/** {@inheritDoc} */
682-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
682+
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut, @Nullable Throwable err) {
683683
AffinityTopologyVersion ver = fut.topologyVersion();
684684

685685
if (ver != null) {

modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/PartitionsExchangeAwareTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder;
4040
import org.apache.ignite.testframework.GridTestUtils;
4141
import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
42+
import org.jetbrains.annotations.Nullable;
4243
import org.junit.After;
4344
import org.junit.Before;
4445
import org.junit.Test;
@@ -183,7 +184,10 @@ public void testPartitionsExchangeAware() throws Exception {
183184
}
184185

185186
/** {@inheritDoc} */
186-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
187+
@Override public void onDoneBeforeTopologyUnlock(
188+
GridDhtPartitionsExchangeFuture fut,
189+
@Nullable Throwable err
190+
) {
187191
try {
188192
onDoneBeforeUnlockReachedLatch.countDown();
189193
onDoneBeforeUnlockWaitLatch.await();

modules/core/src/test/java/org/apache/ignite/internal/processors/cache/distributed/dht/topology/DelayedOwningDuringExchangeTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.apache.ignite.testframework.GridTestUtils;
3838
import org.apache.ignite.testframework.junits.WithSystemProperty;
3939
import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
40+
import org.jetbrains.annotations.Nullable;
4041
import org.junit.Test;
4142

4243
/**
@@ -130,7 +131,10 @@ private void testDelayedRenting(int idx, int mode) throws Exception {
130131
wait(fut, 0);
131132
}
132133

133-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
134+
@Override public void onDoneBeforeTopologyUnlock(
135+
GridDhtPartitionsExchangeFuture fut,
136+
@Nullable Throwable err
137+
) {
134138
wait(fut, 1);
135139
}
136140

modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/IgnitePdsConsistencyOnDelayedPartitionOwning.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.apache.ignite.plugin.extensions.communication.Message;
5050
import org.apache.ignite.testframework.GridTestUtils;
5151
import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
52+
import org.jetbrains.annotations.Nullable;
5253
import org.junit.Test;
5354

5455
import static org.apache.ignite.internal.processors.cache.GridCacheUtils.cacheId;
@@ -203,7 +204,10 @@ public void checkConsistencyNodeLeft() throws Exception {
203204
});
204205

205206
grid(1).context().cache().context().exchange().registerExchangeAwareComponent(new PartitionsExchangeAware() {
206-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
207+
@Override public void onDoneBeforeTopologyUnlock(
208+
GridDhtPartitionsExchangeFuture fut,
209+
@Nullable Throwable err
210+
) {
207211
if (fut.initialVersion().equals(new AffinityTopologyVersion(7, 0))) {
208212
topInitLatch.countDown();
209213

modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/snapshot/IgniteClusterSnapshotSelfTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
import org.apache.ignite.spi.metric.LongMetric;
8686
import org.apache.ignite.testframework.GridTestUtils;
8787
import org.apache.ignite.transactions.Transaction;
88+
import org.jetbrains.annotations.Nullable;
8889
import org.junit.Before;
8990
import org.junit.Test;
9091

@@ -1134,7 +1135,10 @@ public void testSnapshotPartitionExchangeAwareOrder() throws Exception {
11341135
assertEquals("Exchange order violated: " + fut.firstEvent(), 1, order.getAndIncrement());
11351136
}
11361137

1137-
@Override public void onDoneBeforeTopologyUnlock(GridDhtPartitionsExchangeFuture fut) {
1138+
@Override public void onDoneBeforeTopologyUnlock(
1139+
GridDhtPartitionsExchangeFuture fut,
1140+
@Nullable Throwable err
1141+
) {
11381142
assertEquals("Exchange order violated: " + fut.firstEvent(), 2, order.getAndIncrement());
11391143
}
11401144

modules/core/src/test/java/org/apache/ignite/internal/processors/cache/persistence/snapshot/IgniteSnapshotRemoteRequestTest.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import org.apache.ignite.internal.util.lang.GridAbsPredicate;
5252
import org.apache.ignite.internal.util.typedef.F;
5353
import org.apache.ignite.internal.util.typedef.G;
54+
import org.apache.ignite.internal.util.typedef.X;
5455
import org.apache.ignite.internal.util.typedef.internal.CU;
5556
import org.apache.ignite.internal.util.typedef.internal.U;
5657
import org.apache.ignite.testframework.GridTestUtils;
@@ -268,8 +269,18 @@ public void testSnapshotRequestRemoteSourceNodeLeft() throws Exception {
268269

269270
latch.countDown();
270271

271-
assertThrowsAnyCause(log, () -> fut.get(TIMEOUT), ClusterTopologyCheckedException.class,
272-
"he node from which a snapshot has been requested left the grid");
272+
try {
273+
fut.get(TIMEOUT);
274+
}
275+
catch (Exception e) {
276+
boolean expErr = X.hasCause(e, "The node from which a snapshot has been requested left the grid",
277+
ClusterTopologyCheckedException.class)
278+
|| X.hasCause(e, "Request cancelled. The snapshot operation stopped on the remote node with an error: " +
279+
"The operation is cancelled due to the local node is stopping", IgniteCheckedException.class);
280+
281+
if (!expErr)
282+
fail(e.getMessage());
283+
}
273284
}
274285

275286
/** @throws Exception If fails. */

0 commit comments

Comments
 (0)