Skip to content

Commit 04f6255

Browse files
authored
HDDS-12127. RM should not expire pending deletes, but retry until delete is confirmed or node is dead (#7746)
1 parent f1b59f1 commit 04f6255

File tree

11 files changed

+230
-110
lines changed

11 files changed

+230
-110
lines changed

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerReplicaOp.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.hadoop.hdds.scm.container.replication;
1919

2020
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
21+
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
2122

2223
/**
2324
* Class to wrap details used to track pending replications.
@@ -34,19 +35,20 @@ public enum PendingOpType {
3435
private final PendingOpType opType;
3536
private final DatanodeDetails target;
3637
private final int replicaIndex;
38+
private final SCMCommand<?> command;
3739
private final long deadlineEpochMillis;
3840

3941
public static ContainerReplicaOp create(PendingOpType opType,
4042
DatanodeDetails target, int replicaIndex) {
41-
return new ContainerReplicaOp(opType, target, replicaIndex,
42-
System.currentTimeMillis());
43+
return new ContainerReplicaOp(opType, target, replicaIndex, null, System.currentTimeMillis());
4344
}
4445

4546
public ContainerReplicaOp(PendingOpType opType,
46-
DatanodeDetails target, int replicaIndex, long deadlineEpochMillis) {
47+
DatanodeDetails target, int replicaIndex, SCMCommand<?> command, long deadlineEpochMillis) {
4748
this.opType = opType;
4849
this.target = target;
4950
this.replicaIndex = replicaIndex;
51+
this.command = command;
5052
this.deadlineEpochMillis = deadlineEpochMillis;
5153
}
5254

@@ -62,6 +64,10 @@ public int getReplicaIndex() {
6264
return replicaIndex;
6365
}
6466

67+
public SCMCommand<?> getCommand() {
68+
return command;
69+
}
70+
6571
public long getDeadlineEpochMillis() {
6672
return deadlineEpochMillis;
6773
}

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerReplicaPendingOps.java

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.apache.hadoop.hdds.client.ReplicationType;
2222
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
2323
import org.apache.hadoop.hdds.scm.container.ContainerID;
24+
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
2425

2526
import java.time.Clock;
2627
import java.util.ArrayList;
@@ -117,27 +118,29 @@ public List<ContainerReplicaOp> getPendingOps(ContainerID containerID) {
117118
* @param containerID ContainerID for which to add a replica
118119
* @param target The target datanode
119120
* @param replicaIndex The replica index (zero for Ratis, &gt; 0 for EC)
121+
* @param command The command to send to the datanode
120122
* @param deadlineEpochMillis The time by which the replica should have been
121123
* added and reported by the datanode, or it will
122124
* be discarded.
123125
*/
124126
public void scheduleAddReplica(ContainerID containerID,
125-
DatanodeDetails target, int replicaIndex, long deadlineEpochMillis) {
126-
addReplica(ADD, containerID, target, replicaIndex, deadlineEpochMillis);
127+
DatanodeDetails target, int replicaIndex, SCMCommand<?> command, long deadlineEpochMillis) {
128+
addReplica(ADD, containerID, target, replicaIndex, command, deadlineEpochMillis);
127129
}
128130

129131
/**
130132
* Store a ContainerReplicaOp to delete a replica for the given ContainerID.
131133
* @param containerID ContainerID for which to delete a replica
132134
* @param target The target datanode
133135
* @param replicaIndex The replica index (zero for Ratis, &gt; 0 for EC)
136+
* @param command The command to send to the datanode
134137
* @param deadlineEpochMillis The time by which the replica should have been
135138
* deleted and reported by the datanode, or it will
136139
* be discarded.
137140
*/
138141
public void scheduleDeleteReplica(ContainerID containerID,
139-
DatanodeDetails target, int replicaIndex, long deadlineEpochMillis) {
140-
addReplica(DELETE, containerID, target, replicaIndex, deadlineEpochMillis);
142+
DatanodeDetails target, int replicaIndex, SCMCommand<?> command, long deadlineEpochMillis) {
143+
addReplica(DELETE, containerID, target, replicaIndex, command, deadlineEpochMillis);
141144
}
142145

143146
/**
@@ -150,7 +153,7 @@ public void scheduleDeleteReplica(ContainerID containerID,
150153
*/
151154
public boolean completeAddReplica(ContainerID containerID,
152155
DatanodeDetails target, int replicaIndex) {
153-
boolean completed = completeOp(ADD, containerID, target, replicaIndex);
156+
boolean completed = completeOp(ADD, containerID, target, replicaIndex, true);
154157
if (isMetricsNotNull() && completed) {
155158
if (isEC(replicaIndex)) {
156159
replicationMetrics.incrEcReplicasCreatedTotal();
@@ -172,7 +175,7 @@ public boolean completeAddReplica(ContainerID containerID,
172175
*/
173176
public boolean completeDeleteReplica(ContainerID containerID,
174177
DatanodeDetails target, int replicaIndex) {
175-
boolean completed = completeOp(DELETE, containerID, target, replicaIndex);
178+
boolean completed = completeOp(DELETE, containerID, target, replicaIndex, true);
176179
if (isMetricsNotNull() && completed) {
177180
if (isEC(replicaIndex)) {
178181
replicationMetrics.incrEcReplicasDeletedTotal();
@@ -192,7 +195,7 @@ public boolean completeDeleteReplica(ContainerID containerID,
192195
public boolean removeOp(ContainerID containerID,
193196
ContainerReplicaOp op) {
194197
return completeOp(op.getOpType(), containerID, op.getTarget(),
195-
op.getReplicaIndex());
198+
op.getReplicaIndex(), true);
196199
}
197200

198201
/**
@@ -221,9 +224,13 @@ public void removeExpiredEntries() {
221224
while (iterator.hasNext()) {
222225
ContainerReplicaOp op = iterator.next();
223226
if (clock.millis() > op.getDeadlineEpochMillis()) {
224-
iterator.remove();
227+
if (op.getOpType() != DELETE) {
228+
// For delete ops, we don't remove them from the list as RM must resend them, or they
229+
// will be removed via a container report when they are confirmed as deleted.
230+
iterator.remove();
231+
decrementCounter(op.getOpType(), op.getReplicaIndex());
232+
}
225233
expiredOps.add(op);
226-
decrementCounter(op.getOpType(), op.getReplicaIndex());
227234
updateTimeoutMetrics(op);
228235
}
229236
}
@@ -258,23 +265,26 @@ private void updateTimeoutMetrics(ContainerReplicaOp op) {
258265
}
259266

260267
private void addReplica(ContainerReplicaOp.PendingOpType opType,
261-
ContainerID containerID, DatanodeDetails target, int replicaIndex,
268+
ContainerID containerID, DatanodeDetails target, int replicaIndex, SCMCommand<?> command,
262269
long deadlineEpochMillis) {
263270
Lock lock = writeLock(containerID);
264271
lock(lock);
265272
try {
273+
// Remove any existing duplicate op for the same target and replicaIndex before adding
274+
// the new one. Especially for delete ops, they could be getting resent after expiry.
275+
completeOp(opType, containerID, target, replicaIndex, false);
266276
List<ContainerReplicaOp> ops = pendingOps.computeIfAbsent(
267277
containerID, s -> new ArrayList<>());
268278
ops.add(new ContainerReplicaOp(opType,
269-
target, replicaIndex, deadlineEpochMillis));
279+
target, replicaIndex, command, deadlineEpochMillis));
270280
incrementCounter(opType, replicaIndex);
271281
} finally {
272282
unlock(lock);
273283
}
274284
}
275285

276286
private boolean completeOp(ContainerReplicaOp.PendingOpType opType,
277-
ContainerID containerID, DatanodeDetails target, int replicaIndex) {
287+
ContainerID containerID, DatanodeDetails target, int replicaIndex, boolean notifySubsribers) {
278288
boolean found = false;
279289
// List of completed ops that subscribers will be notified about
280290
List<ContainerReplicaOp> completedOps = new ArrayList<>();
@@ -303,7 +313,7 @@ private boolean completeOp(ContainerReplicaOp.PendingOpType opType,
303313
unlock(lock);
304314
}
305315

306-
if (found) {
316+
if (found && notifySubsribers) {
307317
notifySubscribers(completedOps, containerID, false);
308318
}
309319
return found;

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ECUnderReplicationHandler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ private void createReplicateCommand(
621621
private void adjustPendingOps(ECContainerReplicaCount replicaCount,
622622
DatanodeDetails target, int replicaIndex) {
623623
replicaCount.addPendingOp(new ContainerReplicaOp(
624-
ContainerReplicaOp.PendingOpType.ADD, target, replicaIndex,
624+
ContainerReplicaOp.PendingOpType.ADD, target, replicaIndex, null,
625625
Long.MAX_VALUE));
626626
}
627627

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
* that the containers are properly replicated. Replication Manager deals only
104104
* with Quasi Closed / Closed container.
105105
*/
106-
public class ReplicationManager implements SCMService {
106+
public class ReplicationManager implements SCMService, ContainerReplicaPendingOpsSubscriber {
107107

108108
public static final Logger LOG =
109109
LoggerFactory.getLogger(ReplicationManager.class);
@@ -673,8 +673,7 @@ private void adjustPendingOpsAndMetrics(ContainerInfo containerInfo,
673673
if (cmd.getType() == Type.deleteContainerCommand) {
674674
DeleteContainerCommand rcc = (DeleteContainerCommand) cmd;
675675
containerReplicaPendingOps.scheduleDeleteReplica(
676-
containerInfo.containerID(), targetDatanode, rcc.getReplicaIndex(),
677-
scmDeadlineEpochMs);
676+
containerInfo.containerID(), targetDatanode, rcc.getReplicaIndex(), cmd, scmDeadlineEpochMs);
678677
if (rcc.getReplicaIndex() > 0) {
679678
getMetrics().incrEcDeletionCmdsSentTotal();
680679
} else if (rcc.getReplicaIndex() == 0) {
@@ -687,8 +686,7 @@ private void adjustPendingOpsAndMetrics(ContainerInfo containerInfo,
687686
final ByteString targetIndexes = rcc.getMissingContainerIndexes();
688687
for (int i = 0; i < targetIndexes.size(); i++) {
689688
containerReplicaPendingOps.scheduleAddReplica(
690-
containerInfo.containerID(), targets.get(i), targetIndexes.byteAt(i),
691-
scmDeadlineEpochMs);
689+
containerInfo.containerID(), targets.get(i), targetIndexes.byteAt(i), cmd, scmDeadlineEpochMs);
692690
}
693691
getMetrics().incrEcReconstructionCmdsSentTotal();
694692
} else if (cmd.getType() == Type.replicateContainerCommand) {
@@ -702,15 +700,15 @@ private void adjustPendingOpsAndMetrics(ContainerInfo containerInfo,
702700
*/
703701
containerReplicaPendingOps.scheduleAddReplica(
704702
containerInfo.containerID(),
705-
targetDatanode, rcc.getReplicaIndex(), scmDeadlineEpochMs);
703+
targetDatanode, rcc.getReplicaIndex(), cmd, scmDeadlineEpochMs);
706704
} else {
707705
/*
708706
This means the source will push replica to the target, so the op's
709707
target Datanode should be the Datanode the replica will be pushed to.
710708
*/
711709
containerReplicaPendingOps.scheduleAddReplica(
712710
containerInfo.containerID(),
713-
rcc.getTargetDatanode(), rcc.getReplicaIndex(), scmDeadlineEpochMs);
711+
rcc.getTargetDatanode(), rcc.getReplicaIndex(), cmd, scmDeadlineEpochMs);
714712
}
715713

716714
if (rcc.getReplicaIndex() > 0) {
@@ -1043,6 +1041,27 @@ ReplicationQueue getQueue() {
10431041
return replicationQueue.get();
10441042
}
10451043

1044+
@Override
1045+
public void opCompleted(ContainerReplicaOp op, ContainerID containerID, boolean timedOut) {
1046+
if (!(timedOut && op.getOpType() == ContainerReplicaOp.PendingOpType.DELETE)) {
1047+
// We only care about expired delete ops. All others should be ignored.
1048+
return;
1049+
}
1050+
try {
1051+
ContainerInfo containerInfo = containerManager.getContainer(containerID);
1052+
// Sending the command in this way is un-throttled, and the command will have its deadline
1053+
// adjusted to a new deadline as part of the sending process.
1054+
sendDatanodeCommand(op.getCommand(), containerInfo, op.getTarget());
1055+
} catch (ContainerNotFoundException e) {
1056+
// Should not happen, as even deleted containers are currently retained in the SCM container map
1057+
LOG.error("Container {} not found when processing expired delete", containerID, e);
1058+
} catch (NotLeaderException e) {
1059+
// If SCM leadership has changed, this is fine to ignore. All pending ops will be expired
1060+
// once SCM leadership switches.
1061+
LOG.warn("SCM is not leader when processing expired delete", e);
1062+
}
1063+
}
1064+
10461065
/**
10471066
* Configuration used by the Replication Manager.
10481067
*/

hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,9 @@ private void initializeSystemManagers(OzoneConfiguration conf,
836836
reconfigurationHandler.register(replicationManager.getConfig());
837837
}
838838
serviceManager.register(replicationManager);
839+
// RM gets notified of expired pending delete from containerReplicaPendingOps by subscribing to it
840+
// so it can resend them.
841+
containerReplicaPendingOps.registerSubscriber(replicationManager);
839842
if (configurator.getScmSafeModeManager() != null) {
840843
scmSafeModeManager = configurator.getScmSafeModeManager();
841844
} else {

hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestMoveManager.java

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -199,13 +199,13 @@ public void testMovePendingOpsExist() throws Exception {
199199
nodes.put(src, NodeStatus.inServiceHealthy());
200200
nodes.put(tgt, NodeStatus.inServiceHealthy());
201201

202-
pendingOps.add(new ContainerReplicaOp(ADD, tgt, 0, clock.millis()));
202+
pendingOps.add(new ContainerReplicaOp(ADD, tgt, 0, null, clock.millis()));
203203

204204
assertMoveFailsWith(REPLICATION_FAIL_INFLIGHT_REPLICATION,
205205
containerInfo.containerID());
206206

207207
pendingOps.clear();
208-
pendingOps.add(new ContainerReplicaOp(DELETE, src, 0, clock.millis()));
208+
pendingOps.add(new ContainerReplicaOp(DELETE, src, 0, null, clock.millis()));
209209
assertMoveFailsWith(REPLICATION_FAIL_INFLIGHT_DELETION,
210210
containerInfo.containerID());
211211
}
@@ -325,7 +325,7 @@ public void testDeleteCommandFails() throws Exception {
325325
.when(containerManager).getContainer(any(ContainerID.class));
326326

327327
ContainerReplicaOp op = new ContainerReplicaOp(
328-
ADD, tgt, 0, clock.millis() + 1000);
328+
ADD, tgt, 0, null, clock.millis() + 1000);
329329
moveManager.opCompleted(op, containerInfo.containerID(), false);
330330

331331
MoveManager.MoveResult moveResult = res.get();
@@ -337,14 +337,14 @@ public void testSuccessfulMove() throws Exception {
337337
CompletableFuture<MoveManager.MoveResult> res = setupSuccessfulMove();
338338

339339
ContainerReplicaOp op = new ContainerReplicaOp(
340-
ADD, tgt, 0, clock.millis() + 1000);
340+
ADD, tgt, 0, null, clock.millis() + 1000);
341341
moveManager.opCompleted(op, containerInfo.containerID(), false);
342342

343343
verify(replicationManager).sendDeleteCommand(
344344
eq(containerInfo), eq(0), eq(src), eq(true), anyLong());
345345

346346
op = new ContainerReplicaOp(
347-
DELETE, src, 0, clock.millis() + 1000);
347+
DELETE, src, 0, null, clock.millis() + 1000);
348348
moveManager.opCompleted(op, containerInfo.containerID(), false);
349349

350350
MoveManager.MoveResult finalResult = res.get();
@@ -374,15 +374,15 @@ public void testSuccessfulMoveNonZeroRepIndex() throws Exception {
374374
anyLong());
375375

376376
ContainerReplicaOp op = new ContainerReplicaOp(
377-
ADD, tgt, srcReplica.getReplicaIndex(), clock.millis() + 1000);
377+
ADD, tgt, srcReplica.getReplicaIndex(), null, clock.millis() + 1000);
378378
moveManager.opCompleted(op, containerInfo.containerID(), false);
379379

380380
verify(replicationManager).sendDeleteCommand(
381381
eq(containerInfo), eq(srcReplica.getReplicaIndex()), eq(src),
382382
eq(true), anyLong());
383383

384384
op = new ContainerReplicaOp(
385-
DELETE, src, srcReplica.getReplicaIndex(), clock.millis() + 1000);
385+
DELETE, src, srcReplica.getReplicaIndex(), null, clock.millis() + 1000);
386386
moveManager.opCompleted(op, containerInfo.containerID(), false);
387387

388388
MoveManager.MoveResult finalResult = res.get();
@@ -394,7 +394,7 @@ public void testMoveTimeoutOnAdd() throws Exception {
394394
CompletableFuture<MoveManager.MoveResult> res = setupSuccessfulMove();
395395

396396
ContainerReplicaOp op = new ContainerReplicaOp(
397-
ADD, tgt, 0, clock.millis() + 1000);
397+
ADD, tgt, 0, null, clock.millis() + 1000);
398398
moveManager.opCompleted(op, containerInfo.containerID(), true);
399399

400400
MoveManager.MoveResult finalResult = res.get();
@@ -406,14 +406,14 @@ public void testMoveTimeoutOnDelete() throws Exception {
406406
CompletableFuture<MoveManager.MoveResult> res = setupSuccessfulMove();
407407

408408
ContainerReplicaOp op = new ContainerReplicaOp(
409-
ADD, tgt, 0, clock.millis() + 1000);
409+
ADD, tgt, 0, null, clock.millis() + 1000);
410410
moveManager.opCompleted(op, containerInfo.containerID(), false);
411411

412412
verify(replicationManager).sendDeleteCommand(
413413
eq(containerInfo), eq(0), eq(src), eq(true), anyLong());
414414

415415
op = new ContainerReplicaOp(
416-
DELETE, src, 0, clock.millis() + 1000);
416+
DELETE, src, 0, null, clock.millis() + 1000);
417417
moveManager.opCompleted(op, containerInfo.containerID(), true);
418418

419419
MoveManager.MoveResult finalResult = res.get();
@@ -434,7 +434,7 @@ public void testMoveCompleteSrcNoLongerPresent() throws Exception {
434434
}
435435
}
436436
ContainerReplicaOp op = new ContainerReplicaOp(
437-
ADD, tgt, 0, clock.millis() + 1000);
437+
ADD, tgt, 0, null, clock.millis() + 1000);
438438
moveManager.opCompleted(op, containerInfo.containerID(), false);
439439

440440
MoveManager.MoveResult finalResult = res.get();
@@ -450,7 +450,7 @@ public void testMoveCompleteSrcNotHealthy() throws Exception {
450450

451451
nodes.put(src, NodeStatus.inServiceStale());
452452
ContainerReplicaOp op = new ContainerReplicaOp(
453-
ADD, tgt, 0, clock.millis() + 1000);
453+
ADD, tgt, 0, null, clock.millis() + 1000);
454454
moveManager.opCompleted(op, containerInfo.containerID(), false);
455455

456456
MoveManager.MoveResult finalResult = res.get();
@@ -468,7 +468,7 @@ public void testMoveCompleteSrcNotInService() throws Exception {
468468
HddsProtos.NodeOperationalState.DECOMMISSIONING,
469469
HddsProtos.NodeState.HEALTHY));
470470
ContainerReplicaOp op = new ContainerReplicaOp(
471-
ADD, tgt, 0, clock.millis() + 1000);
471+
ADD, tgt, 0, null, clock.millis() + 1000);
472472
moveManager.opCompleted(op, containerInfo.containerID(), false);
473473

474474
MoveManager.MoveResult finalResult = res.get();
@@ -487,7 +487,7 @@ public void testMoveCompleteFutureReplicasUnhealthy() throws Exception {
487487
.MisReplicatedHealthResult(containerInfo, false, null));
488488

489489
ContainerReplicaOp op = new ContainerReplicaOp(
490-
ADD, tgt, 0, clock.millis() + 1000);
490+
ADD, tgt, 0, null, clock.millis() + 1000);
491491
moveManager.opCompleted(op, containerInfo.containerID(), false);
492492

493493
MoveManager.MoveResult finalResult = res.get();

0 commit comments

Comments
 (0)