 import java.util.Map;
 import java.util.Queue;
 import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
@@ -74,7 +75,8 @@ public class SnapshotReadCache {
     private final LinkRecordDecoder linkRecordDecoder;
     private final Time time = Time.SYSTEM;

-    public SnapshotReadCache(StreamManager streamManager, LogCache cache, ObjectStorage objectStorage, LinkRecordDecoder linkRecordDecoder) {
+    public SnapshotReadCache(StreamManager streamManager, LogCache cache, ObjectStorage objectStorage,
+        LinkRecordDecoder linkRecordDecoder) {
         activeStreams = CacheBuilder.newBuilder()
             .expireAfterAccess(10, TimeUnit.MINUTES)
             .removalListener((RemovalListener<Long, Boolean>) notification ->
@@ -128,7 +130,8 @@ public synchronized CompletableFuture<Void> replay(List<S3ObjectMetadata> objects) {
         return objectReplay.replay(objects);
     }

-    public synchronized CompletableFuture<Void> replay(WriteAheadLog confirmWAL, RecordOffset startOffset, RecordOffset endOffset) {
+    public synchronized CompletableFuture<Void> replay(WriteAheadLog confirmWAL, RecordOffset startOffset,
+        RecordOffset endOffset) {
         long startNanos = time.nanoseconds();
         return walReplay.replay(confirmWAL, startOffset, endOffset)
             .whenComplete((nil, ex) -> REPLAY_LATENCY.record(time.nanoseconds() - startNanos));
@@ -153,32 +156,69 @@ private void activeStream(long streamId) {
     }

     class WalReplay {
+        private static final long TASK_WAITING_TIMEOUT_NANOS = TimeUnit.SECONDS.toNanos(5);
+        private static final int MAX_WAITING_LOAD_TASK_COUNT = 4096;
         // soft limit the inflight memory
-        private final Semaphore inflightLimiter = new Semaphore(Systems.CPU_CORES * 4);
-        private final Queue<WalReplayTask> waitingLoadTasks = new ConcurrentLinkedQueue<>();
+        private final int maxInflightLoadingCount = Systems.CPU_CORES * 4;
+        private final BlockingQueue<WalReplayTask> waitingLoadTasks = new ArrayBlockingQueue<>(MAX_WAITING_LOAD_TASK_COUNT);
         private final Queue<WalReplayTask> loadingTasks = new ConcurrentLinkedQueue<>();

         public CompletableFuture<Void> replay(WriteAheadLog wal, RecordOffset startOffset, RecordOffset endOffset) {
-            inflightLimiter.acquireUninterruptibly();
             WalReplayTask task = new WalReplayTask(wal, startOffset, endOffset);
-            waitingLoadTasks.add(task);
+            while (!waitingLoadTasks.offer(task)) {
+                // The replay won't be called on the SnapshotReadCache.eventLoop, so there won't be a deadlock.
+                eventLoop.submit(this::clearOverloadedTask).join();
+            }
             eventLoop.submit(this::tryLoad);
-            return task.replayCf.whenComplete((nil, ex) -> inflightLimiter.release());
+            return task.replayCf.whenCompleteAsync((nil, ex) -> tryLoad(), eventLoop);
         }

         @EventLoopSafe
         private void tryLoad() {
             for (; ; ) {
-                WalReplayTask task = waitingLoadTasks.poll();
+                if (loadingTasks.size() >= maxInflightLoadingCount) {
+                    break;
+                }
+                WalReplayTask task = waitingLoadTasks.peek();
                 if (task == null) {
                     break;
                 }
+                if (time.nanoseconds() - task.timestampNanos > TASK_WAITING_TIMEOUT_NANOS) {
+                    clearOverloadedTask();
+                    return;
+                }
+                waitingLoadTasks.poll();
                 loadingTasks.add(task);
                 task.run();
                 task.loadCf.whenCompleteAsync((rst, ex) -> tryPutIntoCache(), eventLoop);
             }
         }

+        /**
+         * Clears all waiting tasks when the replay system is overloaded.
+         * This is triggered when tasks wait longer than TASK_WAITING_TIMEOUT_NANOS or when waitingLoadTasks is full.
+         * All dropped tasks have their futures completed with null, and the affected
+         * nodes are notified to commit their WAL to free up resources.
+         */
+        @EventLoopSafe
+        private void clearOverloadedTask() {
+            // The WalReplay is overloaded, so we need to drain all tasks promptly.
+            Set<Integer> nodeIds = new HashSet<>();
+            int dropCount = 0;
+            for (; ; ) {
+                WalReplayTask task = waitingLoadTasks.poll();
+                if (task == null) {
+                    break;
+                }
+                nodeIds.add(task.wal.metadata().nodeId());
+                task.loadCf.complete(null);
+                task.replayCf.complete(null);
+                dropCount++;
+            }
+            nodeIds.forEach(cacheFreeListener::notifyListener);
+            LOGGER.warn("wal replay is overloaded, drop all {} waiting tasks and request nodes={} to commit", dropCount, nodeIds);
+        }
+
         @EventLoopSafe
         private void tryPutIntoCache() {
             for (; ; ) {
@@ -195,6 +235,7 @@ private void tryPutIntoCache() {
     }

     class WalReplayTask {
+        final long timestampNanos = time.nanoseconds();
         final WriteAheadLog wal;
         final RecordOffset startOffset;
         final RecordOffset endOffset;
@@ -389,9 +430,11 @@ public void onFree(List<LogCache.StreamRangeBound> bounds) {
                     requestCommitNodes.add(streamMetadata.nodeId());
                 }
             }
-            listeners.forEach(listener ->
-                requestCommitNodes.forEach(nodeId ->
-                    FutureUtil.suppress(() -> listener.onEvent(new RequestCommitEvent(nodeId)), LOGGER)));
+            requestCommitNodes.forEach(this::notifyListener);
+        }
+
+        public void notifyListener(int nodeId) {
+            listeners.forEach(listener -> FutureUtil.suppress(() -> listener.onEvent(new RequestCommitEvent(nodeId)), LOGGER));
         }

         public void addListener(EventListener listener) {