[server] Support parallel shutdown workflow for Venice Server (#2247)

sixpluszero · web-flow · commit c0f8b56debc2 · 2025-10-30T10:00:57.000-07:00
Speed up SIT shutdown by performing parallel unsubscribe and sync offset       

Today the StoreIngestionTask (SIT) unsubscribes and syncs the offset for each
partition sequentially. On hosts with a large number of partitions, this
process is slow and may take more than one minute. The SIT executor's graceful
shutdown timeout is one minute; after that, the executor is force-closed and
the task is interrupted, which cancels the graceful shutdown work for all
remaining partitions.

After some offline discussion, this PR improves the behavior by implementing   
parallel shutdown for all partitions, similar to Da Vinci today. This is still 
best effort, and we keep the timeout in place since we don't want slowness     
to block the entire server shutdown process. If certain steps are slow, we     
should continue investigating them.                                            

Introduce a new config
"server.parallel.resource.shutdown.enabled" to control
this behavior in the Server. By default, it is false (same as now). It will
be rolled out gradually everywhere, as there should not be any side effects.
diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java
@@ -158,6 +158,7 @@
 import static com.linkedin.venice.ConfigKeys.SERVER_OPTIMIZE_DATABASE_FOR_BACKUP_VERSION_NO_READ_THRESHOLD_SECONDS;
 import static com.linkedin.venice.ConfigKeys.SERVER_OPTIMIZE_DATABASE_SERVICE_SCHEDULE_INTERNAL_SECONDS;
 import static com.linkedin.venice.ConfigKeys.SERVER_PARALLEL_BATCH_GET_CHUNK_SIZE;
+import static com.linkedin.venice.ConfigKeys.SERVER_PARALLEL_RESOURCE_SHUTDOWN_ENABLED;
 import static com.linkedin.venice.ConfigKeys.SERVER_PARTITION_GRACEFUL_DROP_DELAY_IN_SECONDS;
 import static com.linkedin.venice.ConfigKeys.SERVER_PROMOTION_TO_LEADER_REPLICA_DELAY_SECONDS;
 import static com.linkedin.venice.ConfigKeys.SERVER_PUBSUB_CONSUMER_POLL_RETRY_BACKOFF_MS;
@@ -667,6 +668,8 @@ public class VeniceServerConfig extends VeniceClusterConfig {
   private final int inactiveTopicPartitionCheckerThresholdInSeconds;
   private final int serverIngestionInfoLogLineLimit;
 
+  private final boolean parallelResourceShutdownEnabled;
+
   public VeniceServerConfig(VeniceProperties serverProperties) throws ConfigurationException {
     this(serverProperties, Collections.emptyMap());
   }
@@ -1133,6 +1136,8 @@ public VeniceServerConfig(VeniceProperties serverProperties, Map<String, Map<Str
     this.useMetricsBasedPositionInLagComputation =
         serverProperties.getBoolean(SERVER_USE_METRICS_BASED_POSITION_IN_LAG_COMPUTATION, false);
     this.serverIngestionInfoLogLineLimit = serverProperties.getInt(SERVER_INGESTION_INFO_LOG_LINE_LIMIT, 20);
+    this.parallelResourceShutdownEnabled =
+        serverProperties.getBoolean(SERVER_PARALLEL_RESOURCE_SHUTDOWN_ENABLED, false);
   }
 
   List<Double> extractThrottleLimitFactorsFor(VeniceProperties serverProperties, String configKey) {
@@ -2043,4 +2048,8 @@ public boolean isUseMetricsBasedPositionInLagComputationEnabled() {
   public int getServerIngestionInfoLogLineLimit() {
     return this.serverIngestionInfoLogLineLimit;
   }
+
+  /**
+   * @return {@code true} when parallel resource shutdown was enabled via
+   *         {@code SERVER_PARALLEL_RESOURCE_SHUTDOWN_ENABLED}; the constructor defaults the flag to {@code false}.
+   */
+  public boolean isParallelResourceShutdownEnabled() {
+    return this.parallelResourceShutdownEnabled;
+  }
 }
diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java
@@ -1386,7 +1386,7 @@ public ByteBuffer getPartitionOffsetRecords(String topicName, int partition) {
    */
   public void syncTopicPartitionOffset(String topicName, int partition) {
     StoreIngestionTask storeIngestionTask = getStoreIngestionTask(topicName);
-    storeIngestionTask.updateOffsetMetadataAndSync(topicName, partition);
+    storeIngestionTask.updateOffsetMetadataAndSync(partition);
   }
 
   public final ReadOnlyStoreRepository getMetadataRepo() {
diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java
@@ -1713,12 +1713,13 @@ public void run() {
       List<CompletableFuture<Void>> shutdownFutures = new ArrayList<>(partitionConsumptionStateMap.size());
 
       /**
-       * Speed up DaVinci shutdown by closing partitions concurrently.
+       * Speed up shutdown by closing partitions concurrently. On the Server this is controlled by a server config;
+       * for the DaVinci client it is always enabled.
        */
-      ExecutorService shutdownExecutorForDvc =
-          isDaVinciClient ? Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2) : null;
+      boolean enableParallelShutdown = serverConfig.isParallelResourceShutdownEnabled() || isDaVinciClient;
+      ExecutorService shutdownExecutor =
+          enableParallelShutdown ? Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2) : null;
 
-      // If the ingestion task is stopped gracefully (server stops), persist processed offset to disk
       for (Map.Entry<Integer, PartitionConsumptionState> entry: partitionConsumptionStateMap.entrySet()) {
         /**
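-         * Now, there are two threads, which could potentially trigger {@link #syncOffset(String, PartitionConsumptionState)}:
+         * Now, there are two threads, which could potentially trigger {@link #syncOffset(PartitionConsumptionState)}: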
@@ -1732,36 +1733,11 @@ public void run() {
          * offset and checksum, since the checksum could change in another thread, but the corresponding offset change
          * hasn't been applied yet, when checkpointing happens in current thread.
          */
-
-        Runnable shutdownRunnable = () -> {
-          int partition = entry.getKey();
-          PartitionConsumptionState partitionConsumptionState = entry.getValue();
-          consumerUnSubscribeAllTopics(partitionConsumptionState);
-
-          if (ingestionCheckpointDuringGracefulShutdownEnabled) {
-            PubSubTopicPartition topicPartition = new PubSubTopicPartitionImpl(versionTopic, partition);
-            try {
-              CompletableFuture<Void> cmdFuture = storeBufferService.execSyncOffsetCommandAsync(topicPartition, this);
-              waitForSyncOffsetCmd(cmdFuture, topicPartition);
-              waitForAllMessageToBeProcessedFromTopicPartition(topicPartition, partitionConsumptionState);
-            } catch (InterruptedException e) {
-              throw new VeniceException(e);
-            }
-          }
-        };
-
-        if (shutdownExecutorForDvc != null) {
-          shutdownFutures.add(CompletableFuture.runAsync(shutdownRunnable, shutdownExecutorForDvc));
-        } else {
-          /**
-           * TODO: evaluate whether we need to apply concurrent shutdown in Venice Server or not.
-           */
-          shutdownRunnable.run();
-        }
+        executeShutdownRunnable(entry.getValue(), shutdownFutures, shutdownExecutor);
       }
-      if (isDaVinciClient) {
+      if (enableParallelShutdown) {
         /**
-         * DaVinci shutdown shouldn't take that long because of high concurrency, and it is fine to specify a high timeout here
+         * Shutdown shouldn't take that long because of high concurrency, and it is fine to specify a high timeout here
          * to avoid infinite wait in case there is some regression.
          */
         CompletableFuture.allOf(shutdownFutures.toArray(new CompletableFuture[0])).get(60, SECONDS);
@@ -1817,6 +1793,31 @@ public void run() {
     }
   }
 
+  /**
+   * Runs the shutdown steps for a single partition. The steps unsubscribe the consumer from all topics of the
+   * partition and, when checkpointing during graceful shutdown is enabled in the server config, submit a sync-offset
+   * command for the replica's topic partition and then call {@code waitForSyncOffsetCmd} and
+   * {@code waitForAllMessageToBeProcessedFromTopicPartition} for it.
+   *
+   * @param partitionConsumptionState state of the partition being shut down.
+   * @param shutdownFutures receives the future of the submitted work; only appended to when an executor is given.
+   * @param shutdownExecutor executor that runs the steps asynchronously; when {@code null} the steps run inline on
+   *                         the calling thread.
+   * @throws VeniceException (inline mode only) when the calling thread is interrupted while waiting. In asynchronous
+   *                         mode the exception completes the added future exceptionally instead.
+   */
+  void executeShutdownRunnable(
+      PartitionConsumptionState partitionConsumptionState,
+      List<CompletableFuture<Void>> shutdownFutures,
+      ExecutorService shutdownExecutor) {
+    Runnable shutdownRunnable = () -> {
+      consumerUnSubscribeAllTopics(partitionConsumptionState);
+      // If the ingestion task is stopped gracefully (server stops), persist processed offset to disk.
+      if (getServerConfig().isServerIngestionCheckpointDuringGracefulShutdownEnabled()) {
+        try {
+          PubSubTopicPartition topicPartition = partitionConsumptionState.getReplicaTopicPartition();
+          CompletableFuture<Void> cmdFuture = getStoreBufferService().execSyncOffsetCommandAsync(topicPartition, this);
+          waitForSyncOffsetCmd(cmdFuture, topicPartition);
+          waitForAllMessageToBeProcessedFromTopicPartition(topicPartition, partitionConsumptionState);
+        } catch (InterruptedException e) {
+          // Restore the interrupt status before wrapping, so code further up the stack can still observe that the
+          // thread was interrupted.
+          Thread.currentThread().interrupt();
+          throw new VeniceException(e);
+        }
+      }
+    };
+    if (shutdownExecutor == null) {
+      // No executor supplied: run synchronously on the caller's thread.
+      shutdownRunnable.run();
+      return;
+    }
+    shutdownFutures.add(CompletableFuture.runAsync(shutdownRunnable, shutdownExecutor));
+  }
+
   private void waitForSyncOffsetCmd(CompletableFuture<Void> cmdFuture, PubSubTopicPartition topicPartition)
       throws InterruptedException {
     try {
@@ -1858,7 +1859,7 @@ protected void updateOffsetMetadataAndSyncOffset(DataIntegrityValidator div, @No
     div.updateOffsetRecordForPartition(PartitionTracker.VERSION_TOPIC, pcs.getPartition(), pcs.getOffsetRecord());
     // update the offset metadata in the OffsetRecord.
     updateOffsetMetadataInOffsetRecord(pcs);
-    syncOffset(kafkaVersionTopic, pcs);
+    syncOffset(pcs);
   }
 
   /**
@@ -1868,7 +1869,7 @@ protected void updateAndSyncOffsetFromSnapshot(PartitionTracker vtDivSnapshot, P
     PartitionConsumptionState pcs = getPartitionConsumptionState(topicPartition.getPartitionNumber());
     vtDivSnapshot.updateOffsetRecord(PartitionTracker.VERSION_TOPIC, pcs.getOffsetRecord());
     updateOffsetMetadataInOffsetRecord(pcs);
-    syncOffset(kafkaVersionTopic, pcs);
+    syncOffset(pcs);
   }
 
   private void handleIngestionException(Exception e) {
@@ -2760,10 +2761,9 @@ boolean shouldSyncOffset(
   /**
    * This method flushes data partition on disk and syncs the underlying database with {@link OffsetRecord}.
    * Note that the updates for {@link OffsetRecord} is happened in {@link #updateOffsetMetadataInOffsetRecord}
-   * @param topic, the given version topic(VT) for the store.
    * @param pcs, the corresponding {@link PartitionConsumptionState} to sync with.
    */
-  private void syncOffset(String topic, PartitionConsumptionState pcs) {
+  private void syncOffset(PartitionConsumptionState pcs) {
     int partition = pcs.getPartition();
     if (this.storageEngine.isClosed()) {
       LOGGER.warn("Storage engine has been closed. Could not execute sync offset for replica: {}", pcs.getReplicaId());
@@ -3723,8 +3723,13 @@ public boolean consumerHasAnySubscription() {
 
   public boolean consumerHasSubscription(PubSubTopic topic, PartitionConsumptionState partitionConsumptionState) {
     int partitionId = partitionConsumptionState.getPartition();
-    return aggKafkaConsumerService
-        .hasConsumerAssignedFor(versionTopic, new PubSubTopicPartitionImpl(topic, partitionId));
+    PubSubTopicPartition pubSubTopicPartition;
+    if (topic.isVersionTopic()) {
+      pubSubTopicPartition = partitionConsumptionState.getReplicaTopicPartition();
+    } else {
+      pubSubTopicPartition = new PubSubTopicPartitionImpl(topic, partitionId);
+    }
+    return aggKafkaConsumerService.hasConsumerAssignedFor(versionTopic, pubSubTopicPartition);
   }
 
   /**
@@ -4555,10 +4560,10 @@ public VeniceServerConfig getServerConfig() {
     return serverConfig;
   }
 
-  public void updateOffsetMetadataAndSync(String topic, int partitionId) {
+  public void updateOffsetMetadataAndSync(int partitionId) {
     PartitionConsumptionState pcs = getPartitionConsumptionState(partitionId);
     updateOffsetMetadataInOffsetRecord(pcs);
-    syncOffset(topic, pcs);
+    syncOffset(pcs);
   }
 
   /**
@@ -5098,4 +5103,8 @@ protected static void validateEndOfPushReceivedBeforeTopicSwitch(
       throw new VeniceMessageException(errorMessage);
     }
   }
+
+  /**
+   * @return the store buffer service held by this ingestion task.
+   */
+  AbstractStoreBufferService getStoreBufferService() {
+    return this.storeBufferService;
+  }
 }
diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java
@@ -158,6 +158,7 @@
 import com.linkedin.venice.pubsub.PubSubContext;
 import com.linkedin.venice.pubsub.PubSubPositionDeserializer;
 import com.linkedin.venice.pubsub.PubSubPositionTypeRegistry;
+import com.linkedin.venice.pubsub.PubSubTopicImpl;
 import com.linkedin.venice.pubsub.PubSubTopicPartitionImpl;
 import com.linkedin.venice.pubsub.PubSubTopicRepository;
 import com.linkedin.venice.pubsub.api.DefaultPubSubMessage;
@@ -6267,6 +6268,60 @@ public void testResubscribeAsLeaderFromVersionTopic(boolean aaEnabled) throws In
 
   }
 
+  /**
+   * Exercises {@code executeShutdownRunnable} on a mocked task in three modes: asynchronous execution with
+   * checkpointing enabled, inline execution with a {@code null} executor, and asynchronous execution with
+   * checkpointing disabled.
+   */
+  @Test
+  public void testParallelShutdown() throws InterruptedException {
+    // Mocked task and partition state
+    StoreIngestionTask storeIngestionTask = mock(StoreIngestionTask.class);
+    PartitionConsumptionState pcs = mock(PartitionConsumptionState.class);
+    List<CompletableFuture<Void>> shutdownFutures = new ArrayList<>();
+
+    // Mock server config to enable checkpointing during shutdown
+    VeniceServerConfig serverConfig = mock(VeniceServerConfig.class);
+    when(serverConfig.isServerIngestionCheckpointDuringGracefulShutdownEnabled()).thenReturn(true);
+    when(storeIngestionTask.getServerConfig()).thenReturn(serverConfig);
+
+    // Mock store buffer service
+    StoreBufferService storeBufferService = mock(StoreBufferService.class);
+    when(storeIngestionTask.getStoreBufferService()).thenReturn(storeBufferService);
+    when(storeBufferService.execSyncOffsetCommandAsync(any(), any()))
+        .thenReturn(CompletableFuture.completedFuture(null));
+
+    doCallRealMethod().when(storeIngestionTask).executeShutdownRunnable(any(), anyList(), any());
+
+    // Topic partition reported by the mocked partition state
+    PubSubTopicPartition topicPartition = new PubSubTopicPartitionImpl(new PubSubTopicImpl("test_topic_v1"), 0);
+    when(pcs.getReplicaTopicPartition()).thenReturn(topicPartition);
+
+    ExecutorService shutdownExecutor = Executors.newSingleThreadExecutor();
+    try {
+      // Call the method under test
+      storeIngestionTask.executeShutdownRunnable(pcs, shutdownFutures, shutdownExecutor);
+
+      // Wait for async operation to complete
+      Assert.assertEquals(shutdownFutures.size(), 1);
+      shutdownFutures.forEach(CompletableFuture::join);
+
+      // Verify behavior
+      verify(storeIngestionTask).consumerUnSubscribeAllTopics(pcs);
+      verify(storeBufferService).execSyncOffsetCommandAsync(topicPartition, storeIngestionTask);
+      verify(storeIngestionTask).waitForAllMessageToBeProcessedFromTopicPartition(topicPartition, pcs);
+
+      // Test with null executor (synchronous execution)
+      shutdownFutures.clear();
+      storeIngestionTask.executeShutdownRunnable(pcs, shutdownFutures, null);
+      assertTrue(shutdownFutures.isEmpty(), "No futures should be added when executor is null");
+      verify(storeIngestionTask, times(2)).consumerUnSubscribeAllTopics(pcs);
+
+      // Test when checkpointing is disabled
+      when(serverConfig.isServerIngestionCheckpointDuringGracefulShutdownEnabled()).thenReturn(false);
+      storeIngestionTask.executeShutdownRunnable(pcs, shutdownFutures, shutdownExecutor);
+      Assert.assertEquals(shutdownFutures.size(), 1);
+      shutdownFutures.forEach(CompletableFuture::join);
+      verify(storeIngestionTask, times(3)).consumerUnSubscribeAllTopics(pcs);
+    } finally {
+      // Always release the executor thread, even when an assertion or verification above fails.
+      shutdownExecutor.shutdown();
+    }
+  }
+
   private VeniceStoreVersionConfig getDefaultMockVeniceStoreVersionConfig(
       Consumer<VeniceStoreVersionConfig> storeVersionConfigOverride) {
     // mock the store config
diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java b/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java
@@ -1048,6 +1048,11 @@ private ConfigKeys() {
   public static final String SERVER_ADAPTIVE_THROTTLER_READ_COMPUTE_GET_LATENCY_THRESHOLD =
       "server.adaptive.throttler.read.compute.latency.threshold";
 
+  /**
+   * Config to enable parallel resource shutdown operation to speed up overall ingestion task shutdown.
+   */
+  public static final String SERVER_PARALLEL_RESOURCE_SHUTDOWN_ENABLED = "server.parallel.resource.shutdown.enabled";
+
   /**
    * A list of fully-qualified class names of all stats classes that needs to be initialized in isolated ingestion process,
    * separated by comma. This config will help isolated ingestion process to register extra stats needed for monitoring,

Original file line number	Diff line number	Diff line change
`@@ -1386,7 +1386,7 @@ public ByteBuffer getPartitionOffsetRecords(String topicName, int partition) {`
`1386`	`1386`	`*/`
`1387`	`1387`	`public void syncTopicPartitionOffset(String topicName, int partition) {`
`1388`	`1388`	`StoreIngestionTask storeIngestionTask = getStoreIngestionTask(topicName);`
`1389`		`- storeIngestionTask.updateOffsetMetadataAndSync(topicName, partition);`
	`1389`	`+ storeIngestionTask.updateOffsetMetadataAndSync(partition);`
`1390`	`1390`	`}`
`1391`	`1391`
`1392`	`1392`	`public final ReadOnlyStoreRepository getMetadataRepo() {`