Skip to content

Commit d6ee740

Browse files
committed
Fix deadlock with CC view topic switch logic and adding more tests
1 parent 03d399a commit d6ee740

File tree

7 files changed

+442
-49
lines changed

7 files changed

+442
-49
lines changed

clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceAfterImageConsumerImpl.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ protected CompletableFuture<Void> internalFindNewVersionCheckpoints(
227227
newVersionTopic,
228228
generationId,
229229
partitions);
230-
while (!versionSwapConsumedPerPartitionMap.values().stream().allMatch(x -> x)) {
230+
while (!areAllTrue(versionSwapConsumedPerPartitionMap.values())) {
231231
polledResults = consumerAdapter.poll(5000L);
232232
for (Map.Entry<PubSubTopicPartition, List<DefaultPubSubMessage>> entry: polledResults.entrySet()) {
233233
PubSubTopicPartition pubSubTopicPartition = entry.getKey();
@@ -396,6 +396,15 @@ protected CompletableFuture<Void> internalSeekToEndOfPush(
396396
}, seekExecutorService);
397397
}
398398

399+
private boolean areAllTrue(Collection<Boolean> booleanCollections) {
400+
for (Boolean b: booleanCollections) {
401+
if (!b) {
402+
return false;
403+
}
404+
}
405+
return true;
406+
}
407+
399408
@Override
400409
public CompletableFuture<Void> seekToEndOfPush(Set<Integer> partitions) {
401410
if (partitions.isEmpty()) {

clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerImpl.java

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,9 @@ public class VeniceChangelogConsumerImpl<K, V> implements VeniceChangelogConsume
187187
protected final long versionSwapTimeoutInMs;
188188
protected final AtomicReference<CountDownLatch> onGoingVersionSwapSignal = new AtomicReference<>();
189189
/**
190-
* Interaction of this field should acquire the subscriptionLock.readLock()
190+
* Interaction of this field should acquire the subscriptionLock.writeLock()
191191
*/
192-
protected VersionSwapMessageState versionSwapMessageState = null;
192+
protected volatile VersionSwapMessageState versionSwapMessageState = null;
193193
protected Time time;
194194

195195
public VeniceChangelogConsumerImpl(
@@ -343,6 +343,11 @@ public VeniceChangelogConsumerImpl(
343343
LOGGER.info("Start a change log consumer client for store: {}", storeName);
344344
}
345345

346+
// Unit test only and read only
347+
VersionSwapMessageState getVersionSwapMessageState() {
348+
return this.versionSwapMessageState;
349+
}
350+
346351
@Override
347352
public int getPartitionCount() {
348353
Store store = getStore();
@@ -818,7 +823,7 @@ protected Collection<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> i
818823
String topicSuffix,
819824
boolean includeControlMessage) {
820825
Collection<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> pubSubMessages = new ArrayList<>();
821-
Map<PubSubTopicPartition, List<DefaultPubSubMessage>> messagesMap = Collections.EMPTY_MAP;
826+
Map<PubSubTopicPartition, List<DefaultPubSubMessage>> messagesMap;
822827
boolean lockAcquired = false;
823828

824829
try {
@@ -853,7 +858,6 @@ protected Collection<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> i
853858
versionSwapMessageState.getNewVersionTopic(),
854859
versionSwapMessageState.getVersionSwapGenerationId());
855860
changeCaptureStats.emitVersionSwapCountMetrics(SUCCESS);
856-
changeCaptureStats.setUndergoingVersionSwap(0);
857861
versionSwapMessageState = null;
858862
onGoingVersionSwapSignal.get().countDown();
859863
} else {
@@ -873,7 +877,6 @@ protected Collection<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> i
873877
versionSwapMessageState.getVersionSwapGenerationId(),
874878
versionSwapMessageState.getIncompletePartitions());
875879
changeCaptureStats.emitVersionSwapCountMetrics(SUCCESS);
876-
changeCaptureStats.setUndergoingVersionSwap(0);
877880
versionSwapMessageState = null;
878881
onGoingVersionSwapSignal.get().countDown();
879882
} else {
@@ -1109,8 +1112,6 @@ protected Optional<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> con
11091112
}
11101113
}
11111114
if (messageType.equals(MessageType.DELETE)) {
1112-
Delete delete = (Delete) message.getValue().payloadUnion;
1113-
11141115
// Deletes have a previous and current value of null. So just fill it in!
11151116
ChangeEvent<V> changeEvent = new ChangeEvent<>(null, null);
11161117
pubSubChangeEventMessage = Optional.of(
@@ -1464,7 +1465,6 @@ protected boolean handleVersionSwapMessageInVT(
14641465
versionSwapMessageState =
14651466
new VersionSwapMessageState(versionSwap, totalRegionCount, currentAssignment, time.getMilliseconds());
14661467
onGoingVersionSwapSignal.set(new CountDownLatch(1));
1467-
changeCaptureStats.setUndergoingVersionSwap(1);
14681468
LOGGER.info(
14691469
"New version detected for store: {} through version swap messages. Performing version swap from topic: {} to topic: {}, generation id: {}",
14701470
storeName,
@@ -1650,7 +1650,12 @@ protected boolean switchToNewTopic(PubSubTopic newTopic, String topicSuffix, Int
16501650
}
16511651
unsubscribe(partitions);
16521652
try {
1653-
internalSubscribe(partitions, mergedTopicName).get();
1653+
Set<VeniceChangeCoordinate> beginningOfNewTopic = new HashSet<>(partitions.size());
1654+
for (Integer p: partitions) {
1655+
beginningOfNewTopic
1656+
.add(new VeniceChangeCoordinate(mergedTopicName.getName(), PubSubSymbolicPosition.EARLIEST, p));
1657+
}
1658+
synchronousSeekToCheckpoint(beginningOfNewTopic);
16541659
} catch (Exception e) {
16551660
throw new VeniceException("Subscribe to new topic:" + mergedTopicName + " is not successful, error: " + e);
16561661
}

clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VersionSwapMessageState.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ public synchronized Set<VeniceChangeCoordinate> getNewTopicCheckpointsWithEOPAsB
138138
return checkpoints;
139139
}
140140

141-
public Set<Integer> getAssignedPartitions() {
141+
public synchronized Set<Integer> getAssignedPartitions() {
142142
return Collections.unmodifiableSet(assignedPartitions);
143143
}
144144

clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/stats/BasicConsumerStats.java

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository;
1414
import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions;
1515
import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory;
16-
import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateBase;
1716
import com.linkedin.venice.stats.metrics.MetricEntity;
1817
import com.linkedin.venice.stats.metrics.MetricEntityStateBase;
1918
import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum;
@@ -23,7 +22,6 @@
2322
import com.linkedin.venice.stats.metrics.TehutiMetricNameEnum;
2423
import io.opentelemetry.api.common.Attributes;
2524
import io.tehuti.metrics.MetricsRepository;
26-
import io.tehuti.metrics.stats.AsyncGauge;
2725
import io.tehuti.metrics.stats.Avg;
2826
import io.tehuti.metrics.stats.Gauge;
2927
import io.tehuti.metrics.stats.Max;
@@ -34,7 +32,6 @@
3432
import java.util.Collections;
3533
import java.util.Map;
3634
import java.util.Set;
37-
import java.util.concurrent.atomic.AtomicInteger;
3835

3936

4037
/**
@@ -56,8 +53,6 @@ public class BasicConsumerStats extends AbstractVeniceStats {
5653
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> versionSwapFailCountMetric;
5754
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> chunkedRecordSuccessCountMetric;
5855
private final MetricEntityStateOneEnum<VeniceResponseStatusCategory> chunkedRecordFailCountMetric;
59-
private final AsyncMetricEntityStateBase undergoingVersionSwapMetric;
60-
private final AtomicInteger undergoingVersionSwap = new AtomicInteger(0);
6156

6257
public BasicConsumerStats(MetricsRepository metricsRepository, String consumerName, String storeName) {
6358
super(metricsRepository, consumerName);
@@ -162,19 +157,6 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa
162157
baseDimensionsMap,
163158
VeniceResponseStatusCategory.class);
164159

165-
undergoingVersionSwapMetric = AsyncMetricEntityStateBase.create(
166-
BasicConsumerMetricEntity.UNDERGOING_VERSION_SWAP.getMetricEntity(),
167-
otelRepository,
168-
this::registerSensor,
169-
BasicConsumerTehutiMetricName.UNDERGOING_VERSION_SWAP,
170-
Collections.singletonList(
171-
new AsyncGauge(
172-
(ignored, ignored2) -> this.undergoingVersionSwap.get(),
173-
BasicConsumerTehutiMetricName.UNDERGOING_VERSION_SWAP.getMetricName())),
174-
baseDimensionsMap,
175-
baseAttributes,
176-
this.undergoingVersionSwap::get);
177-
178160
/*
179161
* Record default value for version swap metrics so the UP_DOWN_COUNTER in OTEL will emit a default 0.
180162
* If you don't do this, the OTEL metric will return no data upon query time until a value is recorded.
@@ -223,10 +205,6 @@ public void emitChunkedRecordCountMetrics(VeniceResponseStatusCategory responseS
223205
}
224206
}
225207

226-
public void setUndergoingVersionSwap(int value) {
227-
undergoingVersionSwap.set(value);
228-
}
229-
230208
@VisibleForTesting
231209
public Attributes getBaseAttributes() {
232210
return baseAttributes;
@@ -238,7 +216,7 @@ public Attributes getBaseAttributes() {
238216
public enum BasicConsumerTehutiMetricName implements TehutiMetricNameEnum {
239217
MAX_PARTITION_LAG, RECORDS_CONSUMED, MINIMUM_CONSUMING_VERSION, MAXIMUM_CONSUMING_VERSION, POLL_SUCCESS_COUNT,
240218
POLL_FAIL_COUNT, VERSION_SWAP_SUCCESS_COUNT, VERSION_SWAP_FAIL_COUNT, CHUNKED_RECORD_SUCCESS_COUNT,
241-
CHUNKED_RECORD_FAIL_COUNT, UNDERGOING_VERSION_SWAP;
219+
CHUNKED_RECORD_FAIL_COUNT;
242220

243221
private final String metricName;
244222

@@ -293,13 +271,6 @@ public enum BasicConsumerMetricEntity implements ModuleMetricEntityInterface {
293271
CHUNKED_RECORD_COUNT(
294272
MetricType.COUNTER, MetricUnit.NUMBER, "Measures the count of chunked records consumed",
295273
setOf(VENICE_STORE_NAME, VENICE_RESPONSE_STATUS_CODE_CATEGORY)
296-
),
297-
/**
298-
* Indicate if the changelog consumer is undergoing version swap triggered by version swap messages
299-
*/
300-
UNDERGOING_VERSION_SWAP(
301-
MetricType.ASYNC_GAUGE, MetricUnit.NUMBER, "Indicate if the changelog consumer is undergoing version swap",
302-
setOf(VENICE_STORE_NAME)
303274
);
304275

305276
private final MetricEntity metricEntity;

0 commit comments

Comments
 (0)