Skip to content

Commit 3d50967

Browse files
authored
Merge pull request #3067 from krishan1390/configurable_retry_timeout_system_property
Make retry timeout configurable for ZK calls via system property
2 parents 02a170c + 46e730e commit 3d50967

File tree

7 files changed

+68
-11
lines changed

7 files changed

+68
-11
lines changed

helix-core/src/test/java/org/apache/helix/manager/zk/client/TestHelixZkClient.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
import org.apache.helix.TestHelper;
2525
import org.apache.helix.ZkUnitTestBase;
2626
import org.apache.helix.zookeeper.api.client.HelixZkClient;
27+
import org.apache.helix.zookeeper.constant.ZkSystemPropertyKeys;
2728
import org.apache.helix.zookeeper.exception.ZkClientException;
29+
import org.apache.helix.zookeeper.impl.client.ZkClient;
2830
import org.apache.helix.zookeeper.zkclient.IZkDataListener;
2931
import org.apache.helix.zookeeper.zkclient.ZkConnection;
3032
import org.testng.Assert;
@@ -197,4 +199,22 @@ public void handleDataDeleted(String s) {
197199

198200
deleteCluster("testSharingZkClient");
199201
}
202+
203+
@Test
204+
public void testZKClientConfig() {
205+
System.setProperty(ZkSystemPropertyKeys.ZK_OPERATION_RETRY_TIMEOUT_MS, "5000");
206+
207+
HelixZkClient.ZkConnectionConfig connectionConfig =
208+
new HelixZkClient.ZkConnectionConfig(ZK_ADDR);
209+
HelixZkClient.ZkClientConfig clientConfig = new HelixZkClient.ZkClientConfig();
210+
211+
// A factory just for this tests, this for avoiding the impact from other tests running in
212+
// parallel.
213+
final SharedZkClientFactory testFactory = new SharedZkClientFactory();
214+
ZkClient zkClient =
215+
(ZkClient) testFactory.buildZkClient(connectionConfig, clientConfig);
216+
Assert.assertEquals(zkClient.getOperationRetryTimeout(), 5000);
217+
218+
zkClient.close();
219+
}
200220
}

zookeeper-api/src/main/java/org/apache/helix/zookeeper/api/client/RealmAwareZkClient.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.helix.msdcommon.exception.InvalidRoutingDataException;
2828
import org.apache.helix.zookeeper.constant.RoutingDataReaderType;
2929
import org.apache.helix.zookeeper.routing.RoutingDataManager;
30+
import org.apache.helix.zookeeper.util.ZNRecordUtil;
3031
import org.apache.helix.zookeeper.zkclient.DataUpdater;
3132
import org.apache.helix.zookeeper.zkclient.IZkChildListener;
3233
import org.apache.helix.zookeeper.zkclient.IZkDataListener;
@@ -63,9 +64,6 @@ enum RealmMode {
6364
SINGLE_REALM, MULTI_REALM
6465
}
6566

66-
// Setting default operation retry timeout to 24 hours. It can also be overwritten via RealmAwareZkClientConfig.
67-
// This timeout will be used while retrying zookeeper operations.
68-
int DEFAULT_OPERATION_TIMEOUT = 24 * 60 * 60 * 1000;
6967
int DEFAULT_CONNECTION_TIMEOUT = 60 * 1000;
7068
int DEFAULT_SESSION_TIMEOUT = 30 * 1000;
7169

@@ -558,7 +556,7 @@ class RealmAwareZkClientConfig {
558556
protected long _connectInitTimeout = DEFAULT_CONNECTION_TIMEOUT;
559557

560558
// Data access configs
561-
protected long _operationRetryTimeout = DEFAULT_OPERATION_TIMEOUT;
559+
protected long _operationRetryTimeout = ZNRecordUtil.getDefaultOperationRetryTimeout();
562560

563561
// Serializer
564562
protected PathBasedZkSerializer _zkSerializer;

zookeeper-api/src/main/java/org/apache/helix/zookeeper/constant/ZkSystemPropertyKeys.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,15 @@ public class ZkSystemPropertyKeys {
8989
// TODO: deprecate this config after paginated API is deployed and stable
9090
public static final String ZK_GETCHILDREN_PAGINATION_DISABLED =
9191
"zk.getChildren.pagination.disabled";
92+
93+
/**
94+
* This property defines the default operation retry timeout in milliseconds for ZkClient.
95+
* Most ZkClient operations are retried in cases like connection loss with the ZooKeeper servers.
96+
* During such failures, this timeout decides the total amount of time to spend for retries before giving up.
97+
* A value lesser than or equal to 0 is ignored and the default value will be used.
98+
* <p>
99+
* The default value is 86400000 (24 hours) if not configured.
100+
*/
101+
public static final String ZK_OPERATION_RETRY_TIMEOUT_MS =
102+
"zk.operation.retry.timeout.ms";
92103
}

zookeeper-api/src/main/java/org/apache/helix/zookeeper/impl/client/ZkClient.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.helix.zookeeper.api.client.HelixZkClient;
2323
import org.apache.helix.zookeeper.exception.ZkClientException;
24+
import org.apache.helix.zookeeper.util.ZNRecordUtil;
2425
import org.apache.helix.zookeeper.zkclient.IZkConnection;
2526
import org.apache.helix.zookeeper.zkclient.ZkConnection;
2627
import org.apache.helix.zookeeper.zkclient.serialize.BasicZkSerializer;
@@ -61,9 +62,6 @@
6162
public class ZkClient extends org.apache.helix.zookeeper.zkclient.ZkClient implements HelixZkClient {
6263
private static Logger LOG = LoggerFactory.getLogger(ZkClient.class);
6364

64-
// Setting default operation retry timeout to 24 hours. It can also be overwritten via RealmAwareZkClientConfig.
65-
// This timeout will be used while retrying zookeeper operations.
66-
public static final int DEFAULT_OPERATION_TIMEOUT = 24 * 60 * 60 * 1000;
6765
public static final int DEFAULT_CONNECTION_TIMEOUT = 60 * 1000;
6866
public static final int DEFAULT_SESSION_TIMEOUT = 30 * 1000;
6967

@@ -117,7 +115,7 @@ public ZkClient(IZkConnection connection, int connectionTimeout,
117115
PathBasedZkSerializer zkSerializer,
118116
String monitorType, String monitorKey) {
119117
this(connection, connectionTimeout, zkSerializer, monitorType, monitorKey,
120-
DEFAULT_OPERATION_TIMEOUT);
118+
ZNRecordUtil.getDefaultOperationRetryTimeout());
121119
}
122120

123121
public ZkClient(String zkServers, String monitorType, String monitorKey) {
@@ -191,7 +189,7 @@ public static class Builder {
191189

192190
PathBasedZkSerializer _zkSerializer;
193191

194-
long _operationRetryTimeout = DEFAULT_OPERATION_TIMEOUT;
192+
long _operationRetryTimeout = ZNRecordUtil.getDefaultOperationRetryTimeout();
195193
int _connectionTimeout = DEFAULT_CONNECTION_TIMEOUT;
196194
int _sessionTimeout = DEFAULT_SESSION_TIMEOUT;
197195

zookeeper-api/src/main/java/org/apache/helix/zookeeper/impl/factory/ZkConnectionManager.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@
2222
import java.util.HashSet;
2323
import java.util.Set;
2424

25-
import org.apache.helix.zookeeper.api.client.HelixZkClient;
2625
import org.apache.helix.zookeeper.impl.client.SharedZkClient;
2726
import org.apache.helix.zookeeper.impl.client.ZkClient;
2827
import org.apache.helix.zookeeper.exception.ZkClientException;
28+
import org.apache.helix.zookeeper.util.ZNRecordUtil;
2929
import org.apache.helix.zookeeper.zkclient.IZkConnection;
3030
import org.apache.helix.zookeeper.zkclient.serialize.BasicZkSerializer;
3131
import org.apache.helix.zookeeper.zkclient.serialize.SerializableSerializer;
@@ -63,7 +63,7 @@ public class ZkConnectionManager extends ZkClient {
6363
*/
6464
protected ZkConnectionManager(IZkConnection zkConnection, long connectionTimeout,
6565
String monitorKey) {
66-
super(zkConnection, (int) connectionTimeout, HelixZkClient.DEFAULT_OPERATION_TIMEOUT,
66+
super(zkConnection, (int) connectionTimeout, ZNRecordUtil.getDefaultOperationRetryTimeout(),
6767
new BasicZkSerializer(new SerializableSerializer()), MONITOR_TYPE, monitorKey, null, true);
6868
_monitorKey = monitorKey;
6969
LOG.info("ZkConnection {} was created for sharing.", _monitorKey);

zookeeper-api/src/main/java/org/apache/helix/zookeeper/util/ZNRecordUtil.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,24 @@ public static int getSerializerWriteSizeLimit() {
8080

8181
return writeSizeLimit;
8282
}
83+
84+
/**
85+
* Gets the default operation retry timeout from system property if configured,
86+
* otherwise returns the hardcoded default value.
87+
* It can also be overwritten via respective ZKClient Config
88+
* @return the operation retry timeout in milliseconds
89+
*/
90+
public static long getDefaultOperationRetryTimeout() {
91+
int defaultOperationTimeout = 24 * 60 * 60 * 1000;
92+
String timeoutStr = System.getProperty(ZkSystemPropertyKeys.ZK_OPERATION_RETRY_TIMEOUT_MS);
93+
if (timeoutStr != null && !timeoutStr.trim().isEmpty()) {
94+
try {
95+
long timeout = Long.parseLong(timeoutStr.trim());
96+
return timeout <= 0 ? defaultOperationTimeout : timeout;
97+
} catch (NumberFormatException e) {
98+
// Invalid format, use default value
99+
}
100+
}
101+
return defaultOperationTimeout;
102+
}
83103
}

zookeeper-api/src/main/java/org/apache/helix/zookeeper/zkclient/ZkClient.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ public class ZkClient implements Watcher {
119119
Integer.getInteger(ZkSystemPropertyKeys.JUTE_MAXBUFFER, ZNRecord.SIZE_LIMIT);
120120

121121
private final IZkConnection _connection;
122+
123+
// The operation retry timeout can be configured via:
124+
// 1. Constructor parameter operationRetryTimeout
125+
// 2. System property "zk.operation.retry.timeout.ms" (used as default if not explicitly set)
126+
// 3. Respective ZKClientConfig (eg: RealmAwareZkClientConfig.setOperationRetryTimeout()) for higher-level clients
122127
private final long _operationRetryTimeoutInMillis;
123128
private final Map<String, Set<IZkChildListener>> _childListener = new ConcurrentHashMap<>();
124129
private final ConcurrentHashMap<String, Set<IZkDataListenerEntry>> _dataListener =
@@ -268,6 +273,10 @@ protected ZkClient(IZkConnection zkConnection, int connectionTimeout, long opera
268273
_persistListenerMutex = new ReentrantLock();
269274
}
270275

276+
public long getOperationRetryTimeout() {
277+
return _operationRetryTimeoutInMillis;
278+
}
279+
271280
protected ZkClient(IZkConnection zkConnection, int connectionTimeout, long operationRetryTimeout,
272281
PathBasedZkSerializer zkSerializer, String monitorType, String monitorKey,
273282
String monitorInstanceName, boolean monitorRootPathOnly) {
@@ -2205,6 +2214,7 @@ public <T> T retryUntilConnected(final Callable<T> callable)
22052214
private void waitForRetry(long maxSleep) {
22062215
if (waitUntilConnected(_operationRetryTimeoutInMillis, TimeUnit.MILLISECONDS)) {
22072216
try {
2217+
LOG.debug("zkclient {} Wait for {} ms before retrying operation", _uid, maxSleep);
22082218
Thread.sleep(maxSleep);
22092219
} catch (InterruptedException ex) {
22102220
// we don't need to re-throw.

0 commit comments

Comments
 (0)