Skip to content

Commit d053e45

Browse files
committed
[#238989]yugabyted: Node doesn't join using --join flag
Summary: When adding a node with the `--join` flag, the node fails to join, reporting that the yb-admin command to add the master to the cluster failed. Added a retry wrapper that re-runs the command for up to 30 seconds or 10 attempts, whichever comes first. Test Plan: Manual Testing Reviewers: nikhil Reviewed By: nikhil Subscribers: yugabyted-dev, sgarg-yb Differential Revision: https://phorge.dev.yugabyte.com/D38001
1 parent 2e5ebef commit d053e45

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

bin/yugabyted

+25-2
Original file line numberDiff line numberDiff line change
@@ -8541,10 +8541,10 @@ class YBAdminProxy(object):
85418541
YBAdminProxy.cmd_args.append('--certs_dir_name={}'.format(certs_dir_name[0].group(1)))
85428542

85438543
@staticmethod
8544-
def add_master(master_addrs, new_master_ip, new_master_rpc_port, timeout=10):
8544+
def add_master(master_addrs, new_master_ip, new_master_rpc_port, timeout=30):
85458545
cmd = YBAdminProxy.cmd_args + ["--init_master_addrs", master_addrs,
85468546
"change_master_config", "ADD_SERVER", new_master_ip, str(new_master_rpc_port)]
8547-
out, err, ret_code = run_process(cmd, timeout=timeout, log_cmd=True)
8547+
out, err, ret_code = run_process_with_retries(cmd=cmd, timeout=timeout, log_cmd=True)
85488548
return (0 == ret_code)
85498549

85508550
@staticmethod
@@ -9625,6 +9625,29 @@ def run_process_checked(cmd, timeout=None, log_cmd=True, env_vars=None):
96259625
Output.log_error_and_exit("Error: {}".format(err))
96269626
return out
96279627

9628+
def run_process_with_retries(cmd, encrypted_cmd=None, timeout=None, log_cmd=False, env_vars=None,
                             shell=False, retries=10):
    """Run ``cmd`` through ``run_process``, retrying while it exits non-zero.

    Every attempt forwards ``cmd``, ``encrypted_cmd``, ``timeout``, ``log_cmd``,
    ``env_vars`` and ``shell`` unchanged to ``run_process``. Retrying stops as
    soon as one of the following holds:

    * an attempt finishes with a zero/falsy return code;
    * ``timeout`` is not None and more than ``timeout`` seconds have elapsed
      since the first attempt started (``timeout`` is therefore used both as
      the per-attempt value handed to ``run_process`` and as the overall
      retry budget);
    * ``retries`` attempts have been made.

    At least one attempt is always made, even when ``retries`` is zero or
    negative. Failed attempts are separated by a 0.2 second pause.

    Args:
        cmd: Command to run; passed through to ``run_process``.
        encrypted_cmd: Passed through to ``run_process``.
        timeout: Seconds, or None for no overall deadline (the attempt count
            is then the only bound).
        log_cmd: When truthy, log each attempt here; also passed through to
            ``run_process``.
        env_vars: Passed through to ``run_process``.
        shell: Passed through to ``run_process``.
        retries: Maximum number of attempts.

    Returns:
        The ``(out, err, retcode)`` tuple of the last attempt.
    """
    start_time = time.time()
    try_count = 0
    while True:
        try_count += 1
        if log_cmd:
            Output.log("Running {}. Total retries: {}, Timeout: {}, Try count: {}".format(cmd,
                retries, timeout, try_count))
        out, err, retcode = run_process(cmd=cmd, encrypted_cmd=encrypted_cmd, timeout=timeout,
                                        log_cmd=log_cmd, env_vars=env_vars, shell=shell)
        if not retcode:
            return (out, err, retcode)

        # timeout defaults to None; comparing a float with None raises TypeError
        # on Python 3, so the deadline check only applies when a timeout was given.
        deadline_passed = timeout is not None and time.time() - start_time > timeout
        # ">=" rather than "==" so that retries <= 0 still terminates after the
        # first attempt instead of never matching the attempt counter.
        if deadline_passed or try_count >= retries:
            return (out, err, retcode)

        # Brief pause before the next attempt.
        time.sleep(0.2)
9650+
96289651
def rmcontents(dirname, exclude_names=[]):
96299652
for f in os.listdir(dirname):
96309653
if f in exclude_names:

0 commit comments

Comments
 (0)