Skip to content

Commit 0712d25

Browse files
committed
[#24852]yugabyted: Using clockbound time sync service in yugabyted deployments
Summary: Adding clockbound configuration as part of yugabyted deployments: * If the node is started with the `enhance_time_sync_via_clockbound` flag, then yugabyted will do a hard check for clockbound, i.e., yugabyted will fail to start if clockbound is not installed and configured. It will give an appropriate error message. * If the node is not started with the `enhance_time_sync_via_clockbound` flag, then yugabyted will do a soft check for clockbound, i.e., yugabyted will give a warning with an appropriate message. * If the checks pass, yugabyted will start the node with time_source as clockbound. If the time_source flag is already set to some other value by the user, yugabyted will fail if the `enhance_time_sync_via_clockbound` flag was used; otherwise, it will log it and continue to start the node without setting time_source. Jira: DB-13966 Test Plan: ./yb_build.sh --java-test 'org.yb.yugabyted.*' Reviewers: nikhil Reviewed By: nikhil Subscribers: yugabyted-dev, sgarg-yb Differential Revision: https://phorge.dev.yugabyte.com/D41289
1 parent acbead1 commit 0712d25

File tree

3 files changed

+94
-90
lines changed

3 files changed

+94
-90
lines changed

bin/configure_clockbound.sh

100644100755
File mode changed.

bin/yugabyted

+91-87
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,11 @@ PREREQS_ERROR_MSGS = {
125125
' please free the port and restart the node.',
126126
'ycql_metric_port': 'YCQL metrics port {} is already in use. For accessing the YCQL metrics,' \
127127
' please free the port and restart the node.',
128+
'clockbound_fail': 'Failed to validate system configuration for clockbound. Please run ' \
129+
'bin/configure_clockbound.sh script to install and configure clockbound.',
128130
'clockbound': 'Clockbound is recommended on AWS/Azure/GCP clusters.' \
129-
' It can reduce read restart errors significantly in concurrent workloads.' \
130-
' Relevant flag: --enhance_time_sync_via_clockbound.',
131+
' It can reduce read restart errors significantly in concurrent workloads. Please run ' \
132+
'bin/configure_clockbound.sh script to install and configure clockbound.',
131133
}
132134
QUICK_START_LINKS = {
133135
'mac' : 'https://docs.yugabyte.com/preview/quick-start/',
@@ -683,7 +685,7 @@ def using_time_sync_service():
683685
'aws.com', 'google.com']
684686

685687
cmd = ['chronyc', 'sources']
686-
out, err, ret_code = run_process(cmd, timeout=1, log_cmd=True)
688+
out, _, ret_code = run_process(cmd, timeout=1, log_cmd=True)
687689
if ret_code == 0:
688690
for source in allow_list:
689691
if source in out:
@@ -693,7 +695,7 @@ def using_time_sync_service():
693695

694696
def is_phc_configured():
695697
cmd = ['systemctl', 'status', 'clockbound']
696-
out, err, retcode = run_process(cmd, timeout=1, log_cmd=True)
698+
out, _, retcode = run_process(cmd, timeout=1, log_cmd=True)
697699
return retcode == 0 and 'PHC' in out
698700

699701
# Check if ip is ipv6
@@ -741,9 +743,6 @@ class ControlScript(object):
741743
atexit.register(self.kill_children)
742744
Output.script_exit_func = self.kill_children
743745

744-
if self.configs.temp_data.get("enhance_time_sync_via_clockbound"):
745-
self.assert_system_configured_for_clockbound()
746-
747746
if self.configs.saved_data.get("read_replica"):
748747
self.start_rr_process()
749748
else:
@@ -2819,11 +2818,16 @@ class ControlScript(object):
28192818
prereqs_warn_flag = True
28202819

28212820
# TODO: Uncomment this block when clockbound becomes GA.
2822-
# # Configuring clockbound is strongly recommended for AWS clusters.
2823-
# if using_time_sync_service() and not self.configs.temp_data[
2824-
# "enhance_time_sync_via_clockbound"]:
2825-
# prereqs_warn.add('clockbound')
2826-
# prereqs_warn_flag = True
2821+
# Configuring clockbound is strongly recommended for AWS clusters.
2822+
if not self.assert_system_configured_for_clockbound():
2823+
if self.configs.temp_data["enhance_time_sync_via_clockbound"]:
2824+
prereqs_failed.add('clockbound_fail')
2825+
prereqs_failed_flag = True
2826+
elif using_time_sync_service():
2827+
prereqs_warn.add('clockbound')
2828+
prereqs_warn_flag = True
2829+
else:
2830+
self.configs.temp_data["is_clockbound_configured"] = True
28272831

28282832
(failed_ports, warning_ports, mandatory_port_available,
28292833
recommended_port_available) = self.check_ports()
@@ -2904,13 +2908,13 @@ class ControlScript(object):
29042908
# Get pre-req failures and warnings
29052909
prereqs_failed_flag, prereqs_failed, prereqs_warn_flag, prereqs_warn, \
29062910
mandatory_port_available, recommended_port_available = check
2907-
if prereqs_warn_flag:
2908-
if OS_NAME == "Linux":
2909-
help_links.append("- Quick start for Linux: " +
2910-
Output.make_underline(QUICK_START_LINKS['linux']))
2911-
else:
2912-
help_links.append("- Quick start for macOS: " +
2913-
Output.make_underline(QUICK_START_LINKS['mac']))
2911+
# if prereqs_warn_flag:
2912+
if OS_NAME == "Linux":
2913+
help_links.append("- Quick start for Linux: " +
2914+
Output.make_underline(QUICK_START_LINKS['linux']))
2915+
else:
2916+
help_links.append("- Quick start for macOS: " +
2917+
Output.make_underline(QUICK_START_LINKS['mac']))
29142918

29152919
if not mandatory_port_available or not recommended_port_available:
29162920
help_links.append("- Default ports: " + Output.make_underline(DEFAULT_PORTS_LINK))
@@ -3100,6 +3104,38 @@ class ControlScript(object):
31003104
master_rpc_port, master_addresses)
31013105
was_already_setup = self.configs.saved_data.get("cluster_member", False)
31023106

3107+
warnings = []
3108+
warnings_for_ui = []
3109+
warning_help_msg=""
3110+
is_first_run = True
3111+
3112+
# Do the pre-req check before forming master and tserver commands
3113+
if is_first_run:
3114+
ulimits_failed = self.script.set_rlimits(print_info=True)
3115+
if ulimits_failed:
3116+
msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format(
3117+
", ".join(ulimits_failed))
3118+
ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \
3119+
"larger workloads. For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK)
3120+
self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg))
3121+
3122+
prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed)
3123+
3124+
if prereqs_check_result['status']==Output.ANIMATION_SUCCESS:
3125+
Output.print_out(prereqs_check_result['msg'])
3126+
elif prereqs_check_result['status']==Output.ANIMATION_WARNING:
3127+
3128+
warnings.extend(list(prereqs_check_result['msg'].values())[:-1])
3129+
warning_help_msg = prereqs_check_result['msg']["help_msg"]
3130+
3131+
prereqs_check_result['msg'].pop("help_msg")
3132+
warnings_for_ui = []
3133+
for k in prereqs_check_result['msg'].keys():
3134+
warnings_for_ui.extend([k])
3135+
elif prereqs_check_result['status']==Output.ANIMATION_FAIL:
3136+
Output.print_and_log(prereqs_check_result['msg'])
3137+
sys.exit(1)
3138+
31033139
common_gflags = self.get_common_flags()
31043140

31053141
yb_master_cmd = self.get_master_cmd(common_gflags)
@@ -3125,7 +3161,6 @@ class ControlScript(object):
31253161
self.processes = {}
31263162
return
31273163

3128-
is_first_run = True
31293164
callhome_thread = None
31303165
masters_list_update_thread = None
31313166
#Start the different thread for extracting the YBC binaries
@@ -3160,37 +3195,6 @@ class ControlScript(object):
31603195

31613196
# Start or initialize yb-master and yb-tserver.
31623197
if is_first_run:
3163-
# Output.init_animation("Running system checks...")
3164-
warnings = []
3165-
warnings_for_ui = []
3166-
warning_help_msg=""
3167-
ulimits_failed = self.script.set_rlimits(print_info=True)
3168-
if ulimits_failed:
3169-
msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format(
3170-
", ".join(ulimits_failed))
3171-
ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \
3172-
"larger workloads. For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK)
3173-
self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg))
3174-
3175-
prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed)
3176-
# Output.update_animation(msg=prereqs_check_result['msg'],
3177-
# status=prereqs_check_result['status'])
3178-
if prereqs_check_result['status']==Output.ANIMATION_SUCCESS:
3179-
Output.print_out(prereqs_check_result['msg'])
3180-
elif prereqs_check_result['status']==Output.ANIMATION_WARNING:
3181-
3182-
warnings.extend(list(prereqs_check_result['msg'].values())[:-1])
3183-
warning_help_msg = prereqs_check_result['msg']["help_msg"]
3184-
3185-
prereqs_check_result['msg'].pop("help_msg")
3186-
warnings_for_ui = []
3187-
for k in prereqs_check_result['msg'].keys():
3188-
warnings_for_ui.extend([k])
3189-
elif prereqs_check_result['status']==Output.ANIMATION_FAIL:
3190-
Output.print_and_log(prereqs_check_result['msg'])
3191-
sys.exit(1)
3192-
3193-
31943198
Output.init_animation("Starting the YugabyteDB Processes...")
31953199

31963200
self.post_install_yb()
@@ -3414,7 +3418,28 @@ class ControlScript(object):
34143418
if join_ip:
34153419
master_addresses = "{}:{},{}".format(get_url_from_ip(join_ip),
34163420
master_rpc_port, master_addresses)
3417-
was_already_setup = self.configs.saved_data.get("cluster_member", False)
3421+
3422+
is_first_run = True
3423+
warnings = []
3424+
warning_help_msg=""
3425+
if is_first_run:
3426+
ulimits_failed = self.script.set_rlimits(print_info=True)
3427+
if ulimits_failed:
3428+
msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format(
3429+
", ".join(ulimits_failed))
3430+
ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \
3431+
"larger workloads. For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK)
3432+
self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg))
3433+
3434+
prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed)
3435+
if prereqs_check_result['status']==Output.ANIMATION_SUCCESS:
3436+
Output.print_out(prereqs_check_result['msg'])
3437+
elif prereqs_check_result['status']==Output.ANIMATION_WARNING:
3438+
warnings.extend(list(prereqs_check_result['msg'].values())[:-1])
3439+
warning_help_msg = prereqs_check_result['msg']["help_msg"]
3440+
elif prereqs_check_result['status']==Output.ANIMATION_FAIL:
3441+
Output.print_and_log(prereqs_check_result['msg'])
3442+
sys.exit(1)
34183443

34193444
common_gflags = self.get_common_flags()
34203445

@@ -3436,7 +3461,6 @@ class ControlScript(object):
34363461
self.processes = {}
34373462
return
34383463

3439-
is_first_run = True
34403464
callhome_thread = None
34413465
masters_list_update_thread = None
34423466
self.stop_callhome = False
@@ -3463,26 +3487,6 @@ class ControlScript(object):
34633487

34643488
# Start or initialize yb-master and yb-tserver.
34653489
if is_first_run:
3466-
warnings = []
3467-
warning_help_msg=""
3468-
ulimits_failed = self.script.set_rlimits(print_info=True)
3469-
if ulimits_failed:
3470-
msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format(
3471-
", ".join(ulimits_failed))
3472-
ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \
3473-
"larger workloads. For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK)
3474-
self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg))
3475-
3476-
prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed)
3477-
if prereqs_check_result['status']==Output.ANIMATION_SUCCESS:
3478-
Output.print_out(prereqs_check_result['msg'])
3479-
elif prereqs_check_result['status']==Output.ANIMATION_WARNING:
3480-
warnings.extend(list(prereqs_check_result['msg'].values())[:-1])
3481-
warning_help_msg = prereqs_check_result['msg']["help_msg"]
3482-
elif prereqs_check_result['status']==Output.ANIMATION_FAIL:
3483-
Output.print_and_log(prereqs_check_result['msg'])
3484-
sys.exit(1)
3485-
34863490
Output.init_animation("Starting the YugabyteDB Processes...")
34873491

34883492
self.post_install_yb()
@@ -3652,14 +3656,18 @@ class ControlScript(object):
36523656

36533657
def config_time_source_clockbound(self, flags):
36543658
# Configure tserver flag time_source=clockbound
3655-
# when --enhance_time_sync_via_clockbound is set.
3656-
if self.configs.temp_data["enhance_time_sync_via_clockbound"]:
3659+
# when clockbound is installed and configured.
3660+
if self.configs.temp_data["is_clockbound_configured"]:
36573661
# Check database configuration.
36583662
time_source = self.get_flag_value(flags, "time_source")
36593663
if time_source and time_source != "clockbound":
3660-
raise ValueError(
3661-
"Cannot configure time_source with"
3662-
" --enhance_time_sync_via_clockbound.")
3664+
if self.configs.temp_data["enhance_time_sync_via_clockbound"]:
3665+
raise ValueError("--time_source gflag is already set to {}.".format(
3666+
time_source) + "Cannot configure time_source with" +
3667+
" --enhance_time_sync_via_clockbound.")
3668+
else:
3669+
Output.log("--time_source gflag is already set to {}.".format(time_source) +
3670+
" Cannot configure time_source to clockbound.")
36633671

36643672
# Configure time_source=clockbound if not already.
36653673
if not time_source:
@@ -4088,18 +4096,13 @@ class ControlScript(object):
40884096
# Sets YW metrics to use local database.
40894097
os.environ["USE_NATIVE_METRICS"] = "true"
40904098

4099+
# Returns true if the system has been configured for clockbound.
4100+
# Runs `configure_clockbound.sh --validate` and returns true if it returns 0.
40914101
def assert_system_configured_for_clockbound(self):
4092-
Output.init_animation("Validating system config for clockbound...")
40934102
configure_clockbound_path = find_binary_location("configure_clockbound.sh")
40944103
cmd = ["bash", configure_clockbound_path, "--validate"]
4095-
out, err, retcode = run_process(cmd)
4096-
if retcode == 0:
4097-
Output.update_animation("System configured for clockbound.")
4098-
else:
4099-
Output.update_animation("Failed to validate system configuration for clockbound.",
4100-
status=Output.ANIMATION_FAIL)
4101-
Output.log_error_and_exit(
4102-
Output.make_red("ERROR") + ": Did you run configure_clockbound.sh script?")
4104+
_, _, retcode = run_process(cmd)
4105+
return retcode == 0
41034106

41044107
# Runs post_install script for linux computers.
41054108
def post_install_yb(self):
@@ -8580,6 +8583,7 @@ class Configs(object):
85808583
"xcluster_target_addresses": "",
85818584
"xcluster_bootstrap_done": "",
85828585
"enhance_time_sync_via_clockbound": False,
8586+
"is_clockbound_configured": False,
85838587
}
85848588
self.config_file = config_file
85858589

yugabyted-ui/apiserver/cmd/server/handlers/api_cluster_info.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ var WARNING_MSGS = map[string]string{
6666
"insecure" :"Cluster started in an insecure mode without " +
6767
"authentication and encryption enabled. For non-production use only, " +
6868
"not to be used without firewalls blocking the internet traffic.",
69-
"clockbound": "Clockbound is recommended on AWS clusters. It can reduce read restart errors" +
70-
" significantly in concurrent workloads." +
71-
" Relevant flag: --enhance_time_sync_via_clockbound.",
69+
"clockbound": "Clockbound is recommended on AWS/Azure/GCP clusters. " +
70+
"It can reduce read restart errors significantly in concurrent workloads. " +
71+
"Please run configure_clockbound.sh script to install and configure clockbound.",
7272
}
7373

7474
type SlowQueriesFuture struct {

0 commit comments

Comments
 (0)