Skip to content

Commit 460146c

Browse files
committed
Extend reboot script for rebooting SmartSwitch
1 parent 008a078 commit 460146c

File tree

2 files changed

+244
-1
lines changed

2 files changed

+244
-1
lines changed

scripts/reboot

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ PLATFORM_UPDATE_REBOOT_CAUSE="platform_update_reboot_cause"
1111
REBOOT_CAUSE_FILE="/host/reboot-cause/reboot-cause.txt"
1212
PLATFORM_REBOOT_PRE_CHECK="platform_reboot_pre_check"
1313
REBOOT_TIME=$(date)
14+
PLATFORM_JSON_FILE="platform.json"
15+
PLATFORM_JSON_PATH="${DEVPATH}/${PLATFORM}/${PLATFORM_JSON_FILE}"
1416

1517
# Reboot immediately if we run the kdump capture kernel
1618
VMCORE_FILE=/proc/vmcore
@@ -33,6 +35,7 @@ ASIC_TYPE=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type)
3335
SUBTYPE=$(sonic-cfggen -d -v DEVICE_METADATA.localhost.subtype)
3436
ASAN=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asan)
3537
VERBOSE=no
38+
EXIT_SUCCESS=0
3639
EXIT_NEXT_IMAGE_NOT_EXISTS=4
3740
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
3841
EXIT_PLATFORM_FW_AU_FAILURE=22
@@ -41,6 +44,11 @@ REBOOT_SCRIPT_NAME=$(basename $0)
4144
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
4245
TAG_LATEST=no
4346
REBOOT_FLAGS=""
47+
FORCE_REBOOT="no"
48+
SMART_SWITCH="no"
49+
DPU_MODULE_NAME=""
50+
REBOOT_DPU="no"
51+
PRE_SHUTDOWN="no"
4452

4553
function debug()
4654
{
@@ -154,7 +162,7 @@ function reboot_pre_check()
154162
${DEVPATH}/${PLATFORM}/${PLATFORM_REBOOT_PRE_CHECK}
155163
[[ $? -ne 0 ]] && exit $?
156164
fi
157-
165+
158166
# Verify the next image by sonic-installer
159167
local message=$(sonic-installer verify-next-image 2>&1)
160168
if [ $? -ne 0 ]; then
@@ -176,6 +184,104 @@ function check_conflict_boot_in_fw_update()
176184
fi
177185
}
178186

187+
#######################################
# Query the gNOI RebootStatus RPC of a DPU and print the raw response.
# Intended to be called through command substitution, so only the RPC
# response is written to stdout; diagnostics go to stderr.
# Globals:   dpu_ip, port (read) - gNOI target, set by the caller
#            DPU_NAME (read)     - used in the error message only
#            EXIT_ERROR (read)   - exit status on failure, defaults to 1
# Arguments: none
# Outputs:   RebootStatus response on stdout
# Returns:   exits the current (sub)shell with a non-zero status when the
#            RPC fails or returns an empty response
#######################################
function get_reboot_status()
{
    local reboot_status

    if ! reboot_status=$(gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc RebootStatus) \
        || [[ -z "$reboot_status" ]]; then
        # stderr, so the message is not mistaken for an RPC response by $(...)
        echo "Error: Failed to send reboot status command to DPU ${DPU_NAME}" >&2
        # Fall back to 1: a bare `exit` would return the status of `echo` (0)
        exit "${EXIT_ERROR:-1}"
    fi
    echo "$reboot_status"
}
196+
197+
#######################################
# Gracefully reboot a single DPU module of a SmartSwitch.
# Steps: ask the DPU to halt its services over gNOI, poll RebootStatus until
# the reboot is no longer active (or the timeout expires), record the PCIe
# detach in STATE_DB, remove the DPU from the PCI bus, reboot it through the
# platform vendor API and finally rescan the PCI bus.
# Globals:   PLATFORM_JSON_PATH (read) - platform.json with DPU definitions
#            EXIT_ERROR (read)         - exit status on failure, defaults to 1
# Arguments: $1 - DPU module name (e.g. dpu0)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; exits the current (sub)shell non-zero on failure
#######################################
function reboot_dpu_module()
{
    local DPU_NAME=$1
    local exit_error=${EXIT_ERROR:-1}
    # Used when platform.json has no usable dpu_halt_services_timeout.
    # NOTE(review): 60s is an assumed fallback - confirm with platform owners.
    local -r default_halt_timeout=60
    local poll_interval=5
    local waited_time=0
    # dpu_ip/port stay visible to get_reboot_status via bash dynamic scoping
    local dpu_ip port
    local dpu_halt_services_timeout reboot_status is_reboot_active
    local DPU_EXISTS DPU_BUS_INFO

    debug "User requested rebooting device ${DPU_NAME} ..."

    # Retrieve DPU IP from CONFIG_DB
    # NOTE(review): the field is read as "ips"; verify it is not stored as "ips@"
    if ! dpu_ip=$(sonic-db-cli CONFIG_DB HGET "DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}" "ips") \
        || [[ -z "$dpu_ip" ]]; then
        echo "Error: Failed to retrieve DPU IP address for ${DPU_NAME}" >&2
        exit "$exit_error"
    fi

    # Retrieve GNMI port from CONFIG_DB (check the status before logging so
    # that `debug` cannot overwrite $?)
    if ! port=$(sonic-db-cli CONFIG_DB HGET "GNMI|gnmi" "port") || [[ -z "$port" ]]; then
        echo "Error: Failed to retrieve GNMI port" >&2
        exit "$exit_error"
    fi
    debug "GNMI port ${port}"

    # Issue GNOI client command to reboot the DPU
    if ! gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc Reboot -jsonin '{"method":3}'; then
        echo "Error: Failed to send reboot command to DPU ${DPU_NAME}" >&2
        exit "$exit_error"
    fi

    # Retrieve dpu_halt_services_timeout value using jq; jq prints "null" for
    # a missing key and nothing at all when the file cannot be parsed
    dpu_halt_services_timeout=$(jq -r '.dpu_halt_services_timeout' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if ! [[ "$dpu_halt_services_timeout" =~ ^[0-9]+$ ]]; then
        debug "dpu_halt_services_timeout not set, using ${default_halt_timeout}s"
        dpu_halt_services_timeout=$default_halt_timeout
    fi

    # Poll on reboot status response with a timeout mechanism. A failed poll
    # is not fatal by itself: we keep polling until the timeout expires.
    while true; do
        reboot_status=$(get_reboot_status)
        debug "GNOI RebootStatus response ${reboot_status}"
        is_reboot_active=$(awk '/active/ {print $2}' <<< "$reboot_status")

        if [[ "$is_reboot_active" == "false" ]]; then
            break
        fi

        sleep "$poll_interval"
        waited_time=$((waited_time + poll_interval))

        if (( waited_time >= dpu_halt_services_timeout )); then
            echo "Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting" >&2
            exit "$exit_error"
        fi
    done

    # Check if the given DPU_NAME exists in the JSON file
    DPU_EXISTS=$(jq -r --arg DPU_NAME "${DPU_NAME}" \
        '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME]' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if [[ -z "$DPU_EXISTS" ]]; then
        echo "Error: DPU ${DPU_NAME} not found in platform.json" >&2
        exit "$exit_error"
    fi

    # Retrieve bus_info for the given DPU_NAME; `// empty` keeps a missing
    # key from showing up as the literal string "null"
    DPU_BUS_INFO=$(jq -r --arg DPU_NAME "${DPU_NAME}" \
        '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info // empty' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if [[ -z "$DPU_BUS_INFO" ]]; then
        echo "Error: bus_info not found for DPU ${DPU_NAME}" >&2
        exit "$exit_error"
    fi
    debug "DPU: ${DPU_NAME}, Bus Info: ${DPU_BUS_INFO}"

    # Update STATE_DB with DPU PCIe key. Fields are passed as separate
    # arguments so that the bus address is actually expanded.
    # NOTE(review): dpu_id is always "0", as in the previous payload - confirm.
    sonic-db-cli STATE_DB HSET "PCIE_DETACH_INFO|${DPU_NAME}" \
        "dpu_id" "0" "dpu_state" "detaching" "bus_info" "${DPU_BUS_INFO}"

    # Detach the DPU module PCIe
    echo 1 > "/sys/bus/pci/devices/${DPU_BUS_INFO}/remove"

    # Reboot the DPU via platform vendor API. reboot_module() returns a bool
    # and prints nothing on success, so the result is conveyed via exit code.
    if ! python3 -c 'import sys, reboot_helper; sys.exit(0 if reboot_helper.reboot_module(sys.argv[1]) else 1)' "${DPU_NAME}"; then
        echo "Error: Failed to reboot the platform" >&2
        exit "$exit_error"
    fi

    # Rescan the PCIe
    echo 1 > /sys/bus/pci/rescan
}
284+
179285
function parse_options()
180286
{
181287
while getopts "h?vf" opt; do
@@ -192,6 +298,13 @@ function parse_options()
192298
f )
193299
REBOOT_FLAGS+=" -f"
194300
;;
301+
d )
302+
REBOOT_DPU="yes"
303+
DPU_MODULE_NAME="$optarg"
304+
;;
305+
p )
306+
PRE_SHUTDOWN="yes"
307+
;;
195308
esac
196309
done
197310
}
@@ -225,6 +338,27 @@ fi
225338

226339
debug "User requested rebooting device ..."
227340

341+
# Check for smart switch by parsing platform.json file: the platform is
# treated as a SmartSwitch when its DPUS section has at least one entry.
if [ -f "$PLATFORM_JSON_PATH" ]; then
    NUM_DPU=$(jq -r '.DPUS | length' "$PLATFORM_JSON_PATH" 2>/dev/null)

    # jq prints nothing when the file cannot be parsed; fall back to 0 so the
    # numeric comparison below never sees an empty or non-numeric operand
    if ! [[ "$NUM_DPU" =~ ^[0-9]+$ ]]; then
        NUM_DPU=0
    fi

    if (( NUM_DPU > 0 )); then
        SMART_SWITCH="yes"
    fi
fi

# A DPU reboot request on a platform without DPUs must not silently turn
# into a reboot of the whole switch.
if [[ "$REBOOT_DPU" == "yes" && "$SMART_SWITCH" != "yes" ]]; then
    echo "Error: DPU reboot requested but this platform is not a SmartSwitch" >&2
    exit "${EXIT_ERROR:-1}"
fi

if [[ "$REBOOT_DPU" == "yes" ]]; then
    echo "User requested to reboot the device ${DPU_MODULE_NAME}"
    reboot_dpu_module "$DPU_MODULE_NAME"
elif [[ "$SMART_SWITCH" == "yes" ]]; then
    # Loop to iterate over DPUs and invoke reboot_dpu_module in parallel.
    # NOTE(review): each call runs in a background subshell, so an `exit`
    # inside reboot_dpu_module ends only that subshell and the bare `wait`
    # below does not report per-DPU failures.
    for (( i = 0; i < NUM_DPU; i++ )); do
        echo "Rebooting DPU module $i"
        reboot_dpu_module "dpu$i" &
    done
    wait
fi
361+
228362
check_conflict_boot_in_fw_update
229363

230364
setup_reboot_variables
@@ -287,6 +421,11 @@ if [ -x ${WATCHDOG_UTIL} ]; then
287421
${WATCHDOG_UTIL} arm
288422
fi
289423

424+
if [[ "${PRE_SHUTDOWN}" == "yes" ]]; then
425+
echo "${DPU_MODULE_NAME} pre-shutdown steps are completed"
426+
exit ${EXIT_SUCCESS}
427+
fi
428+
290429
if [ -x ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} ]; then
291430
VERBOSE=yes debug "Rebooting with platform ${PLATFORM} specific tool ..."
292431
${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} $@

scripts/reboot_helper.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#!/usr/bin/env python3
2+
#
3+
# reboot_helper.py
4+
#
5+
# Utility helper for reboot within SONiC
6+
7+
import sonic_platform
8+
import sys
9+
import syslog
10+
11+
chk_log_level = syslog.LOG_ERR
12+
13+
def _log_msg(lvl, pfx, msg):
    """Emit ``msg`` to stdout and syslog when ``lvl`` passes the level filter.

    Args:
        lvl: syslog priority of the message (e.g. syslog.LOG_ERR).
        pfx: Short prefix printed before the message on stdout.
        msg: Message text.

    Messages whose priority value is greater than ``chk_log_level`` (i.e. less
    severe) are dropped.
    """
    if lvl > chk_log_level:
        return
    print(f"{pfx}: {msg}")
    syslog.syslog(lvl, msg)
17+
18+
19+
def log_err(m):
    """Report message ``m`` at syslog.LOG_ERR priority with the "Err" prefix."""
    _log_msg(syslog.LOG_ERR, "Err", m)
21+
22+
23+
def log_info(m):
    """Report message ``m`` at syslog.LOG_INFO priority with the "Info" prefix."""
    _log_msg(syslog.LOG_INFO, "Info", m)
25+
26+
27+
def log_debug(m):
    """Report message ``m`` at syslog.LOG_DEBUG priority with the "Debug" prefix."""
    _log_msg(syslog.LOG_DEBUG, "Debug", m)
29+
30+
31+
# Global variable for platform chassis
32+
platform_chassis = None
33+
34+
def load_platform_chassis():
    """Populate the module-level ``platform_chassis`` from the platform API.

    Returns:
        True when a truthy chassis object was obtained; False when creating it
        raised or the platform returned a falsy chassis. Both failure cases are
        reported through log_err().
    """
    global platform_chassis

    # Load new platform API class
    try:
        chassis = sonic_platform.platform.Platform().get_chassis()
    except Exception as e:
        log_err("Failed to instantiate Chassis due to {}".format(repr(e)))
        return False
    platform_chassis = chassis

    if platform_chassis:
        return True

    log_err("Platform chassis is not loaded")
    return False
49+
50+
def reboot_module(module_name):
    """Reboot the specified module by invoking the platform API.

    Args:
        module_name: Name compared against each chassis module's get_name().

    Returns:
        True when the matching module's reboot() was invoked without raising.
        False when the chassis could not be loaded, no module matched, the
        module does not implement reboot, or any platform call raised.
    """
    # Load the platform chassis if not already loaded
    if not platform_chassis and not load_platform_chassis():
        log_err("Failed to load platform chassis")
        return False

    try:
        # Use get_all_modules to retrieve all modules on the chassis
        modules = platform_chassis.get_all_modules()

        # Iterate over the modules to find the one with the specified name
        for module in modules:
            if module and module.get_name() == module_name:
                log_info(f"Rebooting module {module_name}...")
                try:
                    # NOTE(review): reboot() is called without arguments and its
                    # return value is ignored - confirm against the platform
                    # module API whether a reboot type must be passed.
                    module.reboot()
                    log_info(f"Reboot command sent for module {module_name}")
                    return True
                except NotImplementedError:
                    # log_err is the error logger defined in this file
                    log_err(f"Reboot not implemented for module {module_name}.")
                    return False
                except Exception as e:
                    log_err(f"An error occurred while rebooting module {module_name}: {e}")
                    return False

        # If the module with the given name is not found
        log_err(f"Module {module_name} not found")
        return False

    except Exception as e:
        log_err(f"Error occurred while rebooting module {module_name}: {repr(e)}")
        return False
87+
88+
# Command-line entry point: reboot_helper.py <command> <module_name>
# Only the "reboot" command is recognised; any failure exits with status 1.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: reboot_helper.py <command> <module_name>")
        sys.exit(1)

    command, module_name = sys.argv[1], sys.argv[2]

    if command != "reboot":
        print(f"Unknown command: {command}")
        sys.exit(1)

    if not reboot_module(module_name):
        sys.exit(1)

    print(f"Reboot command sent for module {module_name}")

0 commit comments

Comments
 (0)