Skip to content

Commit 1686dbe

Browse files
committed
Extend reboot script for rebooting SmartSwitch
1 parent 7cbcfda commit 1686dbe

File tree

5 files changed

+557
-2
lines changed

5 files changed

+557
-2
lines changed

scripts/reboot

Lines changed: 194 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,17 @@ EXIT_NEXT_IMAGE_NOT_EXISTS=4
3737
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
3838
EXIT_PLATFORM_FW_AU_FAILURE=22
3939
PLATFORM_FWUTIL_AU_REBOOT_HANDLE="platform_fw_au_reboot_handle"
40+
PLATFORM_JSON_FILE="platform.json"
41+
PLATFORM_JSON_PATH="${DEVPATH}/${PLATFORM}/${PLATFORM_JSON_FILE}"
4042
REBOOT_SCRIPT_NAME=$(basename $0)
4143
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
4244
TAG_LATEST=no
4345
REBOOT_FLAGS=""
46+
FORCE_REBOOT="no"
47+
SMART_SWITCH="no"
48+
DPU_MODULE_NAME=""
49+
REBOOT_DPU="no"
50+
PRE_SHUTDOWN="no"
4451

4552
function debug()
4653
{
@@ -128,6 +135,8 @@ function show_help_and_exit()
128135
echo " "
129136
echo " Available options:"
130137
echo " -h, -? : getting this help"
138+
echo " -d : DPU module name on a smart switch, option is invalid when on DPU"
139+
echo " -p : Pre-shutdown steps on DPU, invalid on NPU"
131140

132141
exit ${EXIT_SUCCESS}
133142
}
@@ -154,7 +163,7 @@ function reboot_pre_check()
154163
${DEVPATH}/${PLATFORM}/${PLATFORM_REBOOT_PRE_CHECK}
155164
[[ $? -ne 0 ]] && exit $?
156165
fi
157-
166+
158167
# Verify the next image by sonic-installer
159168
local message=$(sonic-installer verify-next-image 2>&1)
160169
if [ $? -ne 0 ]; then
@@ -176,9 +185,128 @@ function check_conflict_boot_in_fw_update()
176185
fi
177186
}
178187

188+
# Retrieve the IP address of a DPU from CONFIG_DB.
#
# Reads field "ips@" of key "DHCP_SERVER_IPV4_PORT|bridge-midplane|<name>".
#
# Arguments: $1 - DPU module name
# Globals:   dpu_ip (written), EXIT_ERROR (read)
# Outputs:   the value on stdout, so that callers using
#            dpu_ip=$(get_dpu_ip NAME) actually receive it.
#            Diagnostics go to stderr so they cannot be mistaken for the value.
# Exits:     with EXIT_ERROR if the lookup fails or returns an empty string.
#            Inside $(...) only the subshell exits; the caller has to check
#            the substitution's status or the captured value.
function get_dpu_ip()
{
    local DPU_NAME=$1

    dpu_ip=$(sonic-db-cli CONFIG_DB HGET "DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}" "ips@")
    if [ $? -ne 0 ] || [ -z "$dpu_ip" ]; then
        echo "Error: Failed to retrieve DPU IP address for ${DPU_NAME}" >&2
        exit ${EXIT_ERROR}
    fi

    # Whatever debug prints is kept off stdout, which carries only the result.
    debug "$DPU_NAME ip: $dpu_ip" >&2
    printf '%s\n' "$dpu_ip"
}
199+
200+
# Retrieve the GNMI port of a DPU from CONFIG_DB.
#
# Reads field "gnmi" of key "DPU_PORT|<name>".
#
# Arguments: $1 - DPU module name
# Globals:   port (written), EXIT_ERROR (read)
# Outputs:   the port on stdout, so that callers using
#            port=$(get_gnmi_port NAME) actually receive it.
#            Diagnostics go to stderr so they cannot be mistaken for the value.
# Exits:     with EXIT_ERROR if the lookup fails or returns an empty string.
#            Inside $(...) only the subshell exits; the caller has to check
#            the substitution's status or the captured value.
function get_gnmi_port() {
    local DPU_NAME=$1

    port=$(sonic-db-cli CONFIG_DB HGET "DPU_PORT|$DPU_NAME" "gnmi")
    if [ $? -ne 0 ] || [ -z "$port" ]; then
        echo "Error: Failed to retrieve GNMI port" >&2
        exit ${EXIT_ERROR}
    fi

    # Whatever debug prints is kept off stdout, which carries only the result.
    debug "$DPU_NAME GNMI port:$port" >&2
    printf '%s\n' "$port"
}
210+
211+
# Query the reboot status of a DPU through the gNOI client that runs in the
# "gnmi" container.
#
# Arguments: $1 - DPU IP address
#            $2 - DPU GNMI port
# Globals:   reboot_status (written), EXIT_ERROR (read),
#            DPU_NAME (read, only to label the error message; it is not a
#            parameter of this function, so the target address is used when
#            the caller has not set it)
# Outputs:   the raw RebootStatus response on stdout, so that callers using
#            reboot_status=$(get_reboot_status IP PORT) actually receive it.
#            Diagnostics go to stderr.
# Exits:     with EXIT_ERROR if the RPC fails or returns nothing. Inside
#            $(...) only the subshell exits.
function get_reboot_status()
{
    local dpu_ip=$1
    local port=$2

    reboot_status=$(docker exec -i gnmi gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc RebootStatus)
    if [ $? -ne 0 ] || [ -z "$reboot_status" ]; then
        echo "Error: Failed to send reboot status command to DPU ${DPU_NAME:-${dpu_ip}}" >&2
        exit ${EXIT_ERROR}
    fi

    # Whatever debug prints is kept off stdout, which carries only the result.
    debug "$reboot_status" >&2
    printf '%s\n' "$reboot_status"
}
223+
224+
# Retrieve the bus_info value of a DPU from the platform JSON file.
#
# Arguments: $1 - DPU module name
# Globals:   DPU_BUS_INFO (written), PLATFORM_JSON_PATH (read),
#            EXIT_ERROR (read)
# Outputs:   the bus_info value on stdout, so that callers using
#            DPU_BUS_INFO=$(get_dpu_bus_info NAME) actually receive it.
#            Diagnostics go to stderr.
# Exits:     with EXIT_ERROR if jq fails or no usable value is found. Inside
#            $(...) only the subshell exits.
function get_dpu_bus_info() {
    local DPU_NAME=$1

    DPU_BUS_INFO=$(jq -r --arg DPU_NAME "${DPU_NAME}" '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info' "$PLATFORM_JSON_PATH")
    # jq -r prints the literal string "null" when the entry exists but has no
    # bus_info field, so an emptiness check alone is not enough.
    if [ $? -ne 0 ] || [ -z "$DPU_BUS_INFO" ] || [ "$DPU_BUS_INFO" = "null" ]; then
        echo "Error: bus_info not found for DPU ${DPU_NAME}" >&2
        exit ${EXIT_ERROR}
    fi

    # Whatever debug prints is kept off stdout, which carries only the result.
    debug "$DPU_NAME : $DPU_BUS_INFO" >&2
    printf '%s\n' "$DPU_BUS_INFO"
}
234+
235+
# Reboot a platform module through the reboot_helper Python module.
#
# Arguments: $1 - DPU module name
# Globals:   EXIT_ERROR (read)
# Exits:     with EXIT_ERROR when the helper prints nothing or prints a
#            false value.
function reboot_platform_module() {
    local DPU_NAME=$1
    local reboot_status

    # The name is handed over as an argument (sys.argv[1]) instead of being
    # spliced into the Python source, so quotes or other special characters
    # in it cannot break or alter the executed code.
    reboot_status=$(python3 -c "import sys, reboot_helper; reboot_helper.reboot_module(sys.argv[1])" "${DPU_NAME}")

    # Python prints booleans capitalised ("False"), so both spellings are
    # treated as failure.
    # NOTE(review): this relies on reboot_helper.reboot_module printing its
    # result; that helper is not visible here - confirm.
    if [ -z "$reboot_status" ] || [ "$reboot_status" = "false" ] || [ "$reboot_status" = "False" ]; then
        echo "Error: Failed to reboot the platform" >&2
        exit ${EXIT_ERROR}
    fi
}
244+
245+
# Reboot a single DPU module.
#
# Steps:
#   1. Look up the DPU IP, GNMI port, PCI bus info and the
#      dpu_halt_services_timeout value. All of this is validated before
#      anything is sent to the DPU, so a configuration problem cannot leave
#      a DPU half-way through the sequence.
#   2. Send the gNOI Reboot RPC to the DPU.
#   3. Poll RebootStatus every 5 seconds until the "active" field reads
#      "false", giving up after dpu_halt_services_timeout seconds.
#   4. Record PCIE_DETACH_INFO in state_db, remove the PCI device, reboot the
#      platform module, rescan the PCI bus and delete the state_db entry.
#
# Arguments: $1 - DPU module name; its digits are used as the dpu_id
# Globals:   PLATFORM_JSON_PATH, EXIT_ERROR (read)
# Exits:     with EXIT_ERROR on any lookup failure, RPC failure or timeout.
function reboot_dpu_module()
{
    local DPU_NAME=$1
    local DPU_INDEX=${DPU_NAME//[!0-9]/}
    # Declared separately from the assignments below so that the exit status
    # of each command substitution is not masked by 'local'.
    local dpu_ip port DPU_BUS_INFO dpu_halt_services_timeout
    local reboot_status is_reboot_active
    local poll_interval=5
    local waited_time=0

    debug "User requested rebooting device ${DPU_NAME} ..."

    # The getters run in a subshell here, so an 'exit' inside them cannot stop
    # this function; their status has to be checked explicitly.
    dpu_ip=$(get_dpu_ip "${DPU_NAME}") || dpu_ip=""
    port=$(get_gnmi_port "${DPU_NAME}") || port=""
    if [ -z "$dpu_ip" ] || [ -z "$port" ]; then
        echo "Error: Failed to retrieve DPU IP or GNMI port for ${DPU_NAME}" >&2
        exit ${EXIT_ERROR}
    fi

    # Without a bus address the PCI removal below would target
    # /sys/bus/pci/devices//remove.
    DPU_BUS_INFO=$(get_dpu_bus_info "${DPU_NAME}") || DPU_BUS_INFO=""
    if [ -z "$DPU_BUS_INFO" ]; then
        echo "Error: bus_info not found for DPU ${DPU_NAME}" >&2
        exit ${EXIT_ERROR}
    fi

    # jq exits 0 and prints "null" when the key is missing. A non-numeric
    # value would make the timeout comparison fail on every pass and the poll
    # loop below would never end, so only plain integers are accepted.
    dpu_halt_services_timeout=$(jq -r '.dpu_halt_services_timeout' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if [ $? -ne 0 ] || ! [[ "$dpu_halt_services_timeout" =~ ^[0-9]+$ ]]; then
        echo "Error: Failed to retrieve dpu_halt_services_timeout from ${PLATFORM_JSON_PATH}" >&2
        exit ${EXIT_ERROR}
    fi

    # Issue GNOI client command to reboot the DPU
    docker exec -i gnmi gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc Reboot -jsonin '{"method":3}'
    if [ $? -ne 0 ]; then
        echo "Error: Failed to send reboot command to DPU ${DPU_NAME}" >&2
        exit ${EXIT_ERROR}
    fi

    # Poll on reboot status response with a timeout mechanism. A failed status
    # query simply yields no "active" field and is retried on the next pass.
    while true; do
        reboot_status=$(get_reboot_status "${dpu_ip}" "${port}")
        debug "GNOI RebootStatus response ${reboot_status}"
        is_reboot_active=$(printf '%s\n' "$reboot_status" | awk '/active/ {print $2}')
        if [ "$is_reboot_active" == "false" ]; then
            break
        fi

        sleep "$poll_interval"
        waited_time=$((waited_time + poll_interval))
        if [ "$waited_time" -ge "$dpu_halt_services_timeout" ]; then
            echo "Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting" >&2
            exit ${EXIT_ERROR}
        fi
    done

    # Update STATE_DB and handle PCIe removal and rescan
    sonic-db-cli state_db set "PCIE_DETACH_INFO|${DPU_NAME}" "{\"dpu_id\": \"${DPU_INDEX}\", \"dpu_state\": \"detaching\", \"bus_info\": \"${DPU_BUS_INFO}\"}"

    echo 1 > "/sys/bus/pci/devices/${DPU_BUS_INFO}/remove"
    reboot_platform_module "${DPU_NAME}"
    echo 1 > /sys/bus/pci/rescan

    sonic-db-cli state_db del "PCIE_DETACH_INFO|${DPU_NAME}"
}
306+
179307
function parse_options()
180308
{
181-
while getopts "h?vf" opt; do
309+
while getopts "h?vfpd" opt; do
182310
case ${opt} in
183311
h|\? )
184312
show_help_and_exit
@@ -192,6 +320,13 @@ function parse_options()
192320
f )
193321
REBOOT_FLAGS+=" -f"
194322
;;
323+
d )
324+
REBOOT_DPU="yes"
325+
DPU_MODULE_NAME="$OPTARG"
326+
;;
327+
p )
328+
PRE_SHUTDOWN="yes"
329+
;;
195330
esac
196331
done
197332
}
@@ -215,6 +350,56 @@ function linecard_reboot_notify_supervisor()
215350
fi
216351
}
217352

353+
# Reboot all DPUs in parallel.
#
# Starts reboot_dpu_module for dpu0 .. dpu<N-1> as background jobs and waits
# for each of them. Every job runs in its own subshell, so an 'exit' inside
# reboot_dpu_module only ends that job; the job's status is collected here so
# that a failed DPU reboot is reported instead of being silently dropped.
#
# Arguments: $1 - number of DPUs
# Globals:   EXIT_ERROR (read)
# Returns:   0 when every DPU reboot succeeded, non-zero otherwise.
function reboot_all_dpus() {
    local NUM_DPU=$1
    local i pid
    local failed=0
    local -a pids=()

    if ! [[ "$NUM_DPU" =~ ^[0-9]+$ ]]; then
        echo "Error: Invalid number of DPUs '${NUM_DPU}'" >&2
        return "${EXIT_ERROR:-1}"
    fi

    for (( i = 0; i < NUM_DPU; i++ )); do
        echo "Rebooting DPU module dpu$i"
        reboot_dpu_module "dpu$i" &
        pids+=("$!")
    done

    # 'wait' without arguments always returns 0; waiting on each PID yields
    # that job's exit status.
    for pid in "${pids[@]}"; do
        wait "$pid" || failed=$((failed + 1))
    done

    if [ "$failed" -gt 0 ]; then
        echo "Error: ${failed} of ${NUM_DPU} DPU reboots failed" >&2
        return "${EXIT_ERROR:-1}"
    fi
    return 0
}
363+
364+
# Handle the smart-switch specific reboot scenarios.
#
# - The platform counts as a smart switch when PLATFORM_JSON_PATH exists and
#   its "DPUS" entry has a length greater than zero.
# - With '-d <name>' only the named DPU is rebooted and the script then
#   exits; without this exit the function would go on to reboot every DPU
#   (the named one a second time) and the caller would reboot the switch.
# - '-p' is required on a DPU and rejected elsewhere.
# - On a smart switch that is not itself a DPU, all DPUs are rebooted in
#   parallel before the caller continues.
#
# Globals:   PLATFORM_JSON_PATH, REBOOT_DPU, DPU_MODULE_NAME, PRE_SHUTDOWN,
#            EXIT_SUCCESS, EXIT_ERROR (read);
#            NUM_DPU, SMART_SWITCH, is_dpu (written)
# Exits:     with EXIT_ERROR on an invalid option combination, and with
#            EXIT_SUCCESS after a single-DPU reboot requested with '-d'.
function handle_smart_switch() {
    if [ -f "$PLATFORM_JSON_PATH" ]; then
        NUM_DPU=$(jq -r '.DPUS | length' "$PLATFORM_JSON_PATH" 2>/dev/null)
        # jq may print nothing (parse error) or a non-number; only a plain
        # integer is compared so that the test itself cannot fail.
        if [[ "$NUM_DPU" =~ ^[0-9]+$ ]] && [ "$NUM_DPU" -gt 0 ]; then
            SMART_SWITCH="yes"
        fi
    fi

    if [[ "$REBOOT_DPU" == "yes" ]]; then
        if [[ "$SMART_SWITCH" != "yes" ]]; then
            echo "Invalid '-d' option specified for a non-smart switch" >&2
            exit ${EXIT_ERROR}
        fi
        if [[ -z "$DPU_MODULE_NAME" ]]; then
            echo "Error: '-d' option requires a DPU module name" >&2
            exit ${EXIT_ERROR}
        fi
        echo "User requested to reboot the device ${DPU_MODULE_NAME}"
        # reboot_dpu_module exits the script itself on failure, so reaching
        # the next line means the DPU reboot sequence completed.
        reboot_dpu_module "$DPU_MODULE_NAME"
        exit ${EXIT_SUCCESS}
    fi

    # NOTE(review): this relies on reboot_helper.is_dpu printing "True" on a
    # DPU; that helper is not visible here - confirm.
    is_dpu=$(python3 -c "import reboot_helper; reboot_helper.is_dpu()")
    debug "Is the platform DPU: $is_dpu"

    # Check if system is a DPU and handle -p option accordingly
    if [[ "$is_dpu" == "True" && "$PRE_SHUTDOWN" != "yes" ]]; then
        echo "Invalid, '-p' option not specified for a DPU" >&2
        exit ${EXIT_ERROR}
    elif [[ "$is_dpu" != "True" && "$PRE_SHUTDOWN" == "yes" ]]; then
        echo "Invalid '-p' option specified for a non-DPU" >&2
        exit ${EXIT_ERROR}
    fi

    # If not a DPU, reboot all DPUs in parallel. A failure is reported but
    # does not stop the reboot of the switch itself.
    if [[ "$SMART_SWITCH" == "yes" && "$is_dpu" != "True" ]]; then
        reboot_all_dpus "$NUM_DPU" \
            || echo "Warning: not all DPUs were rebooted successfully" >&2
    fi
    return 0
}
402+
218403
parse_options $@
219404

220405
# Exit if not superuser
@@ -225,6 +410,8 @@ fi
225410

226411
debug "User requested rebooting device ..."
227412

413+
handle_smart_switch
414+
228415
check_conflict_boot_in_fw_update
229416

230417
setup_reboot_variables
@@ -287,6 +474,11 @@ if [ -x ${WATCHDOG_UTIL} ]; then
287474
${WATCHDOG_UTIL} arm
288475
fi
289476

477+
if [[ "${PRE_SHUTDOWN}" == "yes" ]]; then
478+
echo "${DPU_MODULE_NAME} pre-shutdown steps are completed"
479+
exit ${EXIT_SUCCESS}
480+
fi
481+
290482
if [ -x ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} ]; then
291483
VERBOSE=yes debug "Rebooting with platform ${PLATFORM} specific tool ..."
292484
${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} $@

0 commit comments

Comments
 (0)