@@ -184,14 +184,60 @@ function check_conflict_boot_in_fw_update()
184184 fi
185185}
186186
# Retrieve the IP address recorded for a DPU in CONFIG_DB.
#
# Reads field "ips" of key "DHCP_SERVER_IPV4_PORT|bridge-midplane|<DPU_NAME>".
#
# Globals:   dpu_ip (written), EXIT_ERROR (read)
# Arguments: $1 - DPU name used as the last component of the CONFIG_DB key
# Outputs:   the retrieved value on stdout (so callers can use
#            var=$(get_dpu_ip NAME)); diagnostics on stderr
# Exits:     with ${EXIT_ERROR} if the lookup fails or returns nothing.
#            Inside $(...) only the subshell exits, so a caller that captures
#            the output must check the substitution's status itself.
function get_dpu_ip()
{
    local DPU_NAME=$1

    if ! dpu_ip=$(sonic-db-cli CONFIG_DB HGET "DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}" "ips") \
        || [ -z "$dpu_ip" ]; then
        # stderr: on stdout this text would be swallowed by a capturing caller
        echo "Error: Failed to retrieve DPU IP address for ${DPU_NAME}" >&2
        exit "${EXIT_ERROR:-1}"
    fi

    # debug is defined elsewhere in the script; anything it writes to stdout is
    # redirected so it cannot contaminate the captured value.
    debug "$DPU_NAME ip: $dpu_ip" >&2

    printf '%s\n' "$dpu_ip"
}
198+
# Retrieve the GNMI port from CONFIG_DB (field "port" of key "GNMI|gnmi").
#
# Globals:   port (written), EXIT_ERROR (read), DPU_NAME (read, only for the
#            debug message; it is not a parameter of this function)
# Arguments: none
# Outputs:   the retrieved port on stdout (so callers can use
#            var=$(get_gnmi_port)); diagnostics on stderr
# Exits:     with ${EXIT_ERROR} if the lookup fails or returns nothing.
#            Inside $(...) only the subshell exits, so a caller that captures
#            the output must check the substitution's status itself.
function get_gnmi_port() {
    if ! port=$(sonic-db-cli CONFIG_DB HGET "GNMI|gnmi" "port") \
        || [ -z "$port" ]; then
        # stderr: on stdout this text would be swallowed by a capturing caller
        echo "Error: Failed to retrieve GNMI port" >&2
        exit "${EXIT_ERROR:-1}"
    fi

    # debug is defined elsewhere in the script; anything it writes to stdout is
    # redirected so it cannot contaminate the captured value.
    debug "${DPU_NAME:-} GNMI port:$port" >&2

    printf '%s\n' "$port"
}
208+
# Query a DPU for its reboot status via the gNOI RebootStatus RPC.
#
# Globals:   reboot_status (written), EXIT_ERROR (read), DPU_NAME (read, only
#            for the error message)
# Arguments: $1 - DPU IP address
#            $2 - port to connect to on the DPU
# Outputs:   the raw gnoi_client response on stdout, which the polling loop in
#            reboot_dpu_module captures and parses; diagnostics on stderr
# Exits:     with ${EXIT_ERROR} if gnoi_client fails or returns nothing.
#            Inside $(...) only the subshell exits.
function get_reboot_status()
{
    local dpu_ip=$1
    local port=$2

    if ! reboot_status=$(gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc RebootStatus) \
        || [ -z "$reboot_status" ]; then
        # stderr: on stdout this text would be parsed as if it were a response
        echo "Error: Failed to send reboot status command to DPU ${DPU_NAME}" >&2
        exit "${EXIT_ERROR:-1}"
    fi

    # The response must reach stdout: the caller reads it via $(...) and logs
    # it with debug itself, so it is not logged again here.
    printf '%s\n' "$reboot_status"
}
221+
# Look up the bus_info entry of a DPU in the platform JSON file.
#
# Selects, from the .DPUS array, the object that has a key equal to the DPU
# name and extracts its .bus_info member.
#
# Globals:   DPU_BUS_INFO (written), PLATFORM_JSON_PATH (read),
#            EXIT_ERROR (read)
# Arguments: $1 - DPU name
# Outputs:   the bus_info value on stdout (so callers can use
#            var=$(get_dpu_bus_info NAME)); diagnostics on stderr
# Exits:     with ${EXIT_ERROR} if jq fails, the DPU is not listed, or it has
#            no bus_info. Inside $(...) only the subshell exits.
function get_dpu_bus_info() {
    local DPU_NAME=$1

    # "// empty" makes a missing/null bus_info produce no output; without it
    # jq -r prints the literal string "null", which would pass the -z check.
    if ! DPU_BUS_INFO=$(jq -r --arg DPU_NAME "${DPU_NAME}" \
            '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info // empty' \
            "$PLATFORM_JSON_PATH") \
        || [ -z "$DPU_BUS_INFO" ]; then
        # stderr: on stdout this text would be swallowed by a capturing caller
        echo "Error: bus_info not found for DPU ${DPU_NAME}" >&2
        exit "${EXIT_ERROR:-1}"
    fi

    # debug is defined elsewhere in the script; anything it writes to stdout is
    # redirected so it cannot contaminate the captured value.
    debug "$DPU_NAME : $DPU_BUS_INFO" >&2

    printf '%s\n' "$DPU_BUS_INFO"
}
232+
# Reboot a DPU module through the reboot_helper Python module.
#
# Globals:   reboot_status (written), EXIT_ERROR (read)
# Arguments: $1 - DPU name handed to reboot_helper.reboot_module()
# Outputs:   an error message on stderr when the reboot is considered failed
# Exits:     with ${EXIT_ERROR} when the helper produced no output or the
#            output is exactly "false".
function reboot_platform_module() {
    local DPU_NAME=$1

    # The name is passed as an argument rather than pasted into the Python
    # source, so quotes or other special characters in it cannot alter the code.
    # NOTE(review): reboot_status is whatever the helper writes to stdout;
    # "python3 -c" does not print the call's return value — confirm that
    # reboot_helper prints its result.
    reboot_status=$(python3 -c 'import sys, reboot_helper; reboot_helper.reboot_module(sys.argv[1])' "${DPU_NAME}")
    if [ -z "$reboot_status" ] || [ "$reboot_status" = "false" ]; then
        echo "Error: Failed to reboot the platform" >&2
        exit "${EXIT_ERROR:-1}"
    fi
}
196242
197243function reboot_dpu_module()
@@ -201,20 +247,9 @@ function reboot_dpu_module()
201247
202248 debug " User requested rebooting device ${DPU_NAME} ..."
203249
204- # Retrieve DPU IP from CONFIG_DB
205- dpu_ip=$( sonic-db-cli CONFIG_DB HGET " DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME} " " ips" )
206- if [ $? -ne 0 ] || [ -z " $dpu_ip " ]; then
207- echo " Error: Failed to retrieve DPU IP address for ${DPU_NAME} "
208- exit ${EXIT_ERROR}
209- fi
210-
211- # Retrieve GNMI port from CONFIG_DB
212- port=$( sonic-db-cli CONFIG_DB HGET " GNMI|gnmi" " port" )
213- debug " GNMI port ${port} "
214- if [ $? -ne 0 ] || [ -z " $port " ]; then
215- echo " Error: Failed to retrieve GNMI port"
216- exit ${EXIT_ERROR}
217- fi
250+ # Retrieve DPU IP and GNMI port
251+ dpu_ip=$( get_dpu_ip " ${DPU_NAME} " )
252+ port=$( get_gnmi_port)
218253
219254 # Issue GNOI client command to reboot the DPU
220255 gnoi_client -target ${dpu_ip} :${port} -logtostderr -insecure -rpc Reboot -jsonin ' {"method":3}'
@@ -229,60 +264,30 @@ function reboot_dpu_module()
229264 # Poll on reboot status response with a timeout mechanism
230265 poll_interval=5
231266 waited_time=0
232-
233267 while true ; do
234- reboot_status=$( get_reboot_status)
268+ reboot_status=$( get_reboot_status " ${dpu_ip} " " ${port} " )
235269 debug " GNOI RebootStatus response ${reboot_status} "
236270 is_reboot_active=$( echo " $reboot_status " | grep " active" | awk ' {print $2}' )
237-
238271 if [ " $is_reboot_active " == " false" ]; then
239272 break
240273 fi
241274
242275 sleep " $poll_interval "
243276 waited_time=$(( waited_time + poll_interval))
244-
245- if [ $waited_time -ge $dpu_halt_services_timeout ]; then
277+ if [ $waited_time -ge $dpu_halt_services_timeout ]; then
246278 echo " Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting"
247279 exit ${EXIT_ERROR}
248280 fi
249281 done
250282
251- # Check if the given DPU_NAME exists in the JSON file
252- DPU_EXISTS=$( jq -r --arg DPU_NAME " ${DPU_NAME} " ' .DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME]' " $PLATFORM_JSON_PATH " 2> /dev/null)
253-
254- if [ -n " $DPU_EXISTS " ]; then
255- # Retrieve bus_info for the given DPU_NAME
256- DPU_BUS_INFO=$( jq -r --arg DPU_NAME " ${DPU_NAME} " ' .DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info' " $PLATFORM_JSON_PATH " )
257-
258- if [ -n " $DPU_BUS_INFO " ]; then
259- debug " DPU: ${DPU_NAME} , Bus Info: ${BUS_INFO} "
260- else
261- echo " Error: bus_info not found for DPU ${DPU_NAME} "
262- exit ${EXIT_ERROR}
263- fi
264- else
265- echo " Error: DPU ${DPU_NAME} not found in platform.json"
266- exit ${EXIT_ERROR}
267- fi
283+ # Check if DPU exists and retrieve bus info
284+ DPU_BUS_INFO=$( get_dpu_bus_info " ${DPU_NAME} " )
268285
269- # Update STATE_DB with DPU PCIe key
270- sonic-db-cli state_db set " PCIE_DETACH_INFO|${DPU_NAME} " ' {"dpu_id": "${DPU_INDEX}", "dpu_state": "detaching", "bus_info": ${DPU_BUS_INFO}}'
271-
272- # Detach the DPU module PCIe
286+ # Update STATE_DB and handle PCIe removal and rescan
287+ sonic-db-cli state_db set " PCIE_DETACH_INFO|${DPU_NAME} " ' {"dpu_id": "' ${DPU_INDEX} ' ", "dpu_state": "detaching", "bus_info": "' ${DPU_BUS_INFO} ' "}'
273288 echo 1 > /sys/bus/pci/devices/${DPU_BUS_INFO} /remove
274-
275- # Reboot the DPU via platform vendor API
276- reboot_status=$( python3 -c " import reboot_helper; reboot_helper.reboot_module('${DPU_NAME} ')" )
277- if [ -z " $reboot_status " ] || [ " $reboot_status " = " false" ]; then
278- echo " Error: Failed to reboot the platform"
279- exit ${EXIT_ERROR}
280- fi
281-
282- # Rescan the PCIe
289+ reboot_platform_module " ${DPU_NAME} "
283290 echo 1 > /sys/bus/pci/rescan
284-
285- # Update STATE_DB to delete DPU PCIe key
286291 sonic-db-cli state_db del " PCIE_DETACH_INFO|${DPU_NAME} "
287292}
288293
@@ -332,6 +337,17 @@ function linecard_reboot_notify_supervisor()
332337 fi
333338}
334339
# Reboot every DPU module in parallel and wait for all of them to finish.
#
# Starts reboot_dpu_module "dpu<i>" in the background for i = 0 .. NUM_DPU-1,
# then waits for each job individually so that a failure is noticed.
#
# Arguments: $1 - number of DPUs
# Outputs:   one "Rebooting DPU module dpu<i>" line per DPU on stdout;
#            one error line on stderr per DPU whose reboot job failed
# Returns:   0 if every job succeeded, 1 if at least one failed
function reboot_all_dpus() {
    local NUM_DPU=$1
    local i            # keep the loop index from clobbering a global "i"
    local -a pids=()
    local rc=0

    for (( i = 0; i < NUM_DPU; i++ )); do
        echo "Rebooting DPU module dpu$i"
        reboot_dpu_module "dpu$i" &
        pids+=("$!")
    done

    # A bare "wait" always reports success; waiting per PID exposes each
    # background job's exit status.
    for i in "${!pids[@]}"; do
        if ! wait "${pids[$i]}"; then
            echo "Error: Reboot of DPU module dpu$i failed" >&2
            rc=1
        fi
    done

    return "$rc"
}
350+
335351parse_options $@
336352
337353# Exit if not superuser
342358
343359debug " User requested rebooting device ..."
344360
345- # Check for smart switch by parsing platform.json file
346361if [ -f " $PLATFORM_JSON_PATH " ]; then
347362 NUM_DPU=$( jq -r ' .DPUS | length' " $PLATFORM_JSON_PATH " 2> /dev/null)
348-
349363 if [ " $NUM_DPU " -gt 0 ]; then
350364 SMART_SWITCH=" yes"
351365 fi
@@ -355,12 +369,7 @@ if [[ "$REBOOT_DPU" == "yes" && "$SMART_SWITCH" == "yes" ]]; then
355369 echo " User requested to reboot the device ${DPU_MODULE_NAME} "
356370 reboot_dpu_module " $DPU_MODULE_NAME "
357371elif [ " $SMART_SWITCH " == " yes" ]; then
358- # Loop to iterate over DPUs and invoke reboot_dpu_module in parallel
359- for (( i= 0 ; i< "$NUM_DPU "; i++ )) ; do
360- echo " Rebooting DPU module $i "
361- reboot_dpu_module " dpu$i " &
362- done
363- wait
372+ reboot_all_dpus " $NUM_DPU "
364373fi
365374
366375check_conflict_boot_in_fw_update
0 commit comments