# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Weekly MLPerf SDXL v5.1 pipeline for the MI325x 8-GPU self-hosted runner:
# builds two harness images (a free-threaded "nogil" Python variant and a
# standard one), starts long-lived containers from them, runs the Server and
# Offline scenarios via `docker exec`, uploads accuracy/performance artifacts,
# and finally tears the containers and images down.
name: MLPerf SDXL v5.1 Weekly - MI325x

on:
  workflow_dispatch:
  pull_request:
  # Run once weekly on Sundays. NOTE(review): GitHub Actions evaluates cron
  # in UTC, so '0 0 * * 0' fires at midnight UTC (not midnight PST as the
  # original comment claimed).
  schedule:
    - cron: '0 0 * * 0'

jobs:
  build_container:
    name: "Build Container"
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Increase system vm map
        run: sudo sysctl -w vm.max_map_count=262144
      - name: Increase static TLS block limit
        # Fix: a plain `export` only affects the shell of that single step
        # and is discarded afterwards. Writing to $GITHUB_ENV makes the
        # variable visible to all subsequent steps of this job.
        # NOTE(review): steps run docker via sudo, which does not forward
        # this env var into the containers — confirm where it is consumed.
        run: echo "GLIBC_TUNABLES=glibc.rtld.optional_static_tls=2048" >> "$GITHUB_ENV"
      - name: Checking out this repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: v5.1
      - name: Build Docker
        run: |
          cd code/stable-diffusion-xl/
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_no_gil.dockerfile .
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_from_source_iree.dockerfile .

  run_container:
    name: "Run Container"
    needs: build_container
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      # -dt lets us run in an interactive, but detached mode which keeps the
      # container alive so the actual testing job can use it via docker exec.
      - name: Run Docker
        run: |
          cd code/stable-diffusion-xl/
          sudo docker run -dt --network=host --device=/dev/kfd --device=/dev/dri \
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            -v /data/mlperf_sdxl/data:/data \
            -v /data/mlperf_sdxl/models:/models \
            -v `pwd`/SDXL_inference/:/mlperf/harness \
            -e ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -w /mlperf/harness \
            --name ci_container_micro_shortfin_nogil_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }}
          sudo docker run -dt --network=host --device=/dev/kfd --device=/dev/dri \
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            -v /data/mlperf_sdxl/data:/data \
            -v /data/mlperf_sdxl/models:/models \
            -v `pwd`/SDXL_inference/:/mlperf/harness \
            -e ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -w /mlperf/harness \
            --name ci_container_micro_shortfin_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }}

  ml_perf_test:
    needs: run_container
    name: "ML Perf Harness Test"
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      # Server scenario uses a batch-size-2 scheduled-UNet config; Offline
      # uses batch size 32. Each precompiles into its own IREE build dir.
      - name: Precompile (Server)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-server" ci_container_micro_shortfin_${{ github.run_number }} ./precompile_model_shortfin.sh --td_spec attention_and_matmul_spec_gfx942_MI325.mlir --model_json sdxl_config_fp8_sched_unet_bs2.json --force_export True
      - name: Precompile (Offline)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-offline" ci_container_micro_shortfin_${{ github.run_number }} ./precompile_model_shortfin.sh --td_spec attention_and_matmul_spec_gfx942_MI325_bs32.mlir --model_json sdxl_config_fp8_sched_unet_bs32.json --force_export True
      - name: Run Server
        run: |
          sudo docker exec ci_container_micro_shortfin_${{ github.run_number }} ./run_scenario_server_MI325x_cpx.sh
      # The Offline scenario runs in the free-threaded container with the
      # GIL disabled (PYTHON_GIL=0).
      - name: Run Offline
        run: |
          sudo docker exec --env PYTHON_GIL=0 ci_container_micro_shortfin_nogil_${{ github.run_number }} ./run_scenario_offline_MI325x_cpx.sh
      - name: "Upload accuracy artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_offline
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/accuracy/accuracy.txt
      - name: "Upload accuracy artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_server
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/accuracy/accuracy.txt
      - name: "Upload perf artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_server
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/performance/run_1/mlperf_log_summary.json
      - name: "Upload perf artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_offline
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/performance/run_1/mlperf_log_summary.json

  container_cleanup:
    needs: ml_perf_test
    name: "Cleanup"
    # Run even when earlier jobs fail so the self-hosted runner is left clean.
    if: always()
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Cleanup Docker
        # Fix: the step shell is fail-fast, so if a container or image was
        # never created (e.g. the build job failed), the first failing
        # `docker stop` would abort the remaining cleanup. `|| true` makes
        # each command best-effort.
        run: |
          sudo docker stop ci_container_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker stop ci_container_micro_shortfin_${{ github.run_number }} || true
          sudo docker rm ci_container_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker rm ci_container_micro_shortfin_${{ github.run_number }} || true
          sudo docker rmi mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker rmi mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }} || true
      - name: Cleanup Submission artifacts
        run: |
          sudo rm -rf ./code/stable-diffusion-xl/SDXL_inference/