# MLPerf SDXL v5.1 Weekly - MI325x #21
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
name: MLPerf SDXL v5.1 Weekly - MI325x

# Triggers: manual dispatch, every pull request, and a weekly schedule.
on:
  workflow_dispatch:
  pull_request:
  # Run once weekly at 00:00 UTC on Sundays. GitHub Actions evaluates cron
  # schedules in UTC, so this is Saturday afternoon in US Pacific time.
  schedule:
    - cron: '0 0 * * 0'
# Four jobs chained with `needs`, all on the same self-hosted runner label:
#   build_container -> run_container -> ml_perf_test -> container_cleanup
# Images and containers are tagged with the workflow run number so every job
# can address the ones created for this run.
jobs:
  build_container:
    name: "Build Container"
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Increase system vm map
        run: sudo sysctl -w vm.max_map_count=262144

      # A plain `export` only lives for the shell of this one step, so the
      # variable is written to $GITHUB_ENV to make it visible to the later
      # steps of this job.
      # NOTE(review): `sudo` and the containers do not inherit the runner
      # environment by default -- confirm where this tunable is needed.
      - name: Increase static TLS block limit
        run: echo "GLIBC_TUNABLES=glibc.rtld.optional_static_tls=2048" >> "$GITHUB_ENV"

      - name: Checking out this repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: v5.1

      # Builds two images from the same context: the no-GIL harness and the
      # harness that builds IREE from source.
      - name: Build Docker
        run: |
          cd code/stable-diffusion-xl/
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_no_gil.dockerfile .
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_from_source_iree.dockerfile .

  run_container:
    name: "Run Container"
    needs: build_container
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      # NOTE(review): this job has no checkout step, so `cd` below relies on
      # the workspace left behind by build_container on the self-hosted
      # runner -- confirm both jobs always land on the same machine.
      #
      # -dt lets us run in an interactive, but detached mode which keeps the
      # container alive so the actual testing job can use it using docker exec
      - name: Run Docker
        run: |
          cd code/stable-diffusion-xl/
          # Comma-separated device indices 0,1,...,63.
          devices="$(seq -s, 0 63)"
          # Flags shared by both containers.
          common_args=(
            -dt --network=host --device=/dev/kfd --device=/dev/dri
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
            -v /data/mlperf_sdxl/data:/data
            -v /data/mlperf_sdxl/models:/models
            -v "$(pwd)/SDXL_inference/:/mlperf/harness"
            -e "ROCR_VISIBLE_DEVICES=${devices}"
            -e "HIP_VISIBLE_DEVICES=${devices}"
            -w /mlperf/harness
          )
          sudo docker run "${common_args[@]}" \
            --name ci_container_micro_shortfin_nogil_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }}
          sudo docker run "${common_args[@]}" \
            --name ci_container_micro_shortfin_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }}

  ml_perf_test:
    name: "ML Perf Harness Test"
    needs: run_container
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Precompile (Server)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-server" \
            ci_container_micro_shortfin_${{ github.run_number }} \
            ./precompile_model_shortfin.sh \
            --td_spec attention_and_matmul_spec_gfx942_MI325.mlir \
            --model_json sdxl_config_fp8_sched_unet_bs2.json \
            --force_export True

      - name: Precompile (Offline)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-offline" \
            ci_container_micro_shortfin_${{ github.run_number }} \
            ./precompile_model_shortfin.sh \
            --td_spec attention_and_matmul_spec_gfx942_MI325_bs32.mlir \
            --model_json sdxl_config_fp8_sched_unet_bs32.json \
            --force_export True

      - name: Run Server
        run: |
          sudo docker exec ci_container_micro_shortfin_${{ github.run_number }} \
            ./run_scenario_server_MI325x_cpx.sh

      # The offline scenario runs in the no-GIL container with PYTHON_GIL=0.
      - name: Run Offline
        run: |
          sudo docker exec --env PYTHON_GIL=0 \
            ci_container_micro_shortfin_nogil_${{ github.run_number }} \
            ./run_scenario_offline_MI325x_cpx.sh

      # Artifact paths are relative to the workspace root. The containers
      # bind-mount code/stable-diffusion-xl/SDXL_inference as their working
      # directory, so the paths below carry that prefix.
      - name: "Upload accuracy artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_offline
          path: code/stable-diffusion-xl/SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/accuracy/accuracy.txt

      - name: "Upload accuracy artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_server
          path: code/stable-diffusion-xl/SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/accuracy/accuracy.txt

      - name: "Upload perf artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_server
          path: code/stable-diffusion-xl/SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/performance/run_1/mlperf_log_summary.json

      - name: "Upload perf artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_offline
          path: code/stable-diffusion-xl/SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/performance/run_1/mlperf_log_summary.json

  container_cleanup:
    name: "Cleanup"
    needs: ml_perf_test
    # Run even when an earlier job failed or was cancelled, so containers and
    # images from this run are not left on the runner.
    if: always()
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Cleanup Docker
        run: |
          # Best-effort: the default shell aborts on the first failing
          # command, so a container that was never created must not prevent
          # the remaining containers and images from being removed.
          for container in \
            ci_container_micro_shortfin_nogil_${{ github.run_number }} \
            ci_container_micro_shortfin_${{ github.run_number }}; do
            sudo docker stop "$container" || echo "::warning::could not stop $container"
            sudo docker rm "$container" || echo "::warning::could not remove $container"
          done
          for image in \
            mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }}; do
            sudo docker rmi "$image" || echo "::warning::could not remove image $image"
          done

      # Always remove the harness directory, even if the Docker cleanup
      # step above failed.
      - name: Cleanup Submission artifacts
        if: always()
        run: |
          sudo rm -rf ./code/stable-diffusion-xl/SDXL_inference/