# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Weekly MLPerf SDXL v5.1 pipeline for the MI325x 8-GPU self-hosted runner:
# builds two harness images (a free-threaded "nogil" Python variant and a
# standard one), starts long-lived containers from them, runs the Server and
# Offline scenarios via `docker exec`, uploads accuracy/performance artifacts,
# and finally tears the containers and images down.
name: MLPerf SDXL v5.1 Weekly - MI325x

on:
  workflow_dispatch:
  pull_request:
  # Run once weekly on Sundays. NOTE(review): GitHub Actions evaluates cron
  # in UTC, so '0 0 * * 0' fires at midnight UTC (not midnight PST as the
  # original comment claimed).
  schedule:
    - cron: '0 0 * * 0'

jobs:
  build_container:
    name: "Build Container"
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Increase system vm map
        run: sudo sysctl -w vm.max_map_count=262144
      - name: Increase static TLS block limit
        # Fix: a plain `export` only affects the shell of that single step
        # and is discarded afterwards. Writing to $GITHUB_ENV makes the
        # variable visible to all subsequent steps of this job.
        # NOTE(review): steps run docker via sudo, which does not forward
        # this env var into the containers — confirm where it is consumed.
        run: echo "GLIBC_TUNABLES=glibc.rtld.optional_static_tls=2048" >> "$GITHUB_ENV"
      - name: Checking out this repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: v5.1
      - name: Build Docker
        run: |
          cd code/stable-diffusion-xl/
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_no_gil.dockerfile .
          sudo docker build --no-cache --platform linux/amd64 \
            --tag mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }} \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_from_source_iree.dockerfile .

  run_container:
    name: "Run Container"
    needs: build_container
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      # -dt lets us run in an interactive, but detached mode which keeps the
      # container alive so the actual testing job can use it via docker exec.
      - name: Run Docker
        run: |
          cd code/stable-diffusion-xl/
          sudo docker run -dt --network=host --device=/dev/kfd --device=/dev/dri \
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            -v /data/mlperf_sdxl/data:/data \
            -v /data/mlperf_sdxl/models:/models \
            -v `pwd`/SDXL_inference/:/mlperf/harness \
            -e ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -w /mlperf/harness \
            --name ci_container_micro_shortfin_nogil_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }}
          sudo docker run -dt --network=host --device=/dev/kfd --device=/dev/dri \
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            -v /data/mlperf_sdxl/data:/data \
            -v /data/mlperf_sdxl/models:/models \
            -v `pwd`/SDXL_inference/:/mlperf/harness \
            -e ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 \
            -w /mlperf/harness \
            --name ci_container_micro_shortfin_${{ github.run_number }} \
            mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }}

  ml_perf_test:
    needs: run_container
    name: "ML Perf Harness Test"
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      # Server scenario uses a batch-size-2 scheduled-UNet config; Offline
      # uses batch size 32. Each precompiles into its own IREE build dir.
      - name: Precompile (Server)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-server" ci_container_micro_shortfin_${{ github.run_number }} ./precompile_model_shortfin.sh --td_spec attention_and_matmul_spec_gfx942_MI325.mlir --model_json sdxl_config_fp8_sched_unet_bs2.json --force_export True
      - name: Precompile (Offline)
        run: |
          sudo docker exec --env IREE_BUILD_DIR="/iree/build-offline" ci_container_micro_shortfin_${{ github.run_number }} ./precompile_model_shortfin.sh --td_spec attention_and_matmul_spec_gfx942_MI325_bs32.mlir --model_json sdxl_config_fp8_sched_unet_bs32.json --force_export True
      - name: Run Server
        run: |
          sudo docker exec ci_container_micro_shortfin_${{ github.run_number }} ./run_scenario_server_MI325x_cpx.sh
      # The Offline scenario runs in the free-threaded container with the
      # GIL disabled (PYTHON_GIL=0).
      - name: Run Offline
        run: |
          sudo docker exec --env PYTHON_GIL=0 ci_container_micro_shortfin_nogil_${{ github.run_number }} ./run_scenario_offline_MI325x_cpx.sh
      - name: "Upload accuracy artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_offline
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/accuracy/accuracy.txt
      - name: "Upload accuracy artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact_server
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/accuracy/accuracy.txt
      - name: "Upload perf artifact (server)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_server
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Server/performance/run_1/mlperf_log_summary.json
      - name: "Upload perf artifact (offline)"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: perf_artifact_offline
          path: SDXL_inference/Submission/closed/AMD/results/8xMI325x_2xEPYC-9655/stable-diffusion-xl/Offline/performance/run_1/mlperf_log_summary.json

  container_cleanup:
    needs: ml_perf_test
    name: "Cleanup"
    # Run even when earlier jobs fail so the self-hosted runner is left clean.
    if: always()
    runs-on: nod-mi325-8gpu-mlperf
    steps:
      - name: Cleanup Docker
        # Fix: the step shell is fail-fast, so if a container or image was
        # never created (e.g. the build job failed), the first failing
        # `docker stop` would abort the remaining cleanup. `|| true` makes
        # each command best-effort.
        run: |
          sudo docker stop ci_container_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker stop ci_container_micro_shortfin_${{ github.run_number }} || true
          sudo docker rm ci_container_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker rm ci_container_micro_shortfin_${{ github.run_number }} || true
          sudo docker rmi mlperf_rocm_sdxl:ci_micro_shortfin_nogil_${{ github.run_number }} || true
          sudo docker rmi mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }} || true
      - name: Cleanup Submission artifacts
        run: |
          sudo rm -rf ./code/stable-diffusion-xl/SDXL_inference/