# ML Perf Micro Shortfin - workflow run #6 (page header captured together with the workflow file)

# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
name: ML Perf Micro Shortfin

# Triggers: manual dispatch, every pull request, and a nightly schedule.
on:
  workflow_dispatch:
  pull_request:
  schedule:
    # GitHub evaluates cron expressions in UTC: 03:00 UTC is 7:00 PM PST
    # (8:00 PM PDT) on the previous calendar day.
    - cron: '0 3 * * *'

# Values shared by every job. They are exposed to each `run` step as ordinary
# environment variables, so the scripts below reference them as shell variables.
env:
  # Image and container are suffixed with the run number so that all three
  # jobs of one workflow run address the same image/container pair.
  IMAGE_TAG: mlperf_rocm_sdxl:ci_micro_shortfin_${{ github.run_number }}
  CONTAINER_NAME: ci_container_micro_shortfin_${{ github.run_number }}
  # Device indices handed to ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES and to
  # the harness `--devices` flag.
  GPU_DEVICES: "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"

jobs:
  # Builds the harness image and starts a long-lived container from it.
  start_container:
    name: "Start Container"
    runs-on: linux-mi300-8gpu-ossci-nod-ai
    steps:
      - name: Pre Checkout MI300 Step
        run: sudo chmod -R 777 ~/actions-runner/_work
      - name: Increase system vm map
        run: sudo sysctl -w vm.max_map_count=262144
      - name: Increase static TLS block limit
        # A bare `export` only lives as long as this step's shell. Writing to
        # GITHUB_ENV makes the variable visible to the remaining steps of this
        # job, and `docker run -e GLIBC_TUNABLES` below forwards it into the
        # container where the workloads actually run.
        run: |
          echo "GLIBC_TUNABLES=glibc.rtld.optional_static_tls=2048" >> "$GITHUB_ENV"
      - name: Checking out this repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: staging-v5.1
      # `-dt` runs the container detached with a TTY allocated, which keeps it
      # alive so the testing job can reach it through `docker exec`.
      - name: Build and run Docker
        run: |
          echo "always" | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
          cd code/stable-diffusion-xl
          docker build --no-cache --platform linux/amd64 \
            --tag "${IMAGE_TAG}" \
            --file SDXL_inference/sdxl_harness_rocm_shortfin_from_source_iree.dockerfile .
          docker run -dt --network=host --device=/dev/kfd --device=/dev/dri \
            --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            -v /data/mlperf_sdxl/data:/data \
            -v /data/mlperf_sdxl/models:/models \
            -v "$(pwd)/SDXL_inference/:/mlperf/harness" \
            -e ROCR_VISIBLE_DEVICES="${GPU_DEVICES}" \
            -e HIP_VISIBLE_DEVICES="${GPU_DEVICES}" \
            -e GLIBC_TUNABLES \
            -w /mlperf/harness \
            --name "${CONTAINER_NAME}" \
            "${IMAGE_TAG}"

  # Drives the harness inside the container started by `start_container`.
  # NOTE(review): this job uses a different runner label than start_container
  # yet relies on the container that job started - confirm both labels resolve
  # to the same host.
  # NOTE(review): the host-side `cat` / upload paths below are relative to this
  # job's working directory, while the bind mount was created from
  # code/stable-diffusion-xl/SDXL_inference - confirm the two line up.
  ml_perf_test:
    needs: start_container
    name: "ML Perf Harness Test"
    runs-on: nodai-amdgpu-mi300-x86-64-perf
    steps:
      - name: Print Hashes of Dependencies
        run: |
          docker exec "${CONTAINER_NAME}" ./get_hashes.sh
          cat SDXL_inference/hashes.txt
      - name: "Download the official model"
        run: |
          docker exec "${CONTAINER_NAME}" ./download_model.sh
      - name: "Download the official data"
        run: |
          docker exec "${CONTAINER_NAME}" rm -rf /data/coco
          docker exec "${CONTAINER_NAME}" ./download_data.sh
      - name: Preprocess the dataset
        run: |
          docker exec "${CONTAINER_NAME}" python3.11 preprocess_data.py
      - name: Compile the shark engines
        # Previously generated files and binaries are removed first so the
        # precompile script starts from an empty output location.
        run: |
          docker exec "${CONTAINER_NAME}" rm -rf /models/SDXL/official_pytorch/fp16/stable_diffusion_fp16/genfiles
          docker exec "${CONTAINER_NAME}" rm -rf /models/SDXL/official_pytorch/fp16/stable_diffusion_fp16/bin
          docker exec "${CONTAINER_NAME}" ./precompile_model_shortfin.sh \
            --gpu_batch_size 16 \
            --vae_batch_size 1 \
            --td_spec attention_and_matmul_spec_gfx942_MI325.mlir \
            --model_json sdxl_config_fp8_sched_unet_bs16.json
      - name: Run Perf
        run: |
          docker exec "${CONTAINER_NAME}" python3.11 harness.py \
            --devices "${GPU_DEVICES}" \
            --gpu_batch_size 16 \
            --vae_batch_size 1 \
            --cores_per_devices 2 \
            --workers_per_device 1 \
            --fibers_per_device 1 \
            --qps 16 \
            --td_spec=attention_and_matmul_spec_gfx942_MI325.mlir \
            --model_json=sdxl_config_fp8_sched_unet_bs16.json \
            --scenario Offline \
            --test_mode PerformanceOnly \
            --logfile_outdir output_offline_perf \
            --verbose True
      - name: Print Perf
        run: |
          cat SDXL_inference/output_offline_perf/mlperf_log_summary.txt
      - name: "Upload perf artifact"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: performance_artifact
          path: SDXL_inference/output_offline_perf/mlperf_log_summary.txt
      - name: Run Accuracy
        # Same invocation as "Run Perf" except for the test mode and the log
        # output directory.
        run: |
          docker exec "${CONTAINER_NAME}" python3.11 harness.py \
            --devices "${GPU_DEVICES}" \
            --gpu_batch_size 16 \
            --vae_batch_size 1 \
            --cores_per_devices 2 \
            --workers_per_device 1 \
            --fibers_per_device 1 \
            --qps 16 \
            --td_spec=attention_and_matmul_spec_gfx942_MI325.mlir \
            --model_json=sdxl_config_fp8_sched_unet_bs16.json \
            --scenario Offline \
            --test_mode AccuracyOnly \
            --logfile_outdir output_offline_acc \
            --verbose True
      - name: Setup accuracy venv and check
        run: |
          docker exec "${CONTAINER_NAME}" ./setup_accuracy_env.sh
          docker exec "${CONTAINER_NAME}" ./check_accuracy_scores.sh output_offline_acc/mlperf_log_accuracy.json
      - name: Print Accuracy
        run: |
          cat SDXL_inference/output_offline_acc/coco-results.json
      - name: "Upload accuracy artifact"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: accuracy_artifact
          path: SDXL_inference/output_offline_acc/coco-results.json

  # Removes the container and image; `if: always()` makes this run even when
  # the test job failed or was skipped.
  container_cleanup:
    needs: ml_perf_test
    name: "Docker Cleanup"
    if: always()
    runs-on: nodai-amdgpu-mi300-x86-64-perf
    steps:
      - name: Cleanup Docker
        # Each command is best-effort: the default shell aborts on the first
        # failing command, so a failed `docker stop` (e.g. the container never
        # started) would otherwise skip the removals and leak the image.
        run: |
          docker stop "${CONTAINER_NAME}" || echo "::warning::could not stop ${CONTAINER_NAME}"
          docker rm "${CONTAINER_NAME}" || echo "::warning::could not remove ${CONTAINER_NAME}"
          docker rmi "${IMAGE_TAG}" || echo "::warning::could not remove image ${IMAGE_TAG}"