Fix Confidence and token timestamps for ASR #474
Workflow file for this run
name: ASR Benchmark

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

jobs:
  asr-benchmark:
    name: ASR Benchmark
    runs-on: macos-14
    permissions:
      contents: read
      pull-requests: write
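    # pull-requests: write is needed so the github-script step below can
    # post/update the benchmark results comment on the PR.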
    steps:
      - uses: actions/checkout@v5
      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"
      - name: Install ffmpeg
        run: |
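          # brew exits non-zero when the formula is already present via some
          # other path, so failures here are downgraded to log lines.
          # (Assumption: the benchmark CLI relies on ffmpeg for audio
          # decoding/conversion; the workflow itself does not say why.)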
          brew install ffmpeg || echo "ffmpeg may already be installed"
          ffmpeg -version || echo "ffmpeg not available"
      - name: Cache Dependencies
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Application Support/FluidAudio/Models/parakeet-tdt-0.6b-v3-coreml
            ~/Library/Application Support/FluidAudio/Datasets/LibriSpeech
          key: ${{ runner.os }}-asr-${{ hashFiles('Package.resolved') }}-v6
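          # The trailing -v6 is a manual cache-buster: bump it to invalidate
          # cached models/datasets. A restore-keys fallback would allow partial
          # reuse when Package.resolved changes (hypothetical addition, not
          # part of this workflow):
          #   restore-keys: |
          #     ${{ runner.os }}-asr-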
      - name: Build
        run: swift build -c release
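      # swift run -c release in the next step should reuse these build
      # products, so the benchmarks start without a second full compile.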
      - name: Run Benchmarks
        id: benchmark
        run: |
          MAX_FILES="25"
          BENCHMARK_START=$(date +%s)
          # Fail a pipeline if any stage fails, so `tee` below cannot mask
          # a non-zero exit code from `swift run`
          set -o pipefail
          # Function to run benchmark with error capture
          run_benchmark() {
            local SUBSET=$1
            local MAX=$2
            local OUTPUT=$3
            local EXTRA_ARGS="${4:-}"
            echo "========================================="
            echo "Running ASR benchmark: $SUBSET (max $MAX files)"
            echo "Output: $OUTPUT"
            echo "Extra args: $EXTRA_ARGS"
            echo "========================================="
            if swift run -c release fluidaudio asr-benchmark \
                --subset "$SUBSET" --max-files "$MAX" \
                --auto-download --output "$OUTPUT" $EXTRA_ARGS 2>&1 | tee benchmark_log.txt; then
              echo "✅ Benchmark $SUBSET completed successfully"
              return 0
            else
              echo "❌ Benchmark $SUBSET FAILED with exit code $?"
              echo "Last 50 lines of output:"
              tail -50 benchmark_log.txt
              # Continue with other benchmarks even if one fails
              return 1
            fi
          }
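          # Note: each invocation overwrites benchmark_log.txt, so the failure
          # tail above always reflects the most recent run only.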
          # Run benchmarks with error capture
          run_benchmark "test-clean" "$MAX_FILES" "asr_results_clean.json" || CLEAN_FAILED=1
          run_benchmark "test-other" "$MAX_FILES" "asr_results_other.json" || OTHER_FAILED=1
          # Run streaming benchmark (smaller file count for faster CI)
          run_benchmark "test-clean" "5" "asr_results_streaming.json" "--test-streaming --chunk-duration 0.5" || STREAMING_FAILED=1
          # Extract metrics with error handling
          if [ -f asr_results_clean.json ]; then
            CLEAN_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_clean.json 2>/dev/null)
            CLEAN_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_clean.json 2>/dev/null)
            CLEAN_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_clean.json 2>/dev/null)
            CLEAN_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_clean.json 2>/dev/null)
            CLEAN_RTFx=$(jq -r '.summary.medianRTFx' asr_results_clean.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$CLEAN_WER_AVG" != "null" ] && [ -n "$CLEAN_WER_AVG" ] && CLEAN_WER_AVG=$(printf "%.2f" "$CLEAN_WER_AVG") || CLEAN_WER_AVG="N/A"
            [ "$CLEAN_WER_MED" != "null" ] && [ -n "$CLEAN_WER_MED" ] && CLEAN_WER_MED=$(printf "%.2f" "$CLEAN_WER_MED") || CLEAN_WER_MED="N/A"
            [ "$CLEAN_RTFx" != "null" ] && [ -n "$CLEAN_RTFx" ] && CLEAN_RTFx=$(printf "%.2f" "$CLEAN_RTFx") || CLEAN_RTFx="N/A"
          fi
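          # jq's '* 100' converts the fractional WER in the JSON to a percent;
          # the cond && format || fallback chains above (and in the two blocks
          # below) degrade to "N/A" whenever a field is missing or null.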
          if [ -f asr_results_other.json ]; then
            OTHER_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_other.json 2>/dev/null)
            OTHER_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_other.json 2>/dev/null)
            OTHER_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_other.json 2>/dev/null)
            OTHER_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_other.json 2>/dev/null)
            OTHER_RTFx=$(jq -r '.summary.medianRTFx' asr_results_other.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$OTHER_WER_AVG" != "null" ] && [ -n "$OTHER_WER_AVG" ] && OTHER_WER_AVG=$(printf "%.2f" "$OTHER_WER_AVG") || OTHER_WER_AVG="N/A"
            [ "$OTHER_WER_MED" != "null" ] && [ -n "$OTHER_WER_MED" ] && OTHER_WER_MED=$(printf "%.2f" "$OTHER_WER_MED") || OTHER_WER_MED="N/A"
            [ "$OTHER_RTFx" != "null" ] && [ -n "$OTHER_RTFx" ] && OTHER_RTFx=$(printf "%.2f" "$OTHER_RTFx") || OTHER_RTFx="N/A"
          fi
          if [ -f asr_results_streaming.json ]; then
            STREAMING_WER=$(jq -r '.summary.averageWER * 100' asr_results_streaming.json 2>/dev/null)
            STREAMING_RTFx=$(jq -r '.summary.medianRTFx' asr_results_streaming.json 2>/dev/null)
            STREAMING_AVG_CHUNK=$(jq -r '.summary.streaming.avgChunkProcessingTime' asr_results_streaming.json 2>/dev/null)
            STREAMING_MAX_CHUNK=$(jq -r '.summary.streaming.maxChunkProcessingTime' asr_results_streaming.json 2>/dev/null)
            STREAMING_CHUNKS=$(jq -r '.summary.streaming.totalChunksProcessed' asr_results_streaming.json 2>/dev/null)
            STREAMING_FIRST_TOKEN=$(jq -r '.summary.streaming.avgFirstTokenLatency // "N/A"' asr_results_streaming.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$STREAMING_WER" != "null" ] && [ -n "$STREAMING_WER" ] && STREAMING_WER=$(printf "%.2f" "$STREAMING_WER") || STREAMING_WER="N/A"
            [ "$STREAMING_RTFx" != "null" ] && [ -n "$STREAMING_RTFx" ] && STREAMING_RTFx=$(printf "%.2f" "$STREAMING_RTFx") || STREAMING_RTFx="N/A"
            [ "$STREAMING_AVG_CHUNK" != "null" ] && [ -n "$STREAMING_AVG_CHUNK" ] && STREAMING_AVG_CHUNK=$(printf "%.3f" "$STREAMING_AVG_CHUNK") || STREAMING_AVG_CHUNK="N/A"
            [ "$STREAMING_MAX_CHUNK" != "null" ] && [ -n "$STREAMING_MAX_CHUNK" ] && STREAMING_MAX_CHUNK=$(printf "%.3f" "$STREAMING_MAX_CHUNK") || STREAMING_MAX_CHUNK="N/A"
            [ "$STREAMING_FIRST_TOKEN" != "null" ] && [ -n "$STREAMING_FIRST_TOKEN" ] && [ "$STREAMING_FIRST_TOKEN" != "N/A" ] && STREAMING_FIRST_TOKEN=$(printf "%.3f" "$STREAMING_FIRST_TOKEN")
          fi
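          # jq's // operator substitutes "N/A" when avgFirstTokenLatency is
          # null or absent (it is only reported in streaming mode), which is
          # why that variable needs the extra != "N/A" guard above.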
          # Publish metrics for the PR-comment step
          echo "CLEAN_WER_AVG=${CLEAN_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_WER_MED=${CLEAN_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_RTFx=${CLEAN_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_AVG=${OTHER_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_MED=${OTHER_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_RTFx=${OTHER_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          # Streaming metrics
          echo "STREAMING_WER=${STREAMING_WER:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_RTFx=${STREAMING_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_AVG_CHUNK=${STREAMING_AVG_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_MAX_CHUNK=${STREAMING_MAX_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_CHUNKS=${STREAMING_CHUNKS:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_FIRST_TOKEN=${STREAMING_FIRST_TOKEN:-N/A}" >> "$GITHUB_OUTPUT"
          # Sample the clock once so minutes and seconds stay consistent
          ELAPSED=$(( $(date +%s) - BENCHMARK_START ))
          EXECUTION_TIME="$(( ELAPSED / 60 ))m$(( ELAPSED % 60 ))s"
          echo "EXECUTION_TIME=$EXECUTION_TIME" >> "$GITHUB_OUTPUT"
          echo "FILES_COUNT=$MAX_FILES" >> "$GITHUB_OUTPUT"
          # Report failures summary
          if [ -n "$CLEAN_FAILED" ] || [ -n "$OTHER_FAILED" ] || [ -n "$STREAMING_FAILED" ]; then
            echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> "$GITHUB_OUTPUT"
            echo "⚠️ Some benchmarks failed:"
            [ -n "$CLEAN_FAILED" ] && echo "  - test-clean benchmark failed"
            [ -n "$OTHER_FAILED" ] && echo "  - test-other benchmark failed"
            [ -n "$STREAMING_FAILED" ] && echo "  - streaming benchmark failed"
            # Don't exit with an error, so the PR comment still gets posted
          else
            echo "BENCHMARK_STATUS=SUCCESS" >> "$GITHUB_OUTPUT"
            echo "✅ All benchmarks completed successfully"
          fi
      - name: Comment PR
        if: github.event_name == 'pull_request'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const benchmarkStatus = '${{ steps.benchmark.outputs.BENCHMARK_STATUS }}';
            const statusEmoji = benchmarkStatus === 'SUCCESS' ? '✅' : '⚠️';
            const statusText = benchmarkStatus === 'SUCCESS' ? 'All benchmarks passed' : 'Some benchmarks failed (see logs)';
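            // The steps.benchmark.outputs.* expression placeholders below are
            // substituted by the Actions runner before this script executes,
            // so the JavaScript only ever sees string literals. parseFloat('N/A')
            // yields NaN, which fails the WER threshold checks and falls
            // through to the N/A / warning branches.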
            const body = `## ASR Benchmark Results ${statusEmoji}

            **Status:** ${statusText}

            | Dataset | WER Avg | WER Med | RTFx | Status |
            |---------|---------|---------|------|--------|
            | test-clean | ${{ steps.benchmark.outputs.CLEAN_WER_AVG }}% | ${{ steps.benchmark.outputs.CLEAN_WER_MED }}% | ${{ steps.benchmark.outputs.CLEAN_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.CLEAN_WER_AVG }}') < 10 ? '✅' : '${{ steps.benchmark.outputs.CLEAN_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |
            | test-other | ${{ steps.benchmark.outputs.OTHER_WER_AVG }}% | ${{ steps.benchmark.outputs.OTHER_WER_MED }}% | ${{ steps.benchmark.outputs.OTHER_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.OTHER_WER_AVG }}') < 20 ? '✅' : '${{ steps.benchmark.outputs.OTHER_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |

            ### Streaming Infrastructure Test

            | Metric | Value | Description |
            |--------|-------|-------------|
            | WER | ${{ steps.benchmark.outputs.STREAMING_WER }}% | Word Error Rate in streaming mode |
            | RTFx | ${{ steps.benchmark.outputs.STREAMING_RTFx }}x | Streaming real-time factor |
            | Avg Chunk Time | ${{ steps.benchmark.outputs.STREAMING_AVG_CHUNK }}s | Average time to process each chunk |
            | Max Chunk Time | ${{ steps.benchmark.outputs.STREAMING_MAX_CHUNK }}s | Maximum chunk processing time |
            | First Token | ${{ steps.benchmark.outputs.STREAMING_FIRST_TOKEN }}s | Latency to first transcription token |
            | Total Chunks | ${{ steps.benchmark.outputs.STREAMING_CHUNKS }} | Number of chunks processed |

            <sub>*Streaming test uses 5 files with 0.5s chunks to simulate real-time audio streaming*</sub>

            <sub>${{ steps.benchmark.outputs.FILES_COUNT }} files per dataset • Test runtime: ${{ steps.benchmark.outputs.EXECUTION_TIME }} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} ET</sub>

            <sub>**RTFx** = Real-Time Factor (higher is better) • Calculated as: Total audio duration ÷ Total processing time<br>Processing time includes: Model inference on Apple Neural Engine, audio preprocessing, state resets between files, token-to-text conversion, and file I/O<br>Example: RTFx of 2.0x means 10 seconds of audio processed in 5 seconds (2x faster than real-time)</sub>

            ### Expected RTFx Performance on Physical M1 Hardware

            - **M1 Mac: ~28x (clean), ~25x (other)**
            - **CI shows ~0.5-3x due to virtualization limitations**

            <sub>Testing methodology follows the [HuggingFace Open ASR Leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard)</sub>

            <!-- fluidaudio-benchmark-asr -->`;
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });
            const existing = comments.find(c =>
              c.body.includes('<!-- fluidaudio-benchmark-asr -->')
            );
            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: body
              });
            }
      - name: Upload Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: asr-results
          path: asr_results_*.json