Fix Confidence and token timestamps for ASR #474
Workflow file for this run

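# Runs the FluidAudio ASR benchmarks (LibriSpeech test-clean / test-other plus a
# streaming test) on every pull request to main, and posts the results as a PR comment.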
name: ASR Benchmark

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

jobs:
  asr-benchmark:
    name: ASR Benchmark
    runs-on: macos-14
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v5
      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"
      - name: Install ffmpeg
        run: |
          brew install ffmpeg || echo "ffmpeg may already be installed"
          ffmpeg -version || echo "ffmpeg not available"
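      # Cache Swift build products plus the downloaded Parakeet model and
      # LibriSpeech data; bump the trailing "-v6" in the key to force a refresh.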
      - name: Cache Dependencies
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Application Support/FluidAudio/Models/parakeet-tdt-0.6b-v3-coreml
            ~/Library/Application Support/FluidAudio/Datasets/LibriSpeech
          key: ${{ runner.os }}-asr-${{ hashFiles('Package.resolved') }}-v6
      - name: Build
        run: swift build -c release
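      # Run the three benchmark configurations, parse the result JSON with jq,
      # and export the metrics as step outputs for the PR comment step below.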
      - name: Run Benchmarks
        id: benchmark
        run: |
          MAX_FILES="25"
          BENCHMARK_START=$(date +%s)
          # Make pipelines fail when any stage fails, so `swift run ... | tee` surfaces errors
          set -o pipefail
          # Function to run one benchmark configuration with error capture
          run_benchmark() {
            local SUBSET=$1
            local MAX=$2
            local OUTPUT=$3
            local EXTRA_ARGS="${4:-}"
            echo "========================================="
            echo "Running ASR benchmark: $SUBSET (max $MAX files)"
            echo "Output: $OUTPUT"
            echo "Extra args: $EXTRA_ARGS"
            echo "========================================="
            # $EXTRA_ARGS is intentionally unquoted so it word-splits into separate flags
            if swift run -c release fluidaudio asr-benchmark \
              --subset "$SUBSET" --max-files "$MAX" \
              --auto-download --output "$OUTPUT" $EXTRA_ARGS 2>&1 | tee benchmark_log.txt; then
              echo "✅ Benchmark $SUBSET completed successfully"
              return 0
            else
              # Capture the pipeline's exit status before another command overwrites $?
              local STATUS=$?
              echo "❌ Benchmark $SUBSET FAILED with exit code $STATUS"
              echo "Last 50 lines of output:"
              tail -50 benchmark_log.txt
              # Continue with other benchmarks even if one fails
              return 1
            fi
          }
          # Run benchmarks with error capture
          run_benchmark "test-clean" "$MAX_FILES" "asr_results_clean.json" || CLEAN_FAILED=1
          run_benchmark "test-other" "$MAX_FILES" "asr_results_other.json" || OTHER_FAILED=1
          # Run streaming benchmark (smaller file count for faster CI)
          run_benchmark "test-clean" "5" "asr_results_streaming.json" "--test-streaming --chunk-duration 0.5" || STREAMING_FAILED=1
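          # The jq paths below assume result files shaped roughly like this
          # (inferred from the fields being read here, not a documented schema):
          #   { "summary": { "averageWER": 0.05, "medianWER": 0.04,
          #                  "totalAudioDuration": 1450.2, "totalProcessingTime": 61.3,
          #                  "medianRTFx": 23.7, "streaming": { ... } } }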
          # Extract metrics with error handling; the trailing `|| true` keeps a
          # failed jq from aborting the step under the runner's default `bash -e`
          if [ -f asr_results_clean.json ]; then
            CLEAN_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_clean.json 2>/dev/null || true)
            CLEAN_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_clean.json 2>/dev/null || true)
            CLEAN_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_clean.json 2>/dev/null || true)
            CLEAN_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_clean.json 2>/dev/null || true)
            CLEAN_RTFx=$(jq -r '.summary.medianRTFx' asr_results_clean.json 2>/dev/null || true)
            # Format values only if they exist and are not null
            [ "$CLEAN_WER_AVG" != "null" ] && [ -n "$CLEAN_WER_AVG" ] && CLEAN_WER_AVG=$(printf "%.2f" "$CLEAN_WER_AVG") || CLEAN_WER_AVG="N/A"
            [ "$CLEAN_WER_MED" != "null" ] && [ -n "$CLEAN_WER_MED" ] && CLEAN_WER_MED=$(printf "%.2f" "$CLEAN_WER_MED") || CLEAN_WER_MED="N/A"
            [ "$CLEAN_RTFx" != "null" ] && [ -n "$CLEAN_RTFx" ] && CLEAN_RTFx=$(printf "%.2f" "$CLEAN_RTFx") || CLEAN_RTFx="N/A"
          fi
          if [ -f asr_results_other.json ]; then
            OTHER_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_other.json 2>/dev/null || true)
            OTHER_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_other.json 2>/dev/null || true)
            OTHER_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_other.json 2>/dev/null || true)
            OTHER_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_other.json 2>/dev/null || true)
            OTHER_RTFx=$(jq -r '.summary.medianRTFx' asr_results_other.json 2>/dev/null || true)
            # Format values only if they exist and are not null
            [ "$OTHER_WER_AVG" != "null" ] && [ -n "$OTHER_WER_AVG" ] && OTHER_WER_AVG=$(printf "%.2f" "$OTHER_WER_AVG") || OTHER_WER_AVG="N/A"
            [ "$OTHER_WER_MED" != "null" ] && [ -n "$OTHER_WER_MED" ] && OTHER_WER_MED=$(printf "%.2f" "$OTHER_WER_MED") || OTHER_WER_MED="N/A"
            [ "$OTHER_RTFx" != "null" ] && [ -n "$OTHER_RTFx" ] && OTHER_RTFx=$(printf "%.2f" "$OTHER_RTFx") || OTHER_RTFx="N/A"
          fi
          if [ -f asr_results_streaming.json ]; then
            STREAMING_WER=$(jq -r '.summary.averageWER * 100' asr_results_streaming.json 2>/dev/null || true)
            STREAMING_RTFx=$(jq -r '.summary.medianRTFx' asr_results_streaming.json 2>/dev/null || true)
            STREAMING_AVG_CHUNK=$(jq -r '.summary.streaming.avgChunkProcessingTime' asr_results_streaming.json 2>/dev/null || true)
            STREAMING_MAX_CHUNK=$(jq -r '.summary.streaming.maxChunkProcessingTime' asr_results_streaming.json 2>/dev/null || true)
            STREAMING_CHUNKS=$(jq -r '.summary.streaming.totalChunksProcessed' asr_results_streaming.json 2>/dev/null || true)
            STREAMING_FIRST_TOKEN=$(jq -r '.summary.streaming.avgFirstTokenLatency // "N/A"' asr_results_streaming.json 2>/dev/null || true)
            # Format values only if they exist and are not null
            [ "$STREAMING_WER" != "null" ] && [ -n "$STREAMING_WER" ] && STREAMING_WER=$(printf "%.2f" "$STREAMING_WER") || STREAMING_WER="N/A"
            [ "$STREAMING_RTFx" != "null" ] && [ -n "$STREAMING_RTFx" ] && STREAMING_RTFx=$(printf "%.2f" "$STREAMING_RTFx") || STREAMING_RTFx="N/A"
            [ "$STREAMING_AVG_CHUNK" != "null" ] && [ -n "$STREAMING_AVG_CHUNK" ] && STREAMING_AVG_CHUNK=$(printf "%.3f" "$STREAMING_AVG_CHUNK") || STREAMING_AVG_CHUNK="N/A"
            [ "$STREAMING_MAX_CHUNK" != "null" ] && [ -n "$STREAMING_MAX_CHUNK" ] && STREAMING_MAX_CHUNK=$(printf "%.3f" "$STREAMING_MAX_CHUNK") || STREAMING_MAX_CHUNK="N/A"
            [ "$STREAMING_FIRST_TOKEN" != "null" ] && [ -n "$STREAMING_FIRST_TOKEN" ] && [ "$STREAMING_FIRST_TOKEN" != "N/A" ] && STREAMING_FIRST_TOKEN=$(printf "%.3f" "$STREAMING_FIRST_TOKEN")
          fi
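          # key=value lines appended to the $GITHUB_OUTPUT file become step
          # outputs, readable by later steps as steps.benchmark.outputs.<KEY>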
          # Output metrics
          echo "CLEAN_WER_AVG=${CLEAN_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_WER_MED=${CLEAN_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_RTFx=${CLEAN_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_AVG=${OTHER_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_MED=${OTHER_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_RTFx=${OTHER_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          # Streaming metrics
          echo "STREAMING_WER=${STREAMING_WER:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_RTFx=${STREAMING_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_AVG_CHUNK=${STREAMING_AVG_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_MAX_CHUNK=${STREAMING_MAX_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_CHUNKS=${STREAMING_CHUNKS:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_FIRST_TOKEN=${STREAMING_FIRST_TOKEN:-N/A}" >> "$GITHUB_OUTPUT"
          # Sample the clock once so the minutes and seconds parts agree
          ELAPSED=$(( $(date +%s) - BENCHMARK_START ))
          EXECUTION_TIME="$(( ELAPSED / 60 ))m$(( ELAPSED % 60 ))s"
          echo "EXECUTION_TIME=$EXECUTION_TIME" >> "$GITHUB_OUTPUT"
          echo "FILES_COUNT=$MAX_FILES" >> "$GITHUB_OUTPUT"
          # Report failure summary
          if [ -n "$CLEAN_FAILED" ] || [ -n "$OTHER_FAILED" ] || [ -n "$STREAMING_FAILED" ]; then
            echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> "$GITHUB_OUTPUT"
            echo "⚠️ Some benchmarks failed:"
            [ -n "$CLEAN_FAILED" ] && echo " - test-clean benchmark failed"
            [ -n "$OTHER_FAILED" ] && echo " - test-other benchmark failed"
            [ -n "$STREAMING_FAILED" ] && echo " - streaming benchmark failed"
            # Exit zero so the PR comment step still runs; without this, a false
            # `[ -n ... ]` test above would become the step's exit status
            true
          else
            echo "BENCHMARK_STATUS=SUCCESS" >> "$GITHUB_OUTPUT"
            echo "✅ All benchmarks completed successfully"
          fi
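      # Post a sticky PR comment keyed on a hidden HTML marker so reruns update it
      # in place; continue-on-error keeps a comment failure from failing the job.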
      - name: Comment PR
        if: github.event_name == 'pull_request'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const benchmarkStatus = '${{ steps.benchmark.outputs.BENCHMARK_STATUS }}';
            const statusEmoji = benchmarkStatus === 'SUCCESS' ? '✅' : '⚠️';
            const statusText = benchmarkStatus === 'SUCCESS' ? 'All benchmarks passed' : 'Some benchmarks failed (see logs)';
            const body = `## ASR Benchmark Results ${statusEmoji}

            **Status:** ${statusText}

            | Dataset | WER Avg | WER Med | RTFx | Status |
            |---------|---------|---------|------|--------|
            | test-clean | ${{ steps.benchmark.outputs.CLEAN_WER_AVG }}% | ${{ steps.benchmark.outputs.CLEAN_WER_MED }}% | ${{ steps.benchmark.outputs.CLEAN_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.CLEAN_WER_AVG }}') < 10 ? '✅' : '${{ steps.benchmark.outputs.CLEAN_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |
            | test-other | ${{ steps.benchmark.outputs.OTHER_WER_AVG }}% | ${{ steps.benchmark.outputs.OTHER_WER_MED }}% | ${{ steps.benchmark.outputs.OTHER_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.OTHER_WER_AVG }}') < 20 ? '✅' : '${{ steps.benchmark.outputs.OTHER_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |

            ### Streaming Infrastructure Test

            | Metric | Value | Description |
            |--------|-------|-------------|
            | WER | ${{ steps.benchmark.outputs.STREAMING_WER }}% | Word Error Rate in streaming mode |
            | RTFx | ${{ steps.benchmark.outputs.STREAMING_RTFx }}x | Streaming real-time factor |
            | Avg Chunk Time | ${{ steps.benchmark.outputs.STREAMING_AVG_CHUNK }}s | Average time to process each chunk |
            | Max Chunk Time | ${{ steps.benchmark.outputs.STREAMING_MAX_CHUNK }}s | Maximum chunk processing time |
            | First Token | ${{ steps.benchmark.outputs.STREAMING_FIRST_TOKEN }}s | Latency to first transcription token |
            | Total Chunks | ${{ steps.benchmark.outputs.STREAMING_CHUNKS }} | Number of chunks processed |

            <sub>*Streaming test uses 5 files with 0.5s chunks to simulate real-time audio streaming*</sub>
            <sub>${{ steps.benchmark.outputs.FILES_COUNT }} files per dataset • Test runtime: ${{ steps.benchmark.outputs.EXECUTION_TIME }} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} ET</sub>
            <sub>**RTFx** = Real-Time Factor (higher is better) • Calculated as: total audio duration ÷ total processing time<br>Processing time includes: model inference on the Apple Neural Engine, audio preprocessing, state resets between files, token-to-text conversion, and file I/O<br>Example: an RTFx of 2.0x means 10 seconds of audio is processed in 5 seconds (2x faster than real time)</sub>

            ### Expected RTFx Performance on Physical M1 Hardware

            **• M1 Mac: ~28x (clean), ~25x (other)**
            **• CI shows ~0.5-3x due to virtualization limitations**

            <sub>Testing methodology follows the [HuggingFace Open ASR Leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard)</sub>
            <!-- fluidaudio-benchmark-asr -->`;
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });
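            // Look for a previous benchmark comment via the hidden marker so reruns update rather than duplicate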
            const existing = comments.find(c =>
              c.body.includes('<!-- fluidaudio-benchmark-asr -->')
            );
            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: body
              });
            }
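      # Always upload the raw result JSON, even when benchmarks partially failed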
      - name: Upload Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: asr-results
          path: asr_results_*.json