Fix Confidence and token timestamps for ASR #474
Workflow file for this run
name: ASR Benchmark

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

jobs:
  asr-benchmark:
    name: ASR Benchmark
    runs-on: macos-14
    permissions:
      contents: read
      pull-requests: write
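    # pull-requests: write is needed so the github-script step below can
    # post/update the benchmark results comment on the PR.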
    steps:
      - uses: actions/checkout@v5
      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"
      - name: Install ffmpeg
        run: |
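          # brew exits non-zero when the formula is already present via some
          # other path, so failures here are downgraded to log lines.
          # (Assumption: the benchmark CLI relies on ffmpeg for audio
          # decoding/conversion; the workflow itself does not say why.)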
          brew install ffmpeg || echo "ffmpeg may already be installed"
          ffmpeg -version || echo "ffmpeg not available"
      - name: Cache Dependencies
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Application Support/FluidAudio/Models/parakeet-tdt-0.6b-v3-coreml
            ~/Library/Application Support/FluidAudio/Datasets/LibriSpeech
          key: ${{ runner.os }}-asr-${{ hashFiles('Package.resolved') }}-v6
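          # The trailing -v6 is a manual cache-buster: bump it to invalidate
          # cached models/datasets. A restore-keys fallback would allow partial
          # reuse when Package.resolved changes (hypothetical addition, not
          # part of this workflow):
          #   restore-keys: |
          #     ${{ runner.os }}-asr-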
      - name: Build
        run: swift build -c release
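      # swift run -c release in the next step should reuse these build
      # products, so the benchmarks start without a second full compile.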
      - name: Run Benchmarks
        id: benchmark
        run: |
          MAX_FILES="25"
          BENCHMARK_START=$(date +%s)
          # Fail a pipeline if any stage fails, so `tee` below cannot mask
          # a non-zero exit code from `swift run`
          set -o pipefail
          # Function to run benchmark with error capture
          run_benchmark() {
            local SUBSET=$1
            local MAX=$2
            local OUTPUT=$3
            local EXTRA_ARGS="${4:-}"
            echo "========================================="
            echo "Running ASR benchmark: $SUBSET (max $MAX files)"
            echo "Output: $OUTPUT"
            echo "Extra args: $EXTRA_ARGS"
            echo "========================================="
            if swift run -c release fluidaudio asr-benchmark \
                --subset "$SUBSET" --max-files "$MAX" \
                --auto-download --output "$OUTPUT" $EXTRA_ARGS 2>&1 | tee benchmark_log.txt; then
              echo "✅ Benchmark $SUBSET completed successfully"
              return 0
            else
              echo "❌ Benchmark $SUBSET FAILED with exit code $?"
              echo "Last 50 lines of output:"
              tail -50 benchmark_log.txt
              # Continue with other benchmarks even if one fails
              return 1
            fi
          }
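          # Note: each invocation overwrites benchmark_log.txt, so the failure
          # tail above always reflects the most recent run only.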
          # Run benchmarks with error capture
          run_benchmark "test-clean" "$MAX_FILES" "asr_results_clean.json" || CLEAN_FAILED=1
          run_benchmark "test-other" "$MAX_FILES" "asr_results_other.json" || OTHER_FAILED=1
          # Run streaming benchmark (smaller file count for faster CI)
          run_benchmark "test-clean" "5" "asr_results_streaming.json" "--test-streaming --chunk-duration 0.5" || STREAMING_FAILED=1
          # Extract metrics with error handling
          if [ -f asr_results_clean.json ]; then
            CLEAN_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_clean.json 2>/dev/null)
            CLEAN_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_clean.json 2>/dev/null)
            CLEAN_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_clean.json 2>/dev/null)
            CLEAN_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_clean.json 2>/dev/null)
            CLEAN_RTFx=$(jq -r '.summary.medianRTFx' asr_results_clean.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$CLEAN_WER_AVG" != "null" ] && [ -n "$CLEAN_WER_AVG" ] && CLEAN_WER_AVG=$(printf "%.2f" "$CLEAN_WER_AVG") || CLEAN_WER_AVG="N/A"
            [ "$CLEAN_WER_MED" != "null" ] && [ -n "$CLEAN_WER_MED" ] && CLEAN_WER_MED=$(printf "%.2f" "$CLEAN_WER_MED") || CLEAN_WER_MED="N/A"
            [ "$CLEAN_RTFx" != "null" ] && [ -n "$CLEAN_RTFx" ] && CLEAN_RTFx=$(printf "%.2f" "$CLEAN_RTFx") || CLEAN_RTFx="N/A"
          fi
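          # jq's '* 100' converts the fractional WER in the JSON to a percent;
          # the cond && format || fallback chains above (and in the two blocks
          # below) degrade to "N/A" whenever a field is missing or null.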
          if [ -f asr_results_other.json ]; then
            OTHER_WER_AVG=$(jq -r '.summary.averageWER * 100' asr_results_other.json 2>/dev/null)
            OTHER_WER_MED=$(jq -r '.summary.medianWER * 100' asr_results_other.json 2>/dev/null)
            OTHER_AUDIO=$(jq -r '.summary.totalAudioDuration' asr_results_other.json 2>/dev/null)
            OTHER_TIME=$(jq -r '.summary.totalProcessingTime' asr_results_other.json 2>/dev/null)
            OTHER_RTFx=$(jq -r '.summary.medianRTFx' asr_results_other.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$OTHER_WER_AVG" != "null" ] && [ -n "$OTHER_WER_AVG" ] && OTHER_WER_AVG=$(printf "%.2f" "$OTHER_WER_AVG") || OTHER_WER_AVG="N/A"
            [ "$OTHER_WER_MED" != "null" ] && [ -n "$OTHER_WER_MED" ] && OTHER_WER_MED=$(printf "%.2f" "$OTHER_WER_MED") || OTHER_WER_MED="N/A"
            [ "$OTHER_RTFx" != "null" ] && [ -n "$OTHER_RTFx" ] && OTHER_RTFx=$(printf "%.2f" "$OTHER_RTFx") || OTHER_RTFx="N/A"
          fi
          if [ -f asr_results_streaming.json ]; then
            STREAMING_WER=$(jq -r '.summary.averageWER * 100' asr_results_streaming.json 2>/dev/null)
            STREAMING_RTFx=$(jq -r '.summary.medianRTFx' asr_results_streaming.json 2>/dev/null)
            STREAMING_AVG_CHUNK=$(jq -r '.summary.streaming.avgChunkProcessingTime' asr_results_streaming.json 2>/dev/null)
            STREAMING_MAX_CHUNK=$(jq -r '.summary.streaming.maxChunkProcessingTime' asr_results_streaming.json 2>/dev/null)
            STREAMING_CHUNKS=$(jq -r '.summary.streaming.totalChunksProcessed' asr_results_streaming.json 2>/dev/null)
            STREAMING_FIRST_TOKEN=$(jq -r '.summary.streaming.avgFirstTokenLatency // "N/A"' asr_results_streaming.json 2>/dev/null)
            # Format values only if they exist and are not null
            [ "$STREAMING_WER" != "null" ] && [ -n "$STREAMING_WER" ] && STREAMING_WER=$(printf "%.2f" "$STREAMING_WER") || STREAMING_WER="N/A"
            [ "$STREAMING_RTFx" != "null" ] && [ -n "$STREAMING_RTFx" ] && STREAMING_RTFx=$(printf "%.2f" "$STREAMING_RTFx") || STREAMING_RTFx="N/A"
            [ "$STREAMING_AVG_CHUNK" != "null" ] && [ -n "$STREAMING_AVG_CHUNK" ] && STREAMING_AVG_CHUNK=$(printf "%.3f" "$STREAMING_AVG_CHUNK") || STREAMING_AVG_CHUNK="N/A"
            [ "$STREAMING_MAX_CHUNK" != "null" ] && [ -n "$STREAMING_MAX_CHUNK" ] && STREAMING_MAX_CHUNK=$(printf "%.3f" "$STREAMING_MAX_CHUNK") || STREAMING_MAX_CHUNK="N/A"
            [ "$STREAMING_FIRST_TOKEN" != "null" ] && [ -n "$STREAMING_FIRST_TOKEN" ] && [ "$STREAMING_FIRST_TOKEN" != "N/A" ] && STREAMING_FIRST_TOKEN=$(printf "%.3f" "$STREAMING_FIRST_TOKEN")
          fi
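          # jq's // operator substitutes "N/A" when avgFirstTokenLatency is
          # null or absent (it is only reported in streaming mode), which is
          # why that variable needs the extra != "N/A" guard above.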
          # Publish metrics for the PR-comment step
          echo "CLEAN_WER_AVG=${CLEAN_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_WER_MED=${CLEAN_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "CLEAN_RTFx=${CLEAN_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_AVG=${OTHER_WER_AVG:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_WER_MED=${OTHER_WER_MED:-N/A}" >> "$GITHUB_OUTPUT"
          echo "OTHER_RTFx=${OTHER_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          # Streaming metrics
          echo "STREAMING_WER=${STREAMING_WER:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_RTFx=${STREAMING_RTFx:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_AVG_CHUNK=${STREAMING_AVG_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_MAX_CHUNK=${STREAMING_MAX_CHUNK:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_CHUNKS=${STREAMING_CHUNKS:-N/A}" >> "$GITHUB_OUTPUT"
          echo "STREAMING_FIRST_TOKEN=${STREAMING_FIRST_TOKEN:-N/A}" >> "$GITHUB_OUTPUT"
          # Sample the clock once so minutes and seconds stay consistent
          ELAPSED=$(( $(date +%s) - BENCHMARK_START ))
          EXECUTION_TIME="$(( ELAPSED / 60 ))m$(( ELAPSED % 60 ))s"
          echo "EXECUTION_TIME=$EXECUTION_TIME" >> "$GITHUB_OUTPUT"
          echo "FILES_COUNT=$MAX_FILES" >> "$GITHUB_OUTPUT"
          # Report failures summary
          if [ -n "$CLEAN_FAILED" ] || [ -n "$OTHER_FAILED" ] || [ -n "$STREAMING_FAILED" ]; then
            echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> "$GITHUB_OUTPUT"
            echo "⚠️ Some benchmarks failed:"
            [ -n "$CLEAN_FAILED" ] && echo "  - test-clean benchmark failed"
            [ -n "$OTHER_FAILED" ] && echo "  - test-other benchmark failed"
            [ -n "$STREAMING_FAILED" ] && echo "  - streaming benchmark failed"
            # Don't exit with an error, so the PR comment still gets posted
          else
            echo "BENCHMARK_STATUS=SUCCESS" >> "$GITHUB_OUTPUT"
            echo "✅ All benchmarks completed successfully"
          fi
      - name: Comment PR
        if: github.event_name == 'pull_request'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const benchmarkStatus = '${{ steps.benchmark.outputs.BENCHMARK_STATUS }}';
            const statusEmoji = benchmarkStatus === 'SUCCESS' ? '✅' : '⚠️';
            const statusText = benchmarkStatus === 'SUCCESS' ? 'All benchmarks passed' : 'Some benchmarks failed (see logs)';
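            // The steps.benchmark.outputs.* expression placeholders below are
            // substituted by the Actions runner before this script executes,
            // so the JavaScript only ever sees string literals. parseFloat('N/A')
            // yields NaN, which fails the WER threshold checks and falls
            // through to the N/A / warning branches.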
            const body = `## ASR Benchmark Results ${statusEmoji}

            **Status:** ${statusText}

            | Dataset | WER Avg | WER Med | RTFx | Status |
            |---------|---------|---------|------|--------|
            | test-clean | ${{ steps.benchmark.outputs.CLEAN_WER_AVG }}% | ${{ steps.benchmark.outputs.CLEAN_WER_MED }}% | ${{ steps.benchmark.outputs.CLEAN_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.CLEAN_WER_AVG }}') < 10 ? '✅' : '${{ steps.benchmark.outputs.CLEAN_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |
            | test-other | ${{ steps.benchmark.outputs.OTHER_WER_AVG }}% | ${{ steps.benchmark.outputs.OTHER_WER_MED }}% | ${{ steps.benchmark.outputs.OTHER_RTFx }}x | ${parseFloat('${{ steps.benchmark.outputs.OTHER_WER_AVG }}') < 20 ? '✅' : '${{ steps.benchmark.outputs.OTHER_WER_AVG }}' === 'N/A' ? '❌' : '⚠️'} |

            ### Streaming Infrastructure Test

            | Metric | Value | Description |
            |--------|-------|-------------|
            | WER | ${{ steps.benchmark.outputs.STREAMING_WER }}% | Word Error Rate in streaming mode |
            | RTFx | ${{ steps.benchmark.outputs.STREAMING_RTFx }}x | Streaming real-time factor |
            | Avg Chunk Time | ${{ steps.benchmark.outputs.STREAMING_AVG_CHUNK }}s | Average time to process each chunk |
            | Max Chunk Time | ${{ steps.benchmark.outputs.STREAMING_MAX_CHUNK }}s | Maximum chunk processing time |
            | First Token | ${{ steps.benchmark.outputs.STREAMING_FIRST_TOKEN }}s | Latency to first transcription token |
            | Total Chunks | ${{ steps.benchmark.outputs.STREAMING_CHUNKS }} | Number of chunks processed |

            <sub>*Streaming test uses 5 files with 0.5s chunks to simulate real-time audio streaming*</sub>

            <sub>${{ steps.benchmark.outputs.FILES_COUNT }} files per dataset • Test runtime: ${{ steps.benchmark.outputs.EXECUTION_TIME }} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} ET</sub>

            <sub>**RTFx** = Real-Time Factor (higher is better) • Calculated as: Total audio duration ÷ Total processing time<br>Processing time includes: Model inference on Apple Neural Engine, audio preprocessing, state resets between files, token-to-text conversion, and file I/O<br>Example: RTFx of 2.0x means 10 seconds of audio processed in 5 seconds (2x faster than real-time)</sub>

            ### Expected RTFx Performance on Physical M1 Hardware

            - **M1 Mac: ~28x (clean), ~25x (other)**
            - **CI shows ~0.5-3x due to virtualization limitations**

            <sub>Testing methodology follows the [HuggingFace Open ASR Leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard)</sub>

            <!-- fluidaudio-benchmark-asr -->`;
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });
            const existing = comments.find(c =>
              c.body.includes('<!-- fluidaudio-benchmark-asr -->')
            );
            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: body
              });
            }
      - name: Upload Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: asr-results
          path: asr_results_*.json