Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ prodigal
*.faa
*.gff
*.csv
*.out
/baseline
55 changes: 55 additions & 0 deletions scripts/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail

# Benchmark script for FragGeneScanRs using hyperfine
# Runs benchmarks on all example files with at least 10 iterations

# Resolve every path from the location of this script so it can be started
# from any working directory.
# CDPATH is cleared for the `cd` because a non-empty CDPATH makes `cd` print
# the directory it resolved, which would be captured in front of the `pwd`
# output; `--` keeps a directory name starting with '-' from being read as an
# option.
SCRIPT_DIR="$(CDPATH= cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The script lives one level below the project root.
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# Release binary produced by `cargo build --release`.
BINARY="$PROJECT_ROOT/target/release/FragGeneScanRs"

# Check for hyperfine
# Stop before building anything when the benchmarking tool is missing.
# `command -v` only looks the name up (its output is discarded); the branch
# below runs when that lookup fails.
if ! command -v hyperfine &> /dev/null; then
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just use ! hyperfine -V, no need for command.

# Print an install hint and exit with status 1.
echo "Error: hyperfine is not installed."
echo "Install with: brew install hyperfine or similar"
exit 1
fi

# Compile the optimized binary that the benchmarks below execute.
printf '%s\n' "Building release binary..."
cargo build \
  --release \
  --manifest-path "${PROJECT_ROOT}/Cargo.toml"

# Scratch directory that receives the benchmark output files; the EXIT trap
# removes it again however the script terminates.
TEMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TEMP_DIR}"' EXIT

echo ""
echo "Running benchmarks (minimum 10 runs, 3 warmup runs each)..."
echo "============================================================"

#######################################
# Benchmark the release binary on one example file with hyperfine.
# Globals:   BINARY, PROJECT_ROOT, TEMP_DIR (read)
# Arguments: $1 - label printed before the benchmark
#            $2 - value passed to hyperfine --min-runs
#            $3 - input file name inside $PROJECT_ROOT/example
#            $4 - value for the binary's -t option
#            $5 - value for the binary's -w option
#            $6 - output name; results are written to $TEMP_DIR/<name>
# Outputs:   the label, followed by whatever hyperfine prints
# Returns:   exit status of hyperfine
#######################################
run_benchmark() {
  local title="$1"
  local min_runs="$2"
  local input="$3"
  local train="$4"
  local whole="$5"
  local out_name="$6"
  local cmd

  # hyperfine receives the command as ONE string and runs it through a shell,
  # so each word is escaped with %q. Otherwise a checkout or temp path
  # containing spaces would be split into several arguments. For paths
  # without special characters the string is unchanged.
  printf -v cmd '%q -s %q -t %q -w %q -o %q' \
    "$BINARY" \
    "$PROJECT_ROOT/example/$input" \
    "$train" \
    "$whole" \
    "$TEMP_DIR/$out_name"

  echo ""
  echo "Benchmark: $title"
  hyperfine \
    --warmup 3 \
    --min-runs "$min_runs" \
    "$cmd"
}

# Benchmark 1: Short reads (NC_000913-454.fna with 454_10 training)
run_benchmark "Short reads (NC_000913-454.fna)" 20 \
  NC_000913-454.fna 454_10 0 NC_000913-454

# Benchmark 2: Complete genome (NC_000913.fna with complete training)
run_benchmark "Complete genome (NC_000913.fna)" 20 \
  NC_000913.fna complete 1 NC_000913

# Benchmark 3: Long reads (contigs.fna with complete training)
run_benchmark "Long reads (contigs.fna)" 10 \
  contigs.fna complete 1 contigs

echo ""
echo "Benchmarks complete!"
131 changes: 131 additions & 0 deletions scripts/validate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use env.

set -euo pipefail

# Validation script for FragGeneScanRs
# Usage:
#   ./scripts/validate.sh --baseline   Generate baseline output files
#   ./scripts/validate.sh --check      Compare current output against baseline (default)

# Resolve every path from the location of this script so it can be started
# from any working directory.
# CDPATH is cleared for the `cd` because a non-empty CDPATH makes `cd` print
# the directory it resolved, which would be captured in front of the `pwd`
# output; `--` keeps a directory name starting with '-' from being read as an
# option.
SCRIPT_DIR="$(CDPATH= cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
BASELINE_DIR="$PROJECT_ROOT/baseline"
BINARY="$PROJECT_ROOT/target/release/FragGeneScanRs"

# Example files and their configurations
# Format: "input_file:training_file:whole_genome_flag:output_name"
# (input_file is relative to $PROJECT_ROOT; fields are split on ':')
EXAMPLES=(
  "example/NC_000913-454.fna:454_10:0:NC_000913-454"
  "example/NC_000913.fna:complete:1:NC_000913"
  "example/contigs.fna:complete:1:contigs"
)

# Print the accepted command-line flags on stdout and terminate the script
# with exit status 1.
usage() {
  printf 'Usage: %s [--baseline|--check]\n' "$0"
  printf '%s\n' \
    " --baseline Generate baseline output files" \
    " --check Compare current output against baseline (default)"
  exit 1
}

# Build the release binary with cargo, using the Cargo.toml at the project
# root. Reads PROJECT_ROOT; the return status is cargo's exit status.
build_release() {
  local manifest="${PROJECT_ROOT}/Cargo.toml"

  printf '%s\n' "Building release binary..."
  cargo build --release --manifest-path "$manifest"
}

# Run $BINARY once on a single example.
# Arguments: $1 - input file, relative to $PROJECT_ROOT
#            $2 - value for the -t option
#            $3 - value for the -w option
#            $4 - output prefix passed to -o
# Returns:   exit status of the binary
run_example() {
  local src="$PROJECT_ROOT/$1"
  local model="$2"
  local genome_flag="$3"
  local prefix="$4"

  # Collect the options in an array so each value stays a single argument.
  local -a cli=(-s "$src" -t "$model" -w "$genome_flag" -o "$prefix")
  "$BINARY" "${cli[@]}"
}

# Produce the reference outputs: run every entry of EXAMPLES and write its
# output files under $BASELINE_DIR, which is created if it does not exist.
# Reads EXAMPLES and BASELINE_DIR; calls run_example once per entry.
# Note: the loop variables (example, input, train, whole, name) are not
# declared local, so they remain set in the calling scope afterwards.
generate_baseline() {
echo "Generating baseline outputs..."
mkdir -p "$BASELINE_DIR"

for example in "${EXAMPLES[@]}"; do
# Split the "input:training:whole_genome_flag:output_name" entry on ':'.
IFS=':' read -r input train whole name <<< "$example"
echo " Processing $name..."
# The output prefix points into the baseline directory.
run_example "$input" "$train" "$whole" "$BASELINE_DIR/$name"
Comment on lines +52 to +54
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than putting the example in a string array, splitting and naming them, then naming them again in the run method, I'd rather write three methods

NC454() { "$BINARY" -s example/NC_000913-454.fna -t 454_10 -w 0 -o NC_000913-454; }
...
examples=(NC454 ...)

And loop through the methods to call them directly.

done

echo "Baseline generated in $BASELINE_DIR"
}

# Re-run every entry of EXAMPLES into a fresh temporary directory and compare
# the .out, .faa and .ffn outputs with the files stored in $BASELINE_DIR.
# Always terminates the script: exit 0 when no compared file differs, exit 1
# when at least one differs or when $BASELINE_DIR does not exist.
# Reads EXAMPLES and BASELINE_DIR; calls run_example once per entry.
check_against_baseline() {
# Without a baseline directory there is nothing to compare against.
if [[ ! -d "$BASELINE_DIR" ]]; then
echo "Error: Baseline directory not found. Run with --baseline first."
exit 1
fi

# Scratch directory for the current outputs; the EXIT trap removes it.
local temp_dir
temp_dir=$(mktemp -d)
trap 'rm -rf "$temp_dir"' EXIT

echo "Running current version and comparing against baseline..."
# Set to 1 as soon as any file differs; decides the exit status at the end.
local failed=0

for example in "${EXAMPLES[@]}"; do
# Split the "input:training:whole_genome_flag:output_name" entry on ':'.
IFS=':' read -r input train whole name <<< "$example"
echo " Processing $name..."
run_example "$input" "$train" "$whole" "$temp_dir/$name"

# Compare each of the three output files of this example.
for ext in out faa ffn; do
local baseline_file="$BASELINE_DIR/$name.$ext"
local current_file="$temp_dir/$name.$ext"

# NOTE: a missing baseline file only prints a warning and is skipped;
# `failed` is left untouched, so it does not make the check exit non-zero.
if [[ ! -f "$baseline_file" ]]; then
echo " Warning: Baseline file $baseline_file not found"
continue
Comment on lines +83 to +84
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather be defensive and have this fail if there is no baseline to be found.

fi

# diff's own output is discarded; any non-zero status from diff is reported
# as a difference.
if diff -q "$baseline_file" "$current_file" > /dev/null 2>&1; then
echo " ✓ $name.$ext matches"
else
echo " ✗ $name.$ext DIFFERS"
failed=1
fi
done
done

# Summarise and terminate the script with the overall result.
if [[ $failed -eq 0 ]]; then
echo "All outputs match baseline!"
exit 0
else
echo "Some outputs differ from baseline!"
exit 1
fi
}

# Parse arguments
# Only the first argument is inspected. "${1---check}" expands to "--check"
# when no argument was given at all, so the default mode goes through the same
# case arm as an explicit --check; an argument that is present but
# unrecognised (including an empty string) falls through to usage.
MODE="check"
case "${1---check}" in
  --baseline) MODE="baseline" ;;
  --check) MODE="check" ;;
  *) usage ;;
esac
Comment on lines +105 to +119
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels weird to combine the check and validate in here to deduplicate the 3 example calls, and not do the same for the benchmark. I'd merge all three.


# Main
# Always rebuild first so the selected mode runs against the current sources.
build_release

# Dispatch on the mode chosen during argument parsing.
if [[ "$MODE" == "baseline" ]]; then
  generate_baseline
elif [[ "$MODE" == "check" ]]; then
  check_against_baseline
fi