# Evals — GitHub Actions run #128
# Workflow file for this run

# Workflow identity and triggers: a nightly schedule plus a manual dispatch
# form whose inputs select the suite, models, timeout and image variant.
name: Evals

on:
  # Scheduled runs carry no inputs; the job's env block supplies fallbacks.
  schedule:
    - cron: '0 2 * * *'  # Run daily at 2 AM UTC
  # Manual runs; every input is optional.
  workflow_dispatch:
    inputs:
      suite:
        description: 'Eval suite to run (leave empty for basic suite)'
        type: string
        required: false
      models:
        description: 'Comma-separated list of models to evaluate'
        type: string
        required: false
      timeout:
        description: 'Timeout for each eval in seconds'
        type: number
        required: false
      image-suffix:
        type: choice
        description: 'Suffix to use for docker image'
        # The empty option selects the image name without a suffix.
        options:
          - ''
          - '-full'
jobs:
  # Pulls the gptme-eval image from GHCR, runs the selected eval suite inside
  # it, and commits the resulting eval_results directory to the eval-results
  # branch.
  run-evals:
    runs-on: ubuntu-latest
    # The job pushes to the eval-results branch (contents: write) and pulls
    # an image from ghcr.io with GITHUB_TOKEN (packages: read).
    permissions:
      contents: write
      packages: read
    env:
      OWNER: ${{ github.repository_owner }}
      # Dispatch inputs are empty on scheduled runs, hence the fallbacks.
      SUITE: ${{ github.event.inputs.suite || '' }}
      MODELS: ${{ github.event.inputs.models || '' }}
      TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
      IMAGE_SUFFIX: ${{ github.event.inputs.image-suffix || '' }}
    steps:
      # Results are committed to this branch, so check it out directly.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: eval-results

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Export a lowercased copy of the owner name for use in the image path.
      - name: Set environment variables
        run: |
          echo "OWNER_LC=${OWNER,,}" >> "$GITHUB_ENV"

      - name: Pull Docker image
        run: |
          docker pull "ghcr.io/${OWNER_LC}/gptme-eval${IMAGE_SUFFIX}:latest"

      - name: Write gptme config.toml
        shell: bash
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          CONFIG_DIR="$HOME/.config/gptme"
          mkdir -p "$CONFIG_DIR"
          # Unquoted heredoc delimiter so the *_API_KEY variables expand.
          cat > "$CONFIG_DIR/config.toml" << EOF
          [prompt]
          about_user = "I am a curious human programmer."
          response_preference = "Don't explain basic concepts"
          [env]
          OPENAI_API_KEY = "$OPENAI_API_KEY"
          ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
          OPENROUTER_API_KEY = "$OPENROUTER_API_KEY"
          EOF
          # Print the config for debugging with the key values redacted, so
          # secrets never reach the job log.
          sed -E 's/(_API_KEY = ").*"/\1***"/' "$CONFIG_DIR/config.toml"
          # The directory is bind-mounted into the container in the next step.
          # NOTE(review): presumably the container user needs access as
          # "other"; confirm whether write access is actually required.
          chmod -R o=rwx "$CONFIG_DIR"

      - name: Run evals
        run: |
          # Expand "a,b" into repeated "-m a -m b" arguments. An array keeps
          # each model name a single argument without relying on
          # word-splitting.
          MODEL_ARGS=()
          if [ -n "$MODELS" ]; then
            IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
            for MODEL in "${MODEL_ARRAY[@]}"; do
              MODEL="${MODEL//[[:space:]]/}"  # tolerate "a, b"
              if [ -n "$MODEL" ]; then
                MODEL_ARGS+=(-m "$MODEL")
              fi
            done
          fi
          echo "Running evals for suite: ${SUITE:-basic}"
          echo "Models: ${MODELS:-default}"
          echo "Timeout: ${TIMEOUT} seconds"
          echo "Image suffix: ${IMAGE_SUFFIX}"
          RESULTS_DIR="$(pwd)/eval_results"
          echo "Results dir: $RESULTS_DIR"
          mkdir -p "$RESULTS_DIR"
          # make sure docker can read/write the dir
          chmod -R o=rwx "$RESULTS_DIR"
          # The suite argument is passed only when SUITE is non-empty.
          docker run \
            -v "$HOME/.config/gptme:/home/appuser/.config/gptme" \
            -v "$RESULTS_DIR:/app/eval_results" \
            "ghcr.io/${OWNER_LC}/gptme-eval${IMAGE_SUFFIX}:latest" \
            ${SUITE:+"$SUITE"} \
            --timeout "$TIMEOUT" \
            "${MODEL_ARGS[@]}"

      - name: Configure Git
        run: |
          # NOTE(review): the committer address was unreadable in the source
          # this file was recovered from; confirm the intended value.
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"

      - name: Update eval results
        run: |
          # Fast-forward to the latest branch tip before adding new results.
          git pull origin eval-results
          # -f: add the results even if the path is covered by an ignore rule.
          git add -f eval_results
          # A run that produced no new files must not fail the job.
          git commit \
            -m "chore: add eval results for run ${{ github.run_number }} [skip ci]" \
            || echo "No changes to commit"
          git push origin eval-results