# feat: writing evals skill #56

# Evaluates skills touched by a pull request, or a single named skill when the
# workflow is dispatched manually.
name: Evaluate Changed Skills

on:
  # Manual run; the optional input names one skill to evaluate.
  workflow_dispatch:
    inputs:
      skill:
        description: "Skill to evaluate (leave empty to detect from branch diff)"
        required: false
        type: string
  # Automatic run for pull requests that touch skills or the eval tooling.
  pull_request:
    paths:
      - "skills/**"
      - "eval-tooling/**"
jobs:
  # Determines which skills to evaluate and publishes them as job outputs:
  #   skills    - JSON array of skill names (consumed as a matrix downstream)
  #   has_evals - 'true' when at least one skill with an eval file was found
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      skills: ${{ steps.changed-skills.outputs.skills }}
      has_evals: ${{ steps.changed-skills.outputs.has_evals }}
    steps:
      - uses: actions/checkout@v4
        with:
          # The diff below compares against origin/<base>, so fetch all history.
          fetch-depth: 0
      - name: Detect changed skills with evals
        id: changed-skills
        env:
          # Context values are handed over as environment variables instead of
          # being spliced into the script text, so their contents are never
          # parsed as shell code.
          SKILL_INPUT: ${{ inputs.skill }}
          BASE_REF: ${{ github.base_ref }}
        run: |
          # If skill input is provided, use it directly
          if [ -n "$SKILL_INPUT" ]; then
            eval_file="skills/${SKILL_INPUT}/.meta/${SKILL_INPUT}.eval.ts"
            if [ -f "$eval_file" ]; then
              # jq performs the JSON quoting of the skill name
              echo "skills=$(jq -cn --arg skill "$SKILL_INPUT" '[$skill]')" >> "$GITHUB_OUTPUT"
              echo "has_evals=true" >> "$GITHUB_OUTPUT"
              echo "Running eval for manually specified skill: ${SKILL_INPUT}"
            else
              echo "No eval file found for skill: ${SKILL_INPUT}"
              echo "skills=[]" >> "$GITHUB_OUTPUT"
              echo "has_evals=false" >> "$GITHUB_OUTPUT"
            fi
            exit 0
          fi

          # Get list of changed files in the PR (base defaults to main when
          # the event carries no base ref)
          base_ref="${BASE_REF:-main}"
          changed_files=$(git diff --name-only "origin/${base_ref}...HEAD")

          # Extract unique skill names from changed paths (skills/<skill>/...)
          skills=$(echo "$changed_files" | grep '^skills/' | cut -d'/' -f2 | sort -u)

          # Filter to only skills that have eval files
          skills_with_evals=()
          for skill in $skills; do
            if [ -f "skills/${skill}/.meta/${skill}.eval.ts" ]; then
              skills_with_evals+=("$skill")
            fi
          done

          # Output as JSON array for matrix
          if [ "${#skills_with_evals[@]}" -gt 0 ]; then
            json_array=$(printf '%s\n' "${skills_with_evals[@]}" | jq -R . | jq -s -c .)
            echo "skills=$json_array" >> "$GITHUB_OUTPUT"
            echo "has_evals=true" >> "$GITHUB_OUTPUT"
            echo "Skills with evals to run: $(IFS=,; echo "${skills_with_evals[*]}")"
          else
            echo "skills=[]" >> "$GITHUB_OUTPUT"
            echo "has_evals=false" >> "$GITHUB_OUTPUT"
            echo "No skills with evals found in changed files"
          fi
# Job: one matrix leg per detected skill; skipped when nothing has an eval.
  run-evals:
    runs-on: ubuntu-latest
    needs: detect-changes
    if: needs.detect-changes.outputs.has_evals == 'true'
    strategy:
      # Let the remaining skills finish even if one leg fails.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.detect-changes.outputs.skills) }}
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        with:
          version: 9
      - uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: pnpm
          cache-dependency-path: eval-tooling/pnpm-lock.yaml
      # The eval tooling is its own package under eval-tooling/.
      - name: Install dependencies
        run: pnpm install
        working-directory: eval-tooling
# Step: look up this skill's baseline id in .eval-baselines.json, if any.
# Outputs: has_baseline ('true'/'false') and, when found, baseline_id.
      - name: Get baseline ID
        id: baseline
        env:
          # Skill names originate from directory names in the change set, so
          # pass the value as data rather than splicing it into the script.
          SKILL: ${{ matrix.skill }}
        run: |
          baseline_file=".eval-baselines.json"
          if [ -f "$baseline_file" ]; then
            # --arg binds the name as a jq string; it is never parsed as jq code
            baseline_id=$(jq -r --arg skill "$SKILL" '.baselines[$skill].baselineId // empty' "$baseline_file")
            if [ -n "$baseline_id" ]; then
              echo "baseline_id=$baseline_id" >> "$GITHUB_OUTPUT"
              echo "has_baseline=true" >> "$GITHUB_OUTPUT"
              echo "Found baseline for ${SKILL}: $baseline_id"
            else
              echo "has_baseline=false" >> "$GITHUB_OUTPUT"
              echo "No baseline found for ${SKILL}"
            fi
          else
            echo "has_baseline=false" >> "$GITHUB_OUTPUT"
            echo "No baseline file found"
          fi
# Step: run the skill's eval, capture its output, and extract the trace id,
# the score section (written to report-section.txt) and the result URL.
# Outputs: trace_id, eval_url (either may be empty when not found).
      - name: Run eval for ${{ matrix.skill }}
        id: run-eval
        working-directory: eval-tooling
        env:
          # Workflow context is passed as data; the branch name in particular
          # is chosen by the PR author and must not be spliced into the script.
          EVAL_SKILL: ${{ matrix.skill }}
          EVAL_BRANCH: ${{ github.head_ref || github.ref_name }}
          EVAL_COMMIT: ${{ github.sha }}
          EVAL_HAS_BASELINE: ${{ steps.baseline.outputs.has_baseline }}
          EVAL_BASELINE_ID: ${{ steps.baseline.outputs.baseline_id }}
          AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }}
          AXIOM_TOKEN: ${{ secrets.AXIOM_TOKEN }}
          AXIOM_DATASET: ${{ secrets.AXIOM_DATASET }}
          AXIOM_URL: ${{ secrets.AXIOM_URL }}
          AXIOM_PLAY_URL: ${{ secrets.AXIOM_PLAY_URL }}
          AXIOM_PLAY_TOKEN: ${{ secrets.AXIOM_PLAY_TOKEN }}
          AXIOM_PLAY_ORG_ID: ${{ secrets.AXIOM_PLAY_ORG_ID }}
        run: |
          echo "Running eval for skill: ${EVAL_SKILL}"

          # Build eval command with optional baseline and git metadata.
          # An array keeps every argument intact regardless of its contents.
          eval_cmd=(pnpm exec axiom eval "../skills/${EVAL_SKILL}/.meta/${EVAL_SKILL}.eval.ts")
          eval_cmd+=("--flag.git.branch=${EVAL_BRANCH}")
          eval_cmd+=("--flag.git.commit=${EVAL_COMMIT}")
          if [ "$EVAL_HAS_BASELINE" = "true" ]; then
            eval_cmd+=(--baseline "$EVAL_BASELINE_ID")
            echo "Comparing against baseline: ${EVAL_BASELINE_ID}"
          fi

          # Capture output while still displaying it.
          # NOTE(review): no pipefail is set here, so the pipeline's status is
          # tee's and a failing eval does not fail this step - confirm intended.
          "${eval_cmd[@]}" 2>&1 | tee eval-output-raw.txt

          # Strip ANSI escape codes
          sed 's/\x1b\[[0-9;]*m//g' eval-output-raw.txt > eval-output.txt

          # Extract trace ID from output for potential baseline update
          # URL format: https://app.axiom.co/.../evaluations/<name>/<traceId>?...
          trace_id=$(grep -oE '/evaluations/[^/]+/[A-Z0-9]+' eval-output.txt | head -1 | sed 's|.*/||' || true)
          if [ -z "$trace_id" ]; then
            # Fallback: try trace_id= pattern
            trace_id=$(grep -oE 'trace_id=[a-zA-Z0-9-]+' eval-output.txt | head -1 | cut -d= -f2 || true)
          fi
          echo "trace_id=$trace_id" >> "$GITHUB_OUTPUT"

          # Extract scores section (between header and "View eval result")
          sed -n '/FINAL EVALUATION REPORT/,/^View eval result:/p' eval-output.txt | \
            grep -v 'FINAL EVALUATION REPORT' | \
            grep -v '^View eval result:' | \
            sed '/^[[:space:]]*$/d' > report-section.txt

          # Extract the eval URL (the line following "View eval result:")
          eval_url=$(grep -A1 "View eval result:" eval-output.txt | tail -1 | tr -d ' ' || true)
          echo "eval_url=$eval_url" >> "$GITHUB_OUTPUT"
# Step: assemble the per-skill markdown report at eval-reports/<skill>.md from
# the run metadata and the score section the eval step wrote.
      - name: Save eval report
        env:
          # Values are passed as data: the originals were expanded inside
          # double-quoted echo strings, where a crafted branch name or URL
          # containing command substitution would have been executed.
          SKILL: ${{ matrix.skill }}
          COMMIT_SHA: ${{ github.sha }}
          BRANCH: ${{ github.head_ref || github.ref_name }}
          HAS_BASELINE: ${{ steps.baseline.outputs.has_baseline }}
          BASELINE_ID: ${{ steps.baseline.outputs.baseline_id }}
          EVAL_URL: ${{ steps.run-eval.outputs.eval_url }}
        run: |
          mkdir -p eval-reports
          report="eval-reports/${SKILL}.md"
          {
            echo "### ${SKILL}"
            echo ""
            echo "**commit:** \`${COMMIT_SHA}\`"
            echo "**branch:** \`${BRANCH}\`"
            if [ "$HAS_BASELINE" = "true" ]; then
              echo "**baseline:** \`${BASELINE_ID}\`"
            else
              echo "**baseline:** _none (first eval)_"
            fi
            echo ""
            echo '```'
            # Written by the eval step, which runs inside eval-tooling/
            cat eval-tooling/report-section.txt
            echo '```'
            if [ -n "$EVAL_URL" ]; then
              echo ""
              echo "[view eval result](${EVAL_URL})"
            fi
          } > "$report"
# Step: publish the per-skill report as an artifact named eval-report-<skill>,
# which the post-comment job downloads by pattern.
      - name: Upload eval report
        uses: actions/upload-artifact@v4
        with:
          retention-days: 7
          name: eval-report-${{ matrix.skill }}
          path: eval-reports/${{ matrix.skill }}.md
# Job: gather every per-skill report and publish them as one PR comment.
  post-comment:
    runs-on: ubuntu-latest
    needs: [detect-changes, run-evals]
    # always(): still run when some run-evals legs did not succeed.
    if: always() && needs.detect-changes.outputs.has_evals == 'true'
    permissions:
      pull-requests: write
    steps:
      - name: Download all eval reports
        uses: actions/download-artifact@v4
        with:
          # Collect every eval-report-* artifact into one flat directory.
          path: eval-reports
          pattern: eval-report-*
          merge-multiple: true
# Step: concatenate heading, every downloaded report (each followed by a
# horizontal rule) and the footer into comment.md, then echo it to the log.
      - name: Build combined comment
        id: build-comment
        run: |
          {
            # Heading
            printf '%s\n' '## Skill Evaluation Results'
            # One section per report; an unmatched glob stays literal and is
            # skipped by the -f test
            for report_file in eval-reports/*.md; do
              [ -f "$report_file" ] || continue
              cat "$report_file"
              printf '\n---\n\n'
            done
            # Footer
            printf '%s\n' '<sub>generated by skill evaluation workflow</sub>'
          } > comment.md
          cat comment.md
# Step: create the results comment on the PR, or update the existing one.
# The comment is recognised by a hidden HTML marker prepended to its body.
      - name: Post or update PR comment
        uses: actions/github-script@v7
        with:
          script: |
            // Only post comment on PRs
            const pullRequest = context.payload.pull_request;
            if (!pullRequest) {
              console.log('Not a PR, skipping comment');
              return;
            }

            const fs = require('fs');
            const marker = '<!-- skill-eval-results -->';
            const fullBody = marker + '\n' + fs.readFileSync('comment.md', 'utf8');
            const repoRef = {
              owner: context.repo.owner,
              repo: context.repo.repo,
            };

            // Walk every page of comments: a single listComments call returns
            // only the first page, so on a busy PR the marker comment would be
            // missed and a duplicate posted on each run.
            const comments = await github.paginate(github.rest.issues.listComments, {
              ...repoRef,
              issue_number: pullRequest.number,
              per_page: 100,
            });

            // Guard against comments whose body is missing
            const existingComment = comments.find(c => (c.body || '').includes(marker));

            if (existingComment) {
              await github.rest.issues.updateComment({
                ...repoRef,
                comment_id: existingComment.id,
                body: fullBody,
              });
              console.log('Updated existing comment');
            } else {
              await github.rest.issues.createComment({
                ...repoRef,
                issue_number: pullRequest.number,
                body: fullBody,
              });
              console.log('Created new comment');
            }