feat: writing evals skill #56

Workflow file for this run

.github/workflows/eval-changed-skills.yml at 772a08b

	name: Evaluate Changed Skills

	# Triggers:
	#   - pull requests that touch a skill or the eval tooling
	#   - manual dispatch, optionally pinned to a single skill
	on:
	  pull_request:
	    paths:
	      - "skills/**"
	      - "eval-tooling/**"
	  workflow_dispatch:
	    inputs:
	      skill:
	        description: 'Skill to evaluate (leave empty to detect from branch diff)'
	        required: false
	        type: string

	jobs:
	# Works out which skills need evaluating and exposes them as a JSON array
	# (`skills`) plus a 'true'/'false' flag (`has_evals`) for the jobs below.
	detect-changes:
	  runs-on: ubuntu-latest
	  outputs:
	    skills: ${{ steps.changed-skills.outputs.skills }}
	    has_evals: ${{ steps.changed-skills.outputs.has_evals }}
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        # Full history is needed so the origin/<base>...HEAD diff can find a merge base.
	        fetch-depth: 0

	    - name: Detect changed skills with evals
	      id: changed-skills
	      env:
	        # Passed through the environment instead of being spliced into the
	        # script text, so their contents can never be parsed as shell code.
	        INPUT_SKILL: ${{ inputs.skill }}
	        BASE_REF: ${{ github.base_ref }}
	      run: |
	        # If skill input is provided, use it directly
	        if [ -n "$INPUT_SKILL" ]; then
	          skill="$INPUT_SKILL"
	          eval_file="skills/${skill}/.meta/${skill}.eval.ts"
	          if [ -f "$eval_file" ]; then
	            # jq takes care of JSON-escaping the skill name.
	            echo "skills=$(jq -c -n --arg s "$skill" '[$s]')" >> "$GITHUB_OUTPUT"
	            echo "has_evals=true" >> "$GITHUB_OUTPUT"
	            echo "Running eval for manually specified skill: ${skill}"
	          else
	            echo "No eval file found for skill: ${skill}"
	            echo "skills=[]" >> "$GITHUB_OUTPUT"
	            echo "has_evals=false" >> "$GITHUB_OUTPUT"
	          fi
	          exit 0
	        fi

	        # Get list of changed files in the PR.
	        # base_ref is empty outside pull_request events; fall back to main.
	        base_ref="${BASE_REF:-main}"
	        changed_files=$(git diff --name-only "origin/${base_ref}...HEAD")

	        # Extract unique skill names from changed paths (skills/<skill>/...).
	        # `|| true` keeps an empty match from failing the step if pipefail is ever enabled.
	        skills=$(echo "$changed_files" | grep '^skills/' | cut -d'/' -f2 | sort -u || true)

	        # Filter to only skills that have eval files
	        skills_with_evals=""
	        for skill in $skills; do
	          if [ -f "skills/${skill}/.meta/${skill}.eval.ts" ]; then
	            # Append with a comma separator only when the list is non-empty.
	            skills_with_evals="${skills_with_evals:+${skills_with_evals},}${skill}"
	          fi
	        done

	        # Output as JSON array for matrix
	        if [ -n "$skills_with_evals" ]; then
	          json_array=$(echo "$skills_with_evals" | tr ',' '\n' | jq -R . | jq -s -c .)
	          echo "skills=$json_array" >> "$GITHUB_OUTPUT"
	          echo "has_evals=true" >> "$GITHUB_OUTPUT"
	          echo "Skills with evals to run: $skills_with_evals"
	        else
	          echo "skills=[]" >> "$GITHUB_OUTPUT"
	          echo "has_evals=false" >> "$GITHUB_OUTPUT"
	          echo "No skills with evals found in changed files"
	        fi

	# Runs one eval per detected skill (matrix), writes a markdown report for
	# each, and uploads it as an artifact for the comment job to collect.
	run-evals:
	  needs: detect-changes
	  if: needs.detect-changes.outputs.has_evals == 'true'
	  runs-on: ubuntu-latest
	  strategy:
	    # Let the remaining skills finish even if one eval fails.
	    fail-fast: false
	    matrix:
	      skill: ${{ fromJson(needs.detect-changes.outputs.skills) }}
	  steps:
	    - uses: actions/checkout@v4

	    - uses: pnpm/action-setup@v4
	      with:
	        version: 9

	    - uses: actions/setup-node@v4
	      with:
	        node-version: '20'
	        cache: 'pnpm'
	        cache-dependency-path: eval-tooling/pnpm-lock.yaml

	    - name: Install dependencies
	      working-directory: eval-tooling
	      run: pnpm install

	    - name: Get baseline ID
	      id: baseline
	      env:
	        # Skill names come from paths in the PR; keep them out of the script text.
	        EVAL_SKILL: ${{ matrix.skill }}
	      run: |
	        baseline_file=".eval-baselines.json"
	        if [ -f "$baseline_file" ]; then
	          # --arg passes the skill as data, so it cannot alter the jq program.
	          baseline_id=$(jq -r --arg skill "$EVAL_SKILL" '.baselines[$skill].baselineId // empty' "$baseline_file")
	          if [ -n "$baseline_id" ]; then
	            echo "baseline_id=$baseline_id" >> "$GITHUB_OUTPUT"
	            echo "has_baseline=true" >> "$GITHUB_OUTPUT"
	            echo "Found baseline for ${EVAL_SKILL}: $baseline_id"
	          else
	            echo "has_baseline=false" >> "$GITHUB_OUTPUT"
	            echo "No baseline found for ${EVAL_SKILL}"
	          fi
	        else
	          echo "has_baseline=false" >> "$GITHUB_OUTPUT"
	          echo "No baseline file found"
	        fi

	    - name: Run eval for ${{ matrix.skill }}
	      id: run-eval
	      working-directory: eval-tooling
	      env:
	        AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }}
	        AXIOM_TOKEN: ${{ secrets.AXIOM_TOKEN }}
	        AXIOM_DATASET: ${{ secrets.AXIOM_DATASET }}
	        AXIOM_URL: ${{ secrets.AXIOM_URL }}
	        AXIOM_PLAY_URL: ${{ secrets.AXIOM_PLAY_URL }}
	        AXIOM_PLAY_TOKEN: ${{ secrets.AXIOM_PLAY_TOKEN }}
	        AXIOM_PLAY_ORG_ID: ${{ secrets.AXIOM_PLAY_ORG_ID }}
	        # Untrusted or derived values (branch name, skill name, baseline ID)
	        # are read from the environment rather than expanded into the script,
	        # which would allow shell injection via e.g. a crafted branch name.
	        EVAL_SKILL: ${{ matrix.skill }}
	        EVAL_GIT_BRANCH: ${{ github.head_ref || github.ref_name }}
	        EVAL_GIT_COMMIT: ${{ github.sha }}
	        EVAL_HAS_BASELINE: ${{ steps.baseline.outputs.has_baseline }}
	        EVAL_BASELINE_ID: ${{ steps.baseline.outputs.baseline_id }}
	      run: |
	        echo "Running eval for skill: ${EVAL_SKILL}"

	        # Build eval command with optional baseline and git metadata.
	        # An array keeps every argument intact regardless of its contents.
	        eval_cmd=(pnpm exec axiom eval "../skills/${EVAL_SKILL}/.meta/${EVAL_SKILL}.eval.ts")
	        eval_cmd+=("--flag.git.branch=${EVAL_GIT_BRANCH}")
	        eval_cmd+=("--flag.git.commit=${EVAL_GIT_COMMIT}")
	        if [ "$EVAL_HAS_BASELINE" = "true" ]; then
	          eval_cmd+=(--baseline "$EVAL_BASELINE_ID")
	          echo "Comparing against baseline: ${EVAL_BASELINE_ID}"
	        fi

	        # Capture output while still displaying it.
	        # NOTE(review): no `pipefail` is set here, so this pipeline reports
	        # tee's exit status and a failing eval does not fail the step.
	        # Confirm that is intended.
	        "${eval_cmd[@]}" 2>&1 | tee eval-output-raw.txt

	        # Strip ANSI escape codes
	        sed 's/\x1b\[[0-9;]*m//g' eval-output-raw.txt > eval-output.txt

	        # Extract trace ID from output for potential baseline update
	        # URL format: https://app.axiom.co/.../evaluations/<name>/<traceId>?...
	        trace_id=$(grep -oE '/evaluations/[^/]+/[A-Z0-9]+' eval-output.txt | head -1 | sed 's|.*/||' || true)
	        if [ -z "$trace_id" ]; then
	          # Fallback: try trace_id= pattern
	          trace_id=$(grep -oE 'trace_id=[a-zA-Z0-9-]+' eval-output.txt | head -1 | cut -d= -f2 || true)
	        fi
	        echo "trace_id=$trace_id" >> "$GITHUB_OUTPUT"

	        # Extract scores section (between header and "View eval result"),
	        # dropping the two delimiter lines and any blank lines.
	        sed -n '/FINAL EVALUATION REPORT/,/^View eval result:/p' eval-output.txt | \
	          grep -v 'FINAL EVALUATION REPORT' | \
	          grep -v '^View eval result:' | \
	          sed '/^[[:space:]]*$/d' > report-section.txt

	        # Extract the eval URL (the line following "View eval result:")
	        eval_url=$(grep -A1 "View eval result:" eval-output.txt | tail -1 | tr -d ' ' || true)
	        echo "eval_url=$eval_url" >> "$GITHUB_OUTPUT"

	    - name: Save eval report
	      env:
	        EVAL_SKILL: ${{ matrix.skill }}
	        EVAL_GIT_BRANCH: ${{ github.head_ref || github.ref_name }}
	        EVAL_GIT_COMMIT: ${{ github.sha }}
	        EVAL_HAS_BASELINE: ${{ steps.baseline.outputs.has_baseline }}
	        EVAL_BASELINE_ID: ${{ steps.baseline.outputs.baseline_id }}
	        EVAL_URL: ${{ steps.run-eval.outputs.eval_url }}
	      run: |
	        mkdir -p eval-reports
	        report="eval-reports/${EVAL_SKILL}.md"

	        echo "### ${EVAL_SKILL}" > "$report"
	        echo "" >> "$report"
	        echo "commit: \`${EVAL_GIT_COMMIT}\`" >> "$report"
	        echo "branch: \`${EVAL_GIT_BRANCH}\`" >> "$report"

	        if [ "$EVAL_HAS_BASELINE" = "true" ]; then
	          echo "baseline: \`${EVAL_BASELINE_ID}\`" >> "$report"
	        else
	          echo "baseline: _none (first eval)_" >> "$report"
	        fi

	        # Scores go in a fenced block so their column alignment survives markdown.
	        echo "" >> "$report"
	        echo '```' >> "$report"
	        cat eval-tooling/report-section.txt >> "$report"
	        echo '```' >> "$report"

	        if [ -n "$EVAL_URL" ]; then
	          echo "" >> "$report"
	          echo "[view eval result](${EVAL_URL})" >> "$report"
	        fi

	    - name: Upload eval report
	      uses: actions/upload-artifact@v4
	      with:
	        name: eval-report-${{ matrix.skill }}
	        path: eval-reports/${{ matrix.skill }}.md
	        retention-days: 7

	# Collects every per-skill report artifact into one markdown comment and
	# creates or updates a single marker-tagged comment on the pull request.
	post-comment:
	  needs: [detect-changes, run-evals]
	  # always(): still report whatever finished when some matrix legs failed.
	  if: always() && needs.detect-changes.outputs.has_evals == 'true'
	  runs-on: ubuntu-latest
	  permissions:
	    pull-requests: write
	  steps:
	    - name: Download all eval reports
	      uses: actions/download-artifact@v4
	      with:
	        pattern: eval-report-*
	        path: eval-reports
	        merge-multiple: true

	    - name: Build combined comment
	      id: build-comment
	      run: |
	        # Create the combined comment
	        cat > comment.md << 'EOF'
	        ## Skill Evaluation Results

	        EOF

	        # Append all individual reports.
	        # The -f test skips the literal glob when no report was downloaded.
	        for report in eval-reports/*.md; do
	          if [ -f "$report" ]; then
	            cat "$report" >> comment.md
	            echo "" >> comment.md
	            echo "---" >> comment.md
	            echo "" >> comment.md
	          fi
	        done

	        # Add footer
	        cat >> comment.md << 'EOF'

	        <sub>generated by skill evaluation workflow</sub>
	        EOF

	        cat comment.md

	    - name: Post or update PR comment
	      uses: actions/github-script@v7
	      with:
	        script: |
	          // Only post comment on PRs
	          const pr = context.payload.pull_request;
	          if (!pr) {
	            console.log('Not a PR, skipping comment');
	            return;
	          }

	          const fs = require('fs');
	          const commentBody = fs.readFileSync('comment.md', 'utf8');
	          // Hidden marker used to find this workflow's comment on later runs.
	          const marker = '<!-- skill-eval-results -->';
	          const fullBody = marker + '\n' + commentBody;

	          const { owner, repo } = context.repo;

	          // Walk every page of comments: a single listComments call returns
	          // only the first page, so on a busy PR the existing comment would
	          // be missed and a duplicate posted on each run.
	          const comments = await github.paginate(github.rest.issues.listComments, {
	            owner,
	            repo,
	            issue_number: pr.number,
	            per_page: 100,
	          });

	          const existingComment = comments.find(c => c.body?.includes(marker));

	          if (existingComment) {
	            await github.rest.issues.updateComment({
	              owner,
	              repo,
	              comment_id: existingComment.id,
	              body: fullBody,
	            });
	            console.log('Updated existing comment');
	          } else {
	            await github.rest.issues.createComment({
	              owner,
	              repo,
	              issue_number: pr.number,
	              body: fullBody,
	            });
	            console.log('Created new comment');
	          }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: writing evals skill #56

Workflow file

feat: writing evals skill #56

Uh oh!

Workflow file for this run