From 4b9fd8b4830e9c40cea301ac29df27496a1c2345 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 14:05:41 +0200 Subject: [PATCH 1/3] feat(core): update grading artifact contract --- apps/cli/src/commands/pipeline/bench.ts | 16 +++- apps/cli/src/commands/results/manifest.ts | 24 +++++- apps/cli/src/commands/results/validate.ts | 4 +- apps/cli/test/commands/eval/aggregate.test.ts | 4 +- .../commands/eval/artifact-writer.test.ts | 76 ++++++++++++------ .../test/commands/eval/pipeline/bench.test.ts | 2 +- .../results/export-e2e-providers.test.ts | 11 ++- apps/cli/test/commands/results/export.test.ts | 20 +++-- .../cli/test/commands/results/summary.test.ts | 15 +++- .../docs/docs/reference/result-artifacts.mdx | 51 +++++++++++- .../evaluation/graders/llm-grader-prompt.ts | 4 + .../core/src/evaluation/graders/llm-grader.ts | 13 ++++ packages/core/src/evaluation/run-artifacts.ts | 77 +++++++++++++------ 13 files changed, 243 insertions(+), 74 deletions(-) diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 325226c9d..daa95683c 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -26,6 +26,16 @@ interface EvaluatorScore { readonly assertions: readonly { text: string; passed: boolean; evidence?: string }[]; } +function toAssertionResult(assertion: { text: string; passed: boolean; evidence?: string }) { + return { + text: assertion.text, + passed: assertion.passed, + evidence: assertion.evidence ?? '', + score: assertion.passed ? 1 : 0, + verdict: assertion.passed ? 'pass' : 'fail', + }; +} + export const evalBenchCommand = command({ name: 'bench', description: 'Merge grader scores and produce benchmark artifacts', @@ -130,14 +140,18 @@ export const evalBenchCommand = command({ // Write grading.json const grading = { - assertions: allAssertions, + score: Math.round(weightedScore * 1000) / 1000, + verdict: weightedScore >= DEFAULT_THRESHOLD ? 'pass' : 'fail', + assertion_results: allAssertions.map(toAssertionResult), summary: { passed, failed, total: allAssertions.length, pass_rate: passRate }, graders: evaluators.map((e) => ({ name: e.name, type: e.type, score: e.score, + verdict: e.score >= DEFAULT_THRESHOLD ? 'pass' : 'fail', reasoning: '', weight: e.weight, + assertion_results: e.assertions.map(toAssertionResult), })), }; await writeFile( diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 412757a59..2f331549b 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -229,6 +229,13 @@ function hydrateManifestRecord( const grading = readOptionalJson(baseDir, record.grading_path); const timing = readOptionalJson(baseDir, record.timing_path); const testId = record.test_id ?? 'unknown'; + const gradingAssertions = + grading?.assertion_results ?? + ( + grading as + | (GradingArtifact & { assertions?: GradingArtifact['assertion_results'] }) + | undefined + )?.assertions; return { timestamp: record.timestamp, @@ -240,7 +247,7 @@ function hydrateManifestRecord( score: record.score, executionStatus: record.execution_status, error: record.error, - assertions: grading?.assertions.map((assertion) => ({ + assertions: gradingAssertions?.map((assertion) => ({ text: assertion.text, passed: assertion.passed, evidence: assertion.evidence, @@ -256,8 +263,8 @@ function hydrateManifestRecord( name: evaluator.name, type: evaluator.type, score: evaluator.score, - assertions: Array.isArray(evaluator.assertions) - ? evaluator.assertions.map((assertion) => ({ + assertions: Array.isArray(evaluator.assertion_results) + ? evaluator.assertion_results.map((assertion) => ({ text: String((assertion as Record).text ?? ''), passed: Boolean((assertion as Record).passed), evidence: @@ -265,7 +272,16 @@ function hydrateManifestRecord( ? String((assertion as Record).evidence) : undefined, })) - : undefined, + : Array.isArray((evaluator as Record).assertions) + ? ((evaluator as Record).assertions as Record[]).map( + (assertion) => ({ + text: String(assertion.text ?? ''), + passed: Boolean(assertion.passed), + evidence: + typeof assertion.evidence === 'string' ? String(assertion.evidence) : undefined, + }), + ) + : undefined, weight: typeof evaluator.weight === 'number' ? evaluator.weight : undefined, verdict: typeof evaluator.verdict === 'string' ? evaluator.verdict : undefined, details: evaluator.details, diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index f9d102e87..69e82da7a 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -274,10 +274,10 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] } else { try { const grading = JSON.parse(readFileSync(gradingPath, 'utf8')); - if (!grading.assertions || !Array.isArray(grading.assertions)) { + if (!grading.assertion_results || !Array.isArray(grading.assertion_results)) { diagnostics.push({ severity: 'error', - message: `${testId}: grading.json missing 'assertions' array`, + message: `${testId}: grading.json missing 'assertion_results' array`, }); } if (!grading.summary) { diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts index b6a7763a9..73d163fcf 100644 --- a/apps/cli/test/commands/eval/aggregate.test.ts +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -291,7 +291,7 @@ describe('writePerTestArtifacts', () => { const grading1 = JSON.parse( readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), ); - expect(grading1.assertions).toHaveLength(1); + expect(grading1.assertion_results).toHaveLength(1); const timing1 = JSON.parse( readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), @@ -301,7 +301,7 @@ describe('writePerTestArtifacts', () => { const grading2 = JSON.parse( readFileSync(rowRunPath(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), ); - expect(grading2.assertions).toHaveLength(1); + expect(grading2.assertion_results).toHaveLength(1); }); it('writes outputs/answer.md for results with output', async () => { diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 42ec16517..8a9b63d3a 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -113,7 +113,7 @@ function runArtifactPath( // --------------------------------------------------------------------------- describe('buildGradingArtifact', () => { - it('maps evaluator assertions to grading assertions', () => { + it('maps evaluator assertions to grading assertion_results', () => { const result = makeResult({ assertions: [ { text: 'correct format', passed: true }, @@ -124,22 +124,31 @@ describe('buildGradingArtifact', () => { const grading = buildGradingArtifact(result); - expect(grading.assertions).toHaveLength(3); - expect(grading.assertions[0]).toEqual({ + expect(grading).not.toHaveProperty('assertions'); + expect(grading.assertion_results).toHaveLength(3); + expect(grading.assertion_results[0]).toEqual({ text: 'correct format', passed: true, evidence: '', + score: 1, + verdict: 'pass', }); - expect(grading.assertions[1]).toEqual({ + expect(grading.assertion_results[1]).toEqual({ text: 'has code', passed: true, evidence: '', + score: 1, + verdict: 'pass', }); - expect(grading.assertions[2]).toEqual({ + expect(grading.assertion_results[2]).toEqual({ text: 'missing tests', passed: false, evidence: '', + score: 0, + verdict: 'fail', }); + expect(grading.score).toBe(0.9); + expect(grading.verdict).toBe('pass'); }); it('computes correct summary', () => { @@ -239,11 +248,11 @@ describe('buildGradingArtifact', () => { const grading = buildGradingArtifact(result); - expect(grading.assertions).toHaveLength(3); - expect(grading.assertions[0].text).toBe('ok-1'); - expect(grading.assertions[0].passed).toBe(true); - expect(grading.assertions[2].text).toBe('miss-1'); - expect(grading.assertions[2].passed).toBe(false); + expect(grading.assertion_results).toHaveLength(3); + expect(grading.assertion_results[0].text).toBe('ok-1'); + expect(grading.assertion_results[0].passed).toBe(true); + expect(grading.assertion_results[2].text).toBe('miss-1'); + expect(grading.assertion_results[2].passed).toBe(false); }); it('includes evaluators list with AgentV extensions', () => { @@ -289,7 +298,10 @@ describe('buildGradingArtifact', () => { const grading = buildGradingArtifact(result); - expect(grading.assertions).toEqual(rubricAssertions); + expect(grading.assertion_results).toEqual([ + { ...rubricAssertions[0], score: 1, verdict: 'pass' }, + { ...rubricAssertions[1], score: 0, verdict: 'fail' }, + ]); expect(grading.summary).toEqual({ passed: 1, failed: 1, @@ -300,7 +312,10 @@ describe('buildGradingArtifact', () => { name: 'rubric-review', type: 'llm-grader', score: 0.6, - assertions: rubricAssertions, + assertion_results: [ + { ...rubricAssertions[0], score: 1, verdict: 'pass' }, + { ...rubricAssertions[1], score: 0, verdict: 'fail' }, + ], }); }); @@ -314,7 +329,7 @@ describe('buildGradingArtifact', () => { const result = makeResult({ assertions: [], scores: undefined }); const grading = buildGradingArtifact(result); - expect(grading.assertions).toHaveLength(0); + expect(grading.assertion_results).toHaveLength(0); expect(grading.summary).toEqual({ passed: 0, failed: 0, @@ -547,7 +562,7 @@ describe('buildRunSummaryArtifact', () => { // --------------------------------------------------------------------------- describe('buildAggregateGradingArtifact', () => { - it('combines assertions from multiple results with test_id', () => { + it('combines assertion_results from multiple results with test_id', () => { const results = [ makeResult({ testId: 'test-alpha', @@ -564,24 +579,30 @@ describe('buildAggregateGradingArtifact', () => { const aggregate = buildAggregateGradingArtifact(results); - expect(aggregate.assertions).toHaveLength(3); - expect(aggregate.assertions[0]).toEqual({ + expect(aggregate.assertion_results).toHaveLength(3); + expect(aggregate.assertion_results[0]).toEqual({ test_id: 'test-alpha', text: 'criterion-1', passed: true, evidence: 'looks good', + score: 1, + verdict: 'pass', }); - expect(aggregate.assertions[1]).toEqual({ + expect(aggregate.assertion_results[1]).toEqual({ test_id: 'test-alpha', text: 'criterion-2', passed: false, evidence: '', + score: 0, + verdict: 'fail', }); - expect(aggregate.assertions[2]).toEqual({ + expect(aggregate.assertion_results[2]).toEqual({ test_id: 'test-beta', text: 'criterion-3', passed: true, evidence: '', + score: 1, + verdict: 'pass', }); }); @@ -624,8 +645,8 @@ describe('buildAggregateGradingArtifact', () => { const aggregate = buildAggregateGradingArtifact(results); - expect(aggregate.assertions).toHaveLength(1); - expect(aggregate.assertions[0].test_id).toBe('test-1'); + expect(aggregate.assertion_results).toHaveLength(1); + expect(aggregate.assertion_results[0].test_id).toBe('test-1'); expect(aggregate.summary.total).toBe(1); expect(aggregate.summary.passed).toBe(1); expect(aggregate.summary.failed).toBe(0); @@ -646,12 +667,14 @@ describe('buildAggregateGradingArtifact', () => { const aggregate = buildAggregateGradingArtifact(results); - expect(aggregate.assertions).toEqual([ + expect(aggregate.assertion_results).toEqual([ { test_id: 'quality-pass', text: 'quality criterion', passed: true, evidence: '', + score: 1, + verdict: 'pass', }, ]); expect(aggregate.summary).toEqual({ @@ -665,7 +688,7 @@ describe('buildAggregateGradingArtifact', () => { it('handles empty results array', () => { const aggregate = buildAggregateGradingArtifact([]); - expect(aggregate.assertions).toHaveLength(0); + expect(aggregate.assertion_results).toHaveLength(0); expect(aggregate.summary).toEqual({ passed: 0, failed: 0, @@ -950,13 +973,18 @@ describe('schema compatibility', () => { }); const grading = buildGradingArtifact(result); - for (const exp of grading.assertions) { + expect(grading).not.toHaveProperty('assertions'); + for (const exp of grading.assertion_results) { expect(exp).toHaveProperty('text'); expect(exp).toHaveProperty('passed'); expect(exp).toHaveProperty('evidence'); + expect(exp).toHaveProperty('score'); + expect(exp).toHaveProperty('verdict'); expect(typeof exp.text).toBe('string'); expect(typeof exp.passed).toBe('boolean'); expect(typeof exp.evidence).toBe('string'); + expect(typeof exp.score).toBe('number'); + expect(['pass', 'fail']).toContain(exp.verdict); } }); @@ -2007,7 +2035,7 @@ describe('writeArtifactsFromResults', () => { await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'grading.json'), 'utf8'), ); - expect(grading.assertions[0].text).toBe('baseline-check'); + expect(grading.assertion_results[0].text).toBe('baseline-check'); }); it('uses distinct row ids for the same test id across targets', async () => { diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index a52fe210e..c9b9ab0a6 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -73,7 +73,7 @@ describe('pipeline bench', () => { const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8')); expect(grading.summary.pass_rate).toBeGreaterThan(0); - expect(grading.assertions.length).toBeGreaterThan(0); + expect(grading.assertion_results.length).toBeGreaterThan(0); expect(grading.graders).toHaveLength(2); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 991d29a91..6756fb909 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -500,9 +500,14 @@ describe('export e2e — multi-provider metrics verification', () => { ), ); - expect(grading.assertions).toHaveLength(2); - expect(grading.assertions[0].text).toBe('Correct answer'); - expect(grading.assertions[0].evidence).toBe('Matched expected output'); + expect(grading).not.toHaveProperty('assertions'); + expect(grading.score).toBe(1); + expect(grading.verdict).toBe('pass'); + expect(grading.assertion_results).toHaveLength(2); + expect(grading.assertion_results[0].text).toBe('Correct answer'); + expect(grading.assertion_results[0].evidence).toBe('Matched expected output'); + expect(grading.assertion_results[0].score).toBe(1); + expect(grading.assertion_results[0].verdict).toBe('pass'); expect(grading.summary.passed).toBe(2); expect(grading.summary.failed).toBe(0); expect(grading.summary.pass_rate).toBe(1.0); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 90860e80a..91dafed15 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -722,12 +722,17 @@ describe('results export', () => { const grading: GradingArtifact = JSON.parse(readFileSync(gradingPath, 'utf8')); - // Uses artifact-writer's assertions field - expect(grading.assertions).toBeDefined(); - expect(grading.assertions.length).toBeGreaterThan(0); - expect(grading.assertions[0]).toHaveProperty('text'); - expect(grading.assertions[0]).toHaveProperty('passed'); - expect(grading.assertions[0]).toHaveProperty('evidence'); + // Uses artifact-writer's assertion_results field + expect(grading).not.toHaveProperty('assertions'); + expect(grading.score).toBe(1); + expect(grading.verdict).toBe('pass'); + expect(grading.assertion_results).toBeDefined(); + expect(grading.assertion_results.length).toBeGreaterThan(0); + expect(grading.assertion_results[0]).toHaveProperty('text'); + expect(grading.assertion_results[0]).toHaveProperty('passed'); + expect(grading.assertion_results[0]).toHaveProperty('evidence'); + expect(grading.assertion_results[0]).toHaveProperty('score'); + expect(grading.assertion_results[0]).toHaveProperty('verdict'); // Has summary expect(grading.summary).toBeDefined(); @@ -864,7 +869,8 @@ describe('results export', () => { expect(existsSync(gradingPath)).toBe(true); const grading: GradingArtifact = JSON.parse(readFileSync(gradingPath, 'utf8')); - expect(grading.assertions).toEqual([]); + expect(grading).not.toHaveProperty('assertions'); + expect(grading.assertion_results).toEqual([]); expect(grading.summary.total).toBe(0); }); diff --git a/apps/cli/test/commands/results/summary.test.ts b/apps/cli/test/commands/results/summary.test.ts index a88c95742..dbd1e40ec 100644 --- a/apps/cli/test/commands/results/summary.test.ts +++ b/apps/cli/test/commands/results/summary.test.ts @@ -76,10 +76,17 @@ describe('formatSummary', () => { describe('formatSummary with grading artifact', () => { it('uses assertion counts from grading artifact when provided', () => { const grading: AggregateGradingArtifact = { - assertions: [ - { test_id: 'test-1', text: 'a', passed: true, evidence: '' }, - { test_id: 'test-1', text: 'b', passed: false, evidence: 'missing' }, - { test_id: 'test-2', text: 'c', passed: true, evidence: '' }, + assertion_results: [ + { test_id: 'test-1', text: 'a', passed: true, evidence: '', score: 1, verdict: 'pass' }, + { + test_id: 'test-1', + text: 'b', + passed: false, + evidence: 'missing', + score: 0, + verdict: 'fail', + }, + { test_id: 'test-2', text: 'c', passed: true, evidence: '', score: 1, verdict: 'pass' }, ], summary: { passed: 2, failed: 1, total: 3, pass_rate: 0.667 }, }; diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx index 76e04c3f7..7a60dd016 100644 --- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx @@ -87,7 +87,7 @@ reserved for rebuildable local state and are skipped by run discovery. | `summary.json` | Aggregate run metadata and rollups: run id, experiment metadata, counts, pass rate, score summaries, duration, token/cost totals, and writer metadata. | Listing runs, CI summaries, quick dashboards, trend cards, and validating that a run is complete enough to inspect. | | `index.jsonl` | Canonical row index: one row per result, attempt, or case-level aggregate, with identity fields, filter metadata, scores, status, and explicit run-relative paths to sidecars. | Filtering, compare/trend inputs, Dashboard detail routing, rerun/resume lookup, export adapters, and artifact discovery. | | `result.json` | Compact per-attempt manifest for one attempt directory, including AgentV `execution_status` and `verdict`. | Loading one attempt without scanning the whole run index. | -| `grading.json` | Grader outputs, assertions, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. | +| `grading.json` | Grader outputs, `assertion_results`, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. | | `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. | | `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and script graders still receive the same full diff through `file_changes`. | | `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. | @@ -102,6 +102,55 @@ reader should not parse aggregate summary structures to find one case's grading or transcript. Keep aggregate questions on `summary.json`; keep row and artifact discovery on `index.jsonl`. +## Grading Contract + +Each per-attempt `grading.json` uses `assertion_results` for the public +per-criterion rows. The internal grader API and eval YAML still use +`assertions`; the sidecar converts those rows at the artifact boundary. + +```json +{ + "score": 0.5, + "verdict": "fail", + "assertion_results": [ + { + "text": "Answer cites the changed file", + "passed": true, + "evidence": "The answer cites src/refunds.ts.", + "score": 1, + "verdict": "pass" + }, + { + "text": "Tests were updated", + "passed": false, + "evidence": "No test file path or diff was provided.", + "score": 0, + "verdict": "fail" + } + ], + "summary": { + "passed": 1, + "failed": 1, + "total": 2, + "pass_rate": 0.5 + }, + "graders": [ + { + "name": "implementation_review", + "type": "llm-grader", + "score": 0.5, + "verdict": "fail", + "assertion_results": [] + } + ] +} +``` + +`score` values are normalized to the `0..1` range. `verdict` is `pass`, +`fail`, or `skip` at the artifact level, and `pass` or `fail` on individual +assertion rows. Evidence stays in `grading.json` so the sidecar remains useful +without loading traces. + ## Row Contract Each `index.jsonl` line is a JSON object. The exact field set grows as AgentV diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index 185960d93..57a0c04d7 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -187,6 +187,8 @@ function assembleChecklist( const parts: string[] = [ 'You are an expert grader. Evaluate the candidate answer against each rubric item below.', + 'Be skeptical: mark a rubric satisfied only when the answer, file changes, tool calls, or concrete workspace paths support it.', + 'Cite paths, diffs, tool calls, or answer excerpts in reasoning whenever they are available.', '', '[[ ## question ## ]]', formattedQuestion, @@ -257,6 +259,8 @@ function assembleScoreRange( const parts: string[] = [ 'You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.', 'For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.', + 'Be skeptical: award credit only for evidence supported by the answer, file changes, tool calls, or concrete workspace paths.', + 'Cite paths, diffs, tool calls, or answer excerpts in reasoning whenever they are available.', '', '[[ ## question ## ]]', formattedQuestion, diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 6194e4d50..3a3b9fecd 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -74,6 +74,8 @@ export const DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is t Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit. +Be skeptical. Award credit only for behavior supported by the answer, file_changes, tool_calls, or referenced workspace paths. When evaluating repo or file work, cite concrete paths, diffs, tool calls, or answer excerpts in each assertion's evidence whenever they are available. Do not infer hidden work from intent or plausible next steps. + Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations. [[ ## criteria ## ]] @@ -691,6 +693,8 @@ export class LlmGrader implements Grader { 'You are an expert grader with access to the workspace filesystem.', 'Use the provided tools to investigate the workspace and verify the criteria are met.', 'Thoroughly examine relevant files before making your assessment.', + 'Be skeptical: award credit only for evidence you can support with the answer, file changes, tool calls, or concrete workspace paths.', + 'Each assertion evidence should cite paths, diffs, tool calls, or answer excerpts when available.', '', ]; @@ -725,6 +729,7 @@ export class LlmGrader implements Grader { const parts: string[] = [ 'Evaluate the candidate answer by investigating the workspace.', + 'Be skeptical: verify claims against concrete workspace paths, file changes, tool calls, or answer excerpts.', '', '[[ ## question ## ]]', formattedQuestion, @@ -794,6 +799,8 @@ export class LlmGrader implements Grader { const parts: string[] = [ 'You are an expert grader. Investigate the workspace to verify the criteria are met.', + 'Be skeptical: award credit only for evidence you can support with the answer, file changes, tool calls, or concrete workspace paths.', + 'Each assertion evidence should cite paths, diffs, tool calls, or answer excerpts when available.', '', '[[ ## question ## ]]', formattedQuestion, @@ -920,6 +927,8 @@ export class LlmGrader implements Grader { const parts: string[] = [ 'You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.', 'For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.', + 'Be skeptical: award credit only for evidence supported by the answer, file changes, tool calls, or concrete workspace paths.', + 'Cite paths, diffs, tool calls, or answer excerpts in reasoning whenever they are available.', '', '[[ ## question ## ]]', formattedQuestion, @@ -996,6 +1005,8 @@ export class LlmGrader implements Grader { const parts: string[] = [ 'You are an expert grader. Evaluate the candidate answer against each rubric item below.', + 'Be skeptical: mark a rubric satisfied only when the answer, file changes, tool calls, or concrete workspace paths support it.', + 'Cite paths, diffs, tool calls, or answer excerpts in reasoning whenever they are available.', '', '[[ ## question ## ]]', formattedQuestion, @@ -1204,6 +1215,7 @@ function sumTokenUsage( export function buildRubricOutputSchema(): string { return `You are an expert grader. Evaluate the candidate answer against each rubric item. +Be skeptical: mark a rubric satisfied only when concrete evidence supports it, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available. You must return a valid JSON object matching this schema: { "checks": [ @@ -1268,6 +1280,7 @@ export function calculateRubricScore( */ export function buildScoreRangeOutputSchema(): string { return `You are an expert grader. Score the candidate answer on each criterion. +Be skeptical: award credit only for concrete evidence, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available. You must return a valid JSON object matching this schema: { "checks": [ diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 470c82fcc..7a7917b6f 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -24,7 +24,7 @@ import { omitExternalTraceMetadataKeys, toExternalTraceMetadataWire, } from './external-trace.js'; -import { DEFAULT_THRESHOLD } from './graders/scoring.js'; +import { DEFAULT_THRESHOLD, clampScore, scoreToVerdict } from './graders/scoring.js'; import { buildMetricsArtifact } from './metrics.js'; import { type ExportDuplicatePolicy, @@ -321,10 +321,14 @@ function isRunRuntimeSourceMetadata(value: unknown): value is RunRuntimeSourceMe } export interface GradingArtifact { - readonly assertions: readonly { + readonly score: number; + readonly verdict: 'pass' | 'fail' | 'skip'; + readonly assertion_results: readonly { readonly text: string; readonly passed: boolean; readonly evidence: string; + readonly score: number; + readonly verdict: 'pass' | 'fail'; }[]; readonly summary: { readonly passed: number; @@ -337,6 +341,7 @@ export interface GradingArtifact { readonly type: string; readonly score: number; readonly reasoning: string; + readonly assertion_results: readonly GradingAssertionResult[]; readonly [key: string]: unknown; }[]; readonly workspace_changes?: { @@ -458,11 +463,13 @@ export interface RunSummaryArtifact { } export interface AggregateGradingArtifact { - readonly assertions: readonly { + readonly assertion_results: readonly { readonly test_id: string; readonly text: string; readonly passed: boolean; readonly evidence: string; + readonly score: number; + readonly verdict: 'pass' | 'fail'; }[]; readonly summary: { readonly passed: number; @@ -472,6 +479,8 @@ export interface AggregateGradingArtifact { }; } +type GradingAssertionResult = GradingArtifact['assertion_results'][number]; + export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; @@ -672,13 +681,30 @@ function parseWorkspaceChanges( }; } -function buildAssertions(result: EvaluationResult): GradingArtifact['assertions'] { - if (!result.assertions) return []; - return result.assertions.map((a) => ({ - text: a.text, - passed: a.passed, - evidence: a.evidence ?? '', - })); +function assertionResultFromAssertion(assertion: EvaluationResult['assertions'][number]) { + const passed = assertion.passed; + return { + text: assertion.text, + passed, + evidence: assertion.evidence ?? '', + score: passed ? 1 : 0, + verdict: passed ? ('pass' as const) : ('fail' as const), + }; +} + +function buildAssertionResults(result: EvaluationResult): GradingArtifact['assertion_results'] { + return (result.assertions ?? []).map(assertionResultFromAssertion); +} + +function resultVerdict(result: EvaluationResult): GradingArtifact['verdict'] { + const scores = result.scores ?? []; + if (scores.length > 0 && scores.every((score) => score.verdict === 'skip')) { + return 'skip'; + } + if (result.executionStatus === 'execution_error') { + return 'fail'; + } + return scoreToVerdict(clampScore(result.score)); } function buildEvaluators(scores: readonly GraderResult[] | undefined): GradingArtifact['graders'] { @@ -693,8 +719,9 @@ function buildEvaluators(scores: readonly GraderResult[] | undefined): GradingAr reasoning: '', weight: s.weight, verdict: s.verdict, - assertions: s.assertions, + assertion_results: (s.assertions ?? []).map(assertionResultFromAssertion), details: s.details, + scores: buildEvaluators(s.scores), })); } @@ -1198,14 +1225,16 @@ export function buildGradingArtifact( result: EvaluationResult, options?: { includeTrials?: boolean }, ): GradingArtifact { - const assertions = buildAssertions(result); - const passed = assertions.filter((e) => e.passed).length; - const failed = assertions.filter((e) => !e.passed).length; - const total = assertions.length; + const assertionResults = buildAssertionResults(result); + const passed = assertionResults.filter((e) => e.passed).length; + const failed = assertionResults.filter((e) => !e.passed).length; + const total = assertionResults.length; const includeTrials = options?.includeTrials ?? true; return { - assertions, + score: clampScore(result.score), + verdict: resultVerdict(result), + assertion_results: assertionResults, summary: { passed, failed, @@ -1504,26 +1533,24 @@ export async function writeInitialRunSummaryArtifact( export function buildAggregateGradingArtifact( results: readonly EvaluationResult[], ): AggregateGradingArtifact { - const assertions: AggregateGradingArtifact['assertions'][number][] = []; + const assertionResults: AggregateGradingArtifact['assertion_results'][number][] = []; for (const result of results.filter((r) => !isExecutionError(r))) { const testId = result.testId ?? 'unknown'; for (const assertion of result.assertions ?? []) { - assertions.push({ + assertionResults.push({ test_id: testId, - text: assertion.text, - passed: assertion.passed, - evidence: assertion.evidence ?? '', + ...assertionResultFromAssertion(assertion), }); } } - const passed = assertions.filter((a) => a.passed).length; - const failed = assertions.filter((a) => !a.passed).length; - const total = assertions.length; + const passed = assertionResults.filter((a) => a.passed).length; + const failed = assertionResults.filter((a) => !a.passed).length; + const total = assertionResults.length; return { - assertions, + assertion_results: assertionResults, summary: { passed, failed, From cd81ca51902591aa6ce8c52e8bd9471a774e3d8e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 15:13:24 +0200 Subject: [PATCH 2/3] fix(eval): address grading contract review findings --- apps/cli/src/commands/results/manifest.ts | 115 +++++++++++------ apps/cli/src/commands/results/validate.ts | 9 +- apps/cli/test/commands/results/shared.test.ts | 74 +++++++++++ .../test/commands/results/validate.test.ts | 43 +++++++ .../src/components/EvalDetail.test.ts | 31 +++++ apps/dashboard/src/components/EvalDetail.tsx | 8 +- .../evaluation/graders/llm-grader-prompt.ts | 6 +- .../core/src/evaluation/graders/llm-grader.ts | 45 +++++-- packages/core/test/evaluation/graders.test.ts | 119 ++++++++++++++++++ 9 files changed, 397 insertions(+), 53 deletions(-) create mode 100644 apps/dashboard/src/components/EvalDetail.test.ts diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 2f331549b..af9fb6ce8 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -86,6 +86,74 @@ export interface ManifestHydrationOptions { readonly hydrateTranscriptTrace?: boolean; } +type HydratedScore = NonNullable[number]; + +function mapGradingAssertions( + value: unknown, +): NonNullable | undefined { + if (!Array.isArray(value)) { + return undefined; + } + return value.map((assertion) => { + const record = assertion as Record; + return { + text: String(record.text ?? ''), + passed: Boolean(record.passed), + evidence: typeof record.evidence === 'string' ? record.evidence : undefined, + }; + }); +} + +function readGradingAssertionResults( + record: Record, +): NonNullable | undefined { + return mapGradingAssertions( + Array.isArray(record.assertion_results) ? record.assertion_results : record.assertions, + ); +} + +function readNestedGradingScores(record: Record): unknown { + if (Array.isArray(record.scores)) { + return record.scores; + } + if (Array.isArray(record.graders)) { + return record.graders; + } + if (Array.isArray(record.evaluators)) { + return record.evaluators; + } + return undefined; +} + +function mapGradingEvaluator(evaluator: Record): HydratedScore { + const verdict = + evaluator.verdict === 'pass' || evaluator.verdict === 'fail' || evaluator.verdict === 'skip' + ? evaluator.verdict + : undefined; + const details = + evaluator.details && typeof evaluator.details === 'object' && !Array.isArray(evaluator.details) + ? (evaluator.details as HydratedScore['details']) + : undefined; + + return { + name: String(evaluator.name ?? ''), + type: String(evaluator.type ?? '') as HydratedScore['type'], + score: typeof evaluator.score === 'number' ? evaluator.score : 0, + assertions: readGradingAssertionResults(evaluator) ?? [], + scores: mapGradingEvaluators(readNestedGradingScores(evaluator)), + weight: typeof evaluator.weight === 'number' ? evaluator.weight : undefined, + verdict, + details, + }; +} + +function mapGradingEvaluators(value: unknown): EvaluationResult['scores'] | undefined { + if (!Array.isArray(value)) { + return undefined; + } + return value.map((evaluator) => mapGradingEvaluator(evaluator as Record)); +} + function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] { return content .split(/\r?\n/) @@ -229,13 +297,14 @@ function hydrateManifestRecord( const grading = readOptionalJson(baseDir, record.grading_path); const timing = readOptionalJson(baseDir, record.timing_path); const testId = record.test_id ?? 'unknown'; - const gradingAssertions = - grading?.assertion_results ?? - ( - grading as - | (GradingArtifact & { assertions?: GradingArtifact['assertion_results'] }) - | undefined - )?.assertions; + const gradingAssertions = grading + ? readGradingAssertionResults(grading as unknown as Record) + : undefined; + const gradingScores = mapGradingEvaluators( + grading?.graders ?? + (grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined) + ?.evaluators, + ); return { timestamp: record.timestamp, @@ -255,37 +324,7 @@ function hydrateManifestRecord( scores: // `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts. // TODO: remove `evaluators` fallback once old run directories are no longer in use. - ( - grading?.graders ?? - (grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined) - ?.evaluators - )?.map((evaluator) => ({ - name: evaluator.name, - type: evaluator.type, - score: evaluator.score, - assertions: Array.isArray(evaluator.assertion_results) - ? evaluator.assertion_results.map((assertion) => ({ - text: String((assertion as Record).text ?? ''), - passed: Boolean((assertion as Record).passed), - evidence: - typeof (assertion as Record).evidence === 'string' - ? String((assertion as Record).evidence) - : undefined, - })) - : Array.isArray((evaluator as Record).assertions) - ? ((evaluator as Record).assertions as Record[]).map( - (assertion) => ({ - text: String(assertion.text ?? ''), - passed: Boolean(assertion.passed), - evidence: - typeof assertion.evidence === 'string' ? String(assertion.evidence) : undefined, - }), - ) - : undefined, - weight: typeof evaluator.weight === 'number' ? evaluator.weight : undefined, - verdict: typeof evaluator.verdict === 'string' ? evaluator.verdict : undefined, - details: evaluator.details, - })) ?? (record.scores as EvaluationResult['scores']), + gradingScores ?? (record.scores as EvaluationResult['scores']), tokenUsage: timing?.token_usage ? { input: timing.token_usage.input, diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index 69e82da7a..f580fd36d 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -274,7 +274,14 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] } else { try { const grading = JSON.parse(readFileSync(gradingPath, 'utf8')); - if (!grading.assertion_results || !Array.isArray(grading.assertion_results)) { + if (Array.isArray(grading.assertion_results)) { + // Current grading sidecar contract. + } else if (Array.isArray(grading.assertions)) { + diagnostics.push({ + severity: 'warning', + message: `${testId}: grading.json uses legacy 'assertions' array; rewrite the run to emit 'assertion_results'`, + }); + } else { diagnostics.push({ severity: 'error', message: `${testId}: grading.json missing 'assertion_results' array`, diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index ac7144598..70add7065 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -284,6 +284,80 @@ describe('results shared source resolution', () => { expect(results[0].trace.toolCalls).toEqual({}); }); + it('hydrates nested grader rows recursively from grading artifacts', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); + mkdirSync(path.join(runDir, 'nested-graders'), { recursive: true }); + writeFileSync( + path.join(runDir, 'nested-graders/grading.json'), + `${JSON.stringify({ + score: 1, + verdict: 'pass', + assertion_results: [{ text: 'top-level', passed: true, evidence: 'top evidence' }], + summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 }, + graders: [ + { + name: 'parent', + type: 'assert-set', + score: 1, + assertion_results: [ + { text: 'parent assertion', passed: true, evidence: 'parent evidence' }, + ], + scores: [ + { + name: 'child', + type: 'contains', + score: 1, + assertion_results: [ + { text: 'child assertion', passed: true, evidence: 'child evidence' }, + ], + scores: [ + { + name: 'legacy-grandchild', + type: 'regex', + score: 0, + assertions: [ + { + text: 'legacy grandchild assertion', + passed: false, + evidence: 'legacy child evidence', + }, + ], + }, + ], + }, + ], + }, + ], + })}\n`, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync( + indexPath, + `${JSON.stringify({ + timestamp: '2026-03-25T10:00:00.000Z', + test_id: 'nested-graders', + target: 'codex', + score: 1, + grading_path: 'nested-graders/grading.json', + })}\n`, + ); + + const results = loadManifestResults(indexPath); + + expect(results[0].assertions).toEqual([ + { text: 'top-level', passed: true, evidence: 'top evidence' }, + ]); + expect(results[0].scores?.[0]?.assertions).toEqual([ + { text: 'parent assertion', passed: true, evidence: 'parent evidence' }, + ]); + expect(results[0].scores?.[0]?.scores?.[0]?.assertions).toEqual([ + { text: 'child assertion', passed: true, evidence: 'child evidence' }, + ]); + expect(results[0].scores?.[0]?.scores?.[0]?.scores?.[0]?.assertions).toEqual([ + { text: 'legacy grandchild assertion', passed: false, evidence: 'legacy child evidence' }, + ]); + }); + it('rejects eval-case-only rows with migration guidance', () => { const runDir = path.join(tempDir, '.agentv', 'results', 'default', '2026-03-25T10-00-00-000Z'); mkdirSync(runDir, { recursive: true }); diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts index a33bf01fb..538e061d8 100644 --- a/apps/cli/test/commands/results/validate.test.ts +++ b/apps/cli/test/commands/results/validate.test.ts @@ -107,6 +107,49 @@ describe('results validate', () => { } }); + it('accepts legacy grading assertions with a compatibility warning', () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-validate-test-')); + + try { + const runDir = path.join(tempDir, '.agentv', 'results', '2026-03-27T12-42-24-429Z'); + mkdirSync(path.join(runDir, 'test-greeting'), { recursive: true }); + writeFileSync( + path.join(runDir, 'index.jsonl'), + `${JSON.stringify({ + timestamp: '2026-03-27T12:42:24.429Z', + test_id: 'test-greeting', + score: 1, + target: 'gpt-4o', + execution_status: 'ok', + summary_path: 'test-greeting/summary.json', + grading_path: 'test-greeting/grading.json', + })}\n`, + ); + writeFileSync(path.join(runDir, 'test-greeting', 'summary.json'), '{}\n'); + writeFileSync(path.join(runDir, 'summary.json'), '{}\n'); + writeFileSync( + path.join(runDir, 'test-greeting', 'grading.json'), + `${JSON.stringify({ + score: 1, + verdict: 'pass', + assertions: [{ text: 'legacy assertion', passed: true }], + summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 }, + })}\n`, + ); + + const { diagnostics } = validateRunDirectory(runDir); + + expect(diagnostics.filter((d) => d.severity === 'error')).toEqual([]); + expect(diagnostics).toContainEqual({ + severity: 'warning', + message: + "test-greeting: grading.json uses legacy 'assertions' array; rewrite the run to emit 'assertion_results'", + }); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } + }); + it('accepts new test_dir and legacy task_dir bundle metadata', () => { const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-validate-test-')); diff --git a/apps/dashboard/src/components/EvalDetail.test.ts b/apps/dashboard/src/components/EvalDetail.test.ts new file mode 100644 index 000000000..e8cb31f49 --- /dev/null +++ b/apps/dashboard/src/components/EvalDetail.test.ts @@ -0,0 +1,31 @@ +import { describe, expect, it } from 'bun:test'; + +import { parseGradingArtifact } from './EvalDetail'; + +describe('parseGradingArtifact', () => { + it('reads assertion_results with legacy assertions fallback', () => { + const current = parseGradingArtifact( + JSON.stringify({ + assertion_results: [ + { text: 'Current evidence row', passed: true, evidence: 'from assertion_results' }, + ], + assertions: [{ text: 'Legacy row ignored when current shape exists', passed: false }], + summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 }, + }), + ); + + expect(current?.assertions).toEqual([ + { text: 'Current evidence row', passed: true, evidence: 'from assertion_results' }, + ]); + + const legacy = parseGradingArtifact( + JSON.stringify({ + assertions: [{ text: 'Legacy evidence row', passed: false, evidence: 'from assertions' }], + }), + ); + + expect(legacy?.assertions).toEqual([ + { text: 'Legacy evidence row', passed: false, evidence: 'from assertions' }, + ]); + }); +}); diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx index 5b861ce08..f2c8b2710 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -669,11 +669,15 @@ type ParsedGradingArtifact = { error?: string; }; -function parseGradingArtifact(content: string | undefined): ParsedGradingArtifact | null { +export function parseGradingArtifact(content: string | undefined): ParsedGradingArtifact | null { if (!content) return null; try { const parsed = JSON.parse(content) as Record; - const rawAssertions = Array.isArray(parsed.assertions) ? parsed.assertions : []; + const rawAssertions = Array.isArray(parsed.assertion_results) + ? parsed.assertion_results + : Array.isArray(parsed.assertions) + ? parsed.assertions + : []; const assertions = rawAssertions.flatMap((value): AssertionEntry[] => { if (!value || typeof value !== 'object') return []; const assertion = value as Record; diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index 57a0c04d7..2b518199d 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -5,7 +5,9 @@ import type { PromptInputs } from '../yaml-parser.js'; import { DEFAULT_GRADER_TEMPLATE, buildOutputSchema, + buildRubricFormatInstructions, buildRubricOutputSchema, + buildScoreRangeFormatInstructions, buildScoreRangeOutputSchema, substituteVariables, } from './llm-grader.js'; @@ -151,7 +153,9 @@ function assembleCustom( graderTemplateOverride: string, ): LlmGraderPromptAssembly { const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); - const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema(); + const systemPrompt = hasScoreRanges + ? buildScoreRangeFormatInstructions() + : buildRubricFormatInstructions(); const userPrompt = substituteVariables( graderTemplateOverride, buildTemplateVariables({ diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 3a3b9fecd..ed9f04b0f 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -392,7 +392,10 @@ export class LlmGrader implements Grader { context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics); - const systemPrompt = buildRubricOutputSchema(); + const systemPrompt = + context.graderTemplateOverride || this.graderTemplate + ? buildRubricFormatInstructions() + : buildRubricOutputSchema(); const graderRawRequest: JsonObject = { userPrompt: prompt, @@ -451,7 +454,10 @@ export class LlmGrader implements Grader { context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics); - const systemPrompt = buildScoreRangeOutputSchema(); + const systemPrompt = + context.graderTemplateOverride || this.graderTemplate + ? buildScoreRangeFormatInstructions() + : buildScoreRangeOutputSchema(); const graderRawRequest: JsonObject = { userPrompt: prompt, @@ -791,8 +797,13 @@ export class LlmGrader implements Grader { const variables = buildTemplateVariables(context); const customPrompt = substituteVariables(template, variables); + const hasScoreRanges = rubrics?.some((r) => r.score_ranges && r.score_ranges.length > 0); const outputSchema = - rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); + rubrics && rubrics.length > 0 + ? hasScoreRanges + ? buildScoreRangeFormatInstructions() + : buildRubricFormatInstructions() + : buildOutputSchema(); return `${customPrompt}\n\n${outputSchema}`; } @@ -1213,10 +1224,8 @@ function sumTokenUsage( }; } -export function buildRubricOutputSchema(): string { - return `You are an expert grader. Evaluate the candidate answer against each rubric item. -Be skeptical: mark a rubric satisfied only when concrete evidence supports it, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available. -You must return a valid JSON object matching this schema: +export function buildRubricFormatInstructions(): string { + return `You must return a valid JSON object matching this schema: { "checks": [ { @@ -1229,6 +1238,14 @@ You must return a valid JSON object matching this schema: }`; } +export function buildRubricOutputSchema(): string { + return [ + 'You are an expert grader. Evaluate the candidate answer against each rubric item.', + 'Be skeptical: mark a rubric satisfied only when concrete evidence supports it, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available.', + buildRubricFormatInstructions(), + ].join('\n'); +} + export function substituteVariables(template: string, variables: Record): string { return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => { return variables[varName] ?? match; @@ -1278,10 +1295,8 @@ export function calculateRubricScore( /** * Build the output schema for score-range rubric evaluation. */ -export function buildScoreRangeOutputSchema(): string { - return `You are an expert grader. Score the candidate answer on each criterion. -Be skeptical: award credit only for concrete evidence, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available. -You must return a valid JSON object matching this schema: +export function buildScoreRangeFormatInstructions(): string { + return `You must return a valid JSON object matching this schema: { "checks": [ { @@ -1296,6 +1311,14 @@ You must return a valid JSON object matching this schema: Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`; } +export function buildScoreRangeOutputSchema(): string { + return [ + 'You are an expert grader. Score the candidate answer on each criterion.', + 'Be skeptical: award credit only for concrete evidence, and cite paths, diffs, tool calls, or answer excerpts in reasoning when available.', + buildScoreRangeFormatInstructions(), + ].join('\n'); +} + /** * Calculate score from score-range rubric evaluation results. * - Normalizes each criterion score (0-10) to 0-1 by dividing by 10 diff --git a/packages/core/test/evaluation/graders.test.ts b/packages/core/test/evaluation/graders.test.ts index 713e32030..7112974bd 100644 --- a/packages/core/test/evaluation/graders.test.ts +++ b/packages/core/test/evaluation/graders.test.ts @@ -358,6 +358,97 @@ describe('LlmGrader (llm-grader)', () => { expect(result.graderRawRequest?.systemPrompt).not.toContain(customPrompt); }); + it('does not inject skeptical guidance into explicit custom rubric prompts', async () => { + const customPrompt = 'Custom rubric prompt: decide using my policy for {{output}}'; + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + checks: [{ id: 'quality', satisfied: true, reasoning: 'Matches the custom policy.' }], + overall_reasoning: 'Accepted', + }), + }, + ], + }); + + const evaluator = new LlmGrader({ + resolveGraderProvider: async () => graderProvider, + graderTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Answer', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: '' }, + now: new Date(), + evaluator: { + name: 'rubric', + type: 'llm-grader', + rubrics: [{ id: 'quality', outcome: 'Answer follows the policy', weight: 1 }], + }, + }); + + expect(graderProvider.lastRequest?.question).toContain('Custom rubric prompt'); + expect(graderProvider.lastRequest?.systemPrompt).toContain( + 'You must return a valid JSON object matching this schema', + ); + expect(graderProvider.lastRequest?.systemPrompt).not.toContain('Be skeptical'); + expect(graderProvider.lastRequest?.systemPrompt).not.toContain('concrete evidence supports'); + }); + + it('does not inject skeptical guidance into explicit custom score-range prompts', async () => { + const customPrompt = 'Custom score prompt: score {{output}} with my rubric'; + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + checks: [{ id: 'quality', score: 8, reasoning: 'Fits the requested range.' }], + overall_reasoning: 'Strong', + }), + }, + ], + }); + + const evaluator = new LlmGrader({ + resolveGraderProvider: async () => graderProvider, + graderTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Answer', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: '' }, + now: new Date(), + evaluator: { + name: 'score-range', + type: 'llm-grader', + rubrics: [ + { + id: 'quality', + outcome: 'Answer quality', + weight: 1, + score_ranges: [{ score_range: [0, 10], outcome: 'Any valid score' }], + }, + ], + }, + }); + + expect(graderProvider.lastRequest?.question).toContain('Custom score prompt'); + expect(graderProvider.lastRequest?.systemPrompt).toContain( + 'The "score" must be an integer from 0 to 10', + ); + expect(graderProvider.lastRequest?.systemPrompt).not.toContain('Be skeptical'); + expect(graderProvider.lastRequest?.systemPrompt).not.toContain('award credit only'); + }); + it('uses evaluator target overrides when configured', async () => { const defaultGraderProvider = new CapturingProvider( textResponse( @@ -599,6 +690,34 @@ describe('LlmGrader (llm-grader)', () => { expect(prompt.userPrompt).toContain('Contradiction guard'); }); + it('uses format-only schema instructions for custom rubric prompt assembly', () => { + const prompt = assembleLlmGraderPrompt({ + evalCase: baseTestCase, + candidate: 'Revenue did not decline.', + promptInputs: { question: '' }, + graderTemplateOverride: 'Custom rubric prompt for {{output}}', + evaluatorConfig: { + name: 'rubric', + type: 'llm-grader', + rubrics: [ + { + id: 'quality', + outcome: 'Answer follows the requested policy', + weight: 1.0, + required: true, + }, + ], + }, + }); + + expect(prompt.userPrompt).toContain('Custom rubric prompt'); + expect(prompt.systemPrompt).toContain( + 'You must return a valid JSON object matching this schema', + ); + expect(prompt.systemPrompt).not.toContain('Be skeptical'); + expect(prompt.systemPrompt).not.toContain('concrete evidence supports'); + }); + it('passes multi-turn role markers through to evaluator prompts', async () => { const graderProvider = new CapturingProvider({ output: [{ role: 'assistant', content: JSON.stringify({ score: 0.65, assertions: [] }) }], From 864fc93748cfc8b5b9c7b44d09d3e4135a210804 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 18:21:12 +0200 Subject: [PATCH 3/3] fix(core): honor grading artifact verdict status --- .../commands/eval/artifact-writer.test.ts | 62 ++++++++++++++++++- .../cli/test/commands/results/summary.test.ts | 2 + .../docs/docs/reference/result-artifacts.mdx | 5 ++ packages/core/src/evaluation/orchestrator.ts | 2 +- packages/core/src/evaluation/run-artifacts.ts | 37 ++++++++--- 5 files changed, 96 insertions(+), 12 deletions(-) diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 8a9b63d3a..b70c7abae 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -151,6 +151,26 @@ describe('buildGradingArtifact', () => { expect(grading.verdict).toBe('pass'); }); + it('uses execution status for threshold-sensitive top-level verdicts', () => { + const passedBelowDefault = buildGradingArtifact( + makeResult({ + score: 0.7, + executionStatus: 'ok', + }), + ); + const failedAboveDefault = buildGradingArtifact( + makeResult({ + score: 0.85, + executionStatus: 'quality_failure', + }), + ); + + expect(passedBelowDefault.score).toBe(0.7); + expect(passedBelowDefault.verdict).toBe('pass'); + expect(failedAboveDefault.score).toBe(0.85); + expect(failedAboveDefault.verdict).toBe('fail'); + }); + it('computes correct summary', () => { const result = makeResult({ assertions: [ @@ -579,6 +599,8 @@ describe('buildAggregateGradingArtifact', () => { const aggregate = buildAggregateGradingArtifact(results); + expect(aggregate.score).toBe(0.9); + expect(aggregate.verdict).toBe('pass'); expect(aggregate.assertion_results).toHaveLength(3); expect(aggregate.assertion_results[0]).toEqual({ test_id: 'test-alpha', @@ -634,6 +656,38 @@ describe('buildAggregateGradingArtifact', () => { }); }); + it('computes top-level score and verdict from quality result status', () => { + const aggregate = buildAggregateGradingArtifact([ + makeResult({ + testId: 'low-threshold-pass', + score: 0.7, + executionStatus: 'ok', + assertions: [{ text: 'passes under configured threshold', passed: true }], + }), + makeResult({ + testId: 'high-threshold-fail', + score: 0.85, + executionStatus: 'quality_failure', + assertions: [{ text: 'fails under configured threshold', passed: false }], + }), + makeResult({ + testId: 'provider-timeout', + score: 1, + executionStatus: 'execution_error', + assertions: [{ text: 'execution error placeholder', passed: false }], + }), + ]); + + expect(aggregate.score).toBe(0.775); + expect(aggregate.verdict).toBe('fail'); + expect(aggregate.summary).toEqual({ + passed: 1, + failed: 1, + total: 2, + pass_rate: 0.5, + }); + }); + it('handles results with no assertions', () => { const results = [ makeResult({ @@ -647,6 +701,8 @@ describe('buildAggregateGradingArtifact', () => { expect(aggregate.assertion_results).toHaveLength(1); expect(aggregate.assertion_results[0].test_id).toBe('test-1'); + expect(aggregate.score).toBe(0.9); + expect(aggregate.verdict).toBe('pass'); expect(aggregate.summary.total).toBe(1); expect(aggregate.summary.passed).toBe(1); expect(aggregate.summary.failed).toBe(0); @@ -677,6 +733,8 @@ describe('buildAggregateGradingArtifact', () => { verdict: 'pass', }, ]); + expect(aggregate.score).toBe(0.9); + expect(aggregate.verdict).toBe('pass'); expect(aggregate.summary).toEqual({ passed: 1, failed: 0, @@ -688,6 +746,8 @@ describe('buildAggregateGradingArtifact', () => { it('handles empty results array', () => { const aggregate = buildAggregateGradingArtifact([]); + expect(aggregate.score).toBe(0); + expect(aggregate.verdict).toBe('skip'); expect(aggregate.assertion_results).toHaveLength(0); expect(aggregate.summary).toEqual({ passed: 0, @@ -756,7 +816,7 @@ describe('buildIndexArtifactEntry', () => { attempt: 0, run_path: 'run-1', score: 0.9, - verdict: 'pass', + verdict: 'fail', scores: [ { name: 'quality', diff --git a/apps/cli/test/commands/results/summary.test.ts b/apps/cli/test/commands/results/summary.test.ts index dbd1e40ec..1a4ff0183 100644 --- a/apps/cli/test/commands/results/summary.test.ts +++ b/apps/cli/test/commands/results/summary.test.ts @@ -76,6 +76,8 @@ describe('formatSummary', () => { describe('formatSummary with grading artifact', () => { it('uses assertion counts from grading artifact when provided', () => { const grading: AggregateGradingArtifact = { + score: 0.75, + verdict: 'fail', assertion_results: [ { test_id: 'test-1', text: 'a', passed: true, evidence: '', score: 1, verdict: 'pass' }, { diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx index 7a60dd016..4235ec1e7 100644 --- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx @@ -151,6 +151,11 @@ per-criterion rows. The internal grader API and eval YAML still use assertion rows. Evidence stays in `grading.json` so the sidecar remains useful without loading traces. +Aggregate grading artifacts use the same top-level `score` and `verdict` +fields. Their `score` is the mean normalized score across non-execution-error +attempts or cases, while `verdict` reflects the already derived execution +status for those quality results instead of recomputing a default threshold. + ## Row Contract Each `index.jsonl` line is a JSON object. The exact field set grows as AgentV diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index fc15036a0..3c49f9c03 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2434,7 +2434,7 @@ async function runEvalCaseWithTrials( // Extract cost from trace summary if available const trialCost = result.costUsd; - const trialVerdict = scoreToVerdict(result.score); + const trialVerdict = result.executionStatus === 'ok' ? 'pass' : 'fail'; const trial: TrialResult = { attempt, score: result.score, diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 7a7917b6f..d2fad4c87 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -24,7 +24,7 @@ import { omitExternalTraceMetadataKeys, toExternalTraceMetadataWire, } from './external-trace.js'; -import { DEFAULT_THRESHOLD, clampScore, scoreToVerdict } from './graders/scoring.js'; +import { DEFAULT_THRESHOLD, clampScore } from './graders/scoring.js'; import { buildMetricsArtifact } from './metrics.js'; import { type ExportDuplicatePolicy, @@ -463,6 +463,8 @@ export interface RunSummaryArtifact { } export interface AggregateGradingArtifact { + readonly score: number; + readonly verdict: 'pass' | 'fail' | 'skip'; readonly assertion_results: readonly { readonly test_id: string; readonly text: string; @@ -701,10 +703,10 @@ function resultVerdict(result: EvaluationResult): GradingArtifact['verdict'] { if (scores.length > 0 && scores.every((score) => score.verdict === 'skip')) { return 'skip'; } - if (result.executionStatus === 'execution_error') { - return 'fail'; + if (result.executionStatus === 'ok') { + return 'pass'; } - return scoreToVerdict(clampScore(result.score)); + return 'fail'; } function buildEvaluators(scores: readonly GraderResult[] | undefined): GradingArtifact['graders'] { @@ -947,7 +949,7 @@ function buildRepeatCaseSummaryArtifact( const passedRuns = trials.length > 0 ? trials.filter((trial) => trial.verdict === 'pass').length - : result.executionStatus !== 'execution_error' && result.score >= DEFAULT_THRESHOLD + : resultVerdict(result) === 'pass' ? 1 : 0; const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration_ms / totalRuns) : 0; @@ -1045,10 +1047,7 @@ function singleRunTrial(result: EvaluationResult): TrialResult { return { attempt: 0, score: result.score, - verdict: - result.executionStatus !== 'execution_error' && result.score >= DEFAULT_THRESHOLD - ? 'pass' - : 'fail', + verdict: resultVerdict(result), scores: result.scores, error: result.error, costUsd: result.costUsd, @@ -1534,8 +1533,9 @@ export function buildAggregateGradingArtifact( results: readonly EvaluationResult[], ): AggregateGradingArtifact { const assertionResults: AggregateGradingArtifact['assertion_results'][number][] = []; + const qualityResults = results.filter((r) => !isExecutionError(r)); - for (const result of results.filter((r) => !isExecutionError(r))) { + for (const result of qualityResults) { const testId = result.testId ?? 'unknown'; for (const assertion of result.assertions ?? []) { assertionResults.push({ @@ -1548,8 +1548,25 @@ export function buildAggregateGradingArtifact( const passed = assertionResults.filter((a) => a.passed).length; const failed = assertionResults.filter((a) => !a.passed).length; const total = assertionResults.length; + const score = + qualityResults.length > 0 + ? Math.round( + (qualityResults.reduce((sum, result) => sum + clampScore(result.score), 0) / + qualityResults.length) * + 1000, + ) / 1000 + : 0; + const verdict = + results.length === 0 + ? 'skip' + : qualityResults.length > 0 && + qualityResults.every((result) => resultVerdict(result) === 'pass') + ? 'pass' + : 'fail'; return { + score, + verdict, assertion_results: assertionResults, summary: { passed,