Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/task-bundle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ async function copyDirectory(sourcePath: string, destinationPath: string): Promi
}

function shouldCopyDirectory(reference: BundleSourceReference): boolean {
if (reference.kind !== 'code_grader_cwd') {
if (reference.kind !== 'script_grader_cwd' && reference.kind !== 'code_grader_cwd') {
return true;
}
return !path.isAbsolute(reference.displayPath);
Expand Down
5 changes: 3 additions & 2 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ export async function runCodeGraders(
const executeCodeGrader = async (graderConfig: Record<string, unknown>, task: GraderTask) => {
const { testId, resultsDir, responseText, inputData } = task;
const graderName = graderConfig.name as string;
const graderType = typeof graderConfig.type === 'string' ? graderConfig.type : 'script';
const messages = [{ role: 'assistant' as const, content: responseText }];
const trace = buildTraceFromMessages({
input: inputData.input,
Expand Down Expand Up @@ -157,7 +158,7 @@ export async function runCodeGraders(

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify({ name: graderName, type: 'code-grader', score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`,
`${JSON.stringify({ name: graderName, type: graderType, score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`,
'utf8',
);
} catch (error) {
Expand All @@ -167,7 +168,7 @@ export async function runCodeGraders(

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify({ name: graderName, type: 'code-grader', score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`,
`${JSON.stringify({ name: graderName, type: graderType, score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`,
'utf8',
);
}
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import { readFile } from 'node:fs/promises';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join, relative, resolve } from 'node:path';

import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core';
import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';

/** Assertion types that can be graded deterministically without external scripts or LLMs. */
const BUILTIN_ASSERTION_TYPES = new Set([
Expand Down Expand Up @@ -252,15 +252,15 @@ async function writeGraderConfigs(
let hasLlmGraders = false;

for (const assertion of assertions) {
if (assertion.type === 'code-grader') {
if (assertion.type === 'script' || assertion.type === 'code-grader') {
if (!hasCodeGraders) {
await mkdir(codeGradersDir, { recursive: true });
hasCodeGraders = true;
}
const config = assertion as CodeGraderConfig;
const config = assertion as ScriptGraderConfig;
await writeJson(join(codeGradersDir, `${config.name}.json`), {
name: config.name,
type: 'code-grader',
type: 'script',
command: config.command,
cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
weight: config.weight ?? 1.0,
Expand Down
7 changes: 4 additions & 3 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import { tmpdir } from 'node:os';
import { dirname, join, relative, resolve } from 'node:path';

import { deriveCategory, loadTestSuite } from '@agentv/core';
import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core';
import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core';
import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';

import { buildDefaultRunDir } from '../eval/result-layout.js';
Expand Down Expand Up @@ -439,14 +439,15 @@ async function writeGraderConfigs(
let hasLlmGraders = false;

for (const assertion of assertions) {
if (assertion.type === 'code-grader') {
if (assertion.type === 'script' || assertion.type === 'code-grader') {
if (!hasCodeGraders) {
await mkdir(codeGradersDir, { recursive: true });
hasCodeGraders = true;
}
const config = assertion as CodeGraderConfig;
const config = assertion as ScriptGraderConfig;
await writeJson(join(codeGradersDir, `${config.name}.json`), {
name: config.name,
type: 'script',
command: config.command,
cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
weight: config.weight ?? 1.0,
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/test/commands/eval/task-bundle.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ describe('materializeTaskBundle', () => {
graderName: 'quality',
},
{
kind: 'code_grader_command',
kind: 'script_grader_command',
displayPath: scriptPath,
resolvedPath: scriptPath,
graderName: 'quality',
Expand Down
6 changes: 3 additions & 3 deletions apps/web/src/content/docs/docs/evaluation/batch-cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ tests:

assertions:
- name: decision-check
type: code-grader
type: script
command: [bun, run, ./scripts/check-output.ts]
cwd: .

Expand Down Expand Up @@ -82,7 +82,7 @@ tests:

assertions:
- name: decision-check
type: code-grader
type: script
command: [bun, run, ./scripts/check-output.ts]
cwd: .
```
Expand Down Expand Up @@ -141,7 +141,7 @@ AgentV extracts tool calls directly from `output[].tool_calls[]` for `tool_traje

## Grader Implementation

Each test has its own grader that validates the batch runner output. The grader receives the standard `code_grader` input via stdin.
Each test has its own grader that validates the batch runner output. The grader receives the standard `script` grader input via stdin.

**Input (stdin):**
```json
Expand Down
86 changes: 37 additions & 49 deletions apps/web/src/content/docs/docs/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,12 @@ tests:
| Field | Required | Description |
|-------|----------|-------------|
| `id` | Yes | Unique identifier for the test |
| `criteria` | Conditional | Description of what a correct response should contain. Required only when the case has no `expected_output` or `assertions` |
| `input` | Yes | Input sent to the target (string, object, or message array) |
| `expected_output` | No | Expected response for comparison (string, object, or message array) |
| `expected_output` | No | Passive reference response available to graders (string, object, or message array) |
| `assertions` / `assert` | Yes | Per-test graders; plain strings become `g-eval` rubric checks |
| `execution` | No | Per-case grader/default overrides such as `skip_defaults`; target selection belongs in top-level `target` or CLI `--target` |
| `workspace` | No | Per-case workspace config (overrides suite-level) |
| `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts |
| `rubrics` | No | Structured evaluation criteria |
| `assertions` | No | Per-test graders |

## Input

Expand All @@ -41,7 +39,7 @@ The simplest form is a string, which expands to a single user message:
input: What is 15 + 27?
```

Structured object input also expands to a single user message while preserving the object for code graders and batch runners:
Structured object input also expands to a single user message while preserving the object for script graders and batch runners:

```yaml
input:
Expand Down Expand Up @@ -71,8 +69,8 @@ Optional reference response for comparison by graders. Write `expected_output` a
a golden answer or reference response the target could have produced, not as a
rubric or "the agent should..." criteria list. `expected_output` is passive
reference data: it is stored on the case and passed to graders, but it does not
choose a grader by itself when `assertions` is present. Add explicit assertion
strings, `llm-grader`, `code-grader`, `field-accuracy`, or another
choose a grader by itself. Add explicit assertion
strings, `llm-grader`, `script`, `field-accuracy`, or another
reference-aware grader when you want the reference answer evaluated.

A string expands to a single assistant message:
Expand All @@ -98,10 +96,10 @@ eval suites, or tags/filters for target-specific cases.
```yaml
tests:
- id: complex-case
criteria: Provides detailed explanation
input: Explain quicksort algorithm

assertions:
- Provides a detailed explanation
- name: depth_check
type: llm-grader
prompt: ./graders/depth.md
Expand All @@ -117,16 +115,17 @@ assertions:

tests:
- id: normal-case
criteria: Returns correct answer
input: What is 2+2?
assertions:
- Returns the correct answer
# Gets latency_check from root-level assertions

- id: special-case
criteria: Handles edge case
input: Handle this edge case
execution:
skip_defaults: true
assertions:
- Handles the edge case
- name: custom_eval
type: llm-grader
# Does NOT get latency_check
Expand All @@ -144,16 +143,18 @@ workspace:

tests:
- id: case-1
criteria: Should work
input: Do something
assertions:
- Completes the requested task
workspace:
hooks:
before_all:
command: ["bun", "run", "custom-setup.ts"]

- id: case-2
criteria: Should also work
input: Do something else
assertions:
- Completes the requested task
# Inherits suite-level hooks.before_all
```

Expand Down Expand Up @@ -287,17 +288,17 @@ All deterministic assertions support these optional fields:
```yaml
tests:
- id: no-competitors
criteria: Response must not mention any competitor
input: "Describe our product advantages."
assertions:
- Response must not mention any competitor
- type: contains-any
value: ["CompetitorA", "CompetitorB", "CompetitorC"]
negate: true

- id: required-inputs
criteria: Agent asks for missing rule codes
input: "Process customs entry for country BE."
assertions:
- Agent asks for missing rule codes
- name: asks-for-rule-codes
type: icontains-any
value: ["rule code", "rule codes"]
Expand All @@ -311,13 +312,12 @@ Assertion graders auto-generate a `name` when one is not provided (e.g., `contai

### Advanced Rubric Assertions

Use `type: rubrics` with a `criteria` array only when you need weights,
Use `type: g-eval` with a `criteria` array only when you need weights,
required flags, or score ranges:

```yaml
tests:
- id: denied-party
criteria: Must identify denied party
input:
- role: user
content: Screen "Acme Corp" against denied parties list
Expand All @@ -328,7 +328,7 @@ tests:
- type: contains
value: "DENIED"
required: true
- type: rubrics
- type: g-eval
criteria:
- id: accuracy
outcome: Correctly identifies the denied party
Expand All @@ -352,7 +352,7 @@ assertions:
- type: contains
value: "DENIED"
required: true # must pass (>= 0.8)
- type: rubrics
- type: g-eval
required: true
min_score: 0.6 # must score at least 0.6
criteria:
Expand All @@ -373,24 +373,22 @@ Required gates are evaluated after all graders run. If any required grader falls

## How Reference Fields and `assertions` Interact

The `criteria` and `expected_output` fields are **data fields** that describe what the
response should accomplish. They are not graders themselves — how they get used depends
on whether `assertions` is present.
`expected_output` is reference data, not a grader. It is stored on the case and
provided to graders that know how to use it, but it does not create an LLM
grading call by itself. Put the grading contract in `assertions` or `assert`.

### No `assertions` — implicit LLM grader

When a test has no `assertions` field, a default `llm-grader` grader runs automatically
and uses the case context, including `criteria` and `expected_output` when present:
Plain assertion strings are the default shape for semantic checks:

```yaml
tests:
- id: simple-eval
criteria: Assistant correctly explains the bug and proposes a fix
input: "Debug this function..."
# No assertions → default llm-grader evaluates against criteria
assertions:
- Assistant correctly explains the bug and proposes a fix
```

Suite-level `preprocessors` also apply to this implicit grader. That matters when the agent output is a `ContentFile` block rather than plain text:
Suite-level `preprocessors` apply to explicit LLM graders. That matters when the
agent output is a `ContentFile` block rather than plain text:

```yaml
preprocessors:
Expand All @@ -399,16 +397,15 @@ preprocessors:

tests:
- id: spreadsheet-eval
criteria: Output includes the revenue rows
input: Generate the spreadsheet report
assertions:
- Output includes the revenue rows
```

### `assertions` present — explicit graders only

When `assertions` is defined, only the declared graders run. No implicit grader is added
because `criteria` or `expected_output` exists. Graders that are declared (such as
plain rubric strings, `llm-grader`, `code-grader`, or `rubrics`) receive the case
context, including `criteria` and `expected_output`, as input automatically.
When `assertions` is defined, only the declared graders run. No implicit grader is
added because `expected_output` exists. Declared graders such as plain rubric
strings, `llm-grader`, `script`, or `g-eval` receive the case context, including
`expected_output`, as input automatically.

This means a case with `expected_output` and only deterministic assertions evaluates only
those deterministic assertions:
Expand All @@ -424,7 +421,7 @@ tests:
```

For contract-style evals where assertion strings express every semantic check,
omit `criteria`:
keep those checks in `assertions`:

```yaml
tests:
Expand All @@ -440,21 +437,11 @@ tests:
- The answer avoids preserving one-off observations as durable guidance.
```

If `assertions` contains only deterministic graders (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted:

```
Warning: Test 'my-test': criteria is defined but no grader in assertions
will evaluate it. Add a rubric assertion string or another grader to assertions,
or remove criteria if it is documentation-only.
```

To use `criteria` alongside deterministic checks, add a rubric assertion string
or another grader explicitly:
To combine deterministic checks with semantic checks, add both explicitly:

```yaml
tests:
- id: mixed-eval
criteria: Response is helpful and mentions the fix
input: "Debug this function..."
assertions:
- Explains why the bug happens
Expand All @@ -471,9 +458,9 @@ preprocessors:

tests:
- id: mixed-eval
criteria: Response is helpful and mentions the fix
input: "Debug this function..."
assertions:
- Response is helpful and mentions the fix
- type: llm-grader # use explicit form for custom preprocessors
preprocessors:
- type: xlsx
Expand All @@ -489,11 +476,12 @@ Pass additional context through the `metadata` field:
```yaml
tests:
- id: code-gen
criteria: Generates valid Python
metadata:
language: python
difficulty: medium
input: Write a function to sort a list
assertions:
- Generates valid Python
```

`metadata` is passed to workspace lifecycle hooks as `case_metadata`, preserved
Expand Down
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/docs/evaluation/eval-files.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ tests:
```

`assertions` supports rubric shorthand strings, deterministic assertion types
(`contains`, `regex`, `is_json`, `equals`), `rubrics`, LLM graders, and code
(`contains`, `regex`, `is_json`, `equals`), `g-eval`, LLM graders, and script
graders. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for
per-test assertions usage.

Expand Down
Loading
Loading