diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts index 9aec62e11..7e86e6872 100644 --- a/apps/cli/src/commands/eval/task-bundle.ts +++ b/apps/cli/src/commands/eval/task-bundle.ts @@ -298,7 +298,7 @@ async function copyDirectory(sourcePath: string, destinationPath: string): Promi } function shouldCopyDirectory(reference: BundleSourceReference): boolean { - if (reference.kind !== 'code_grader_cwd') { + if (reference.kind !== 'script_grader_cwd' && reference.kind !== 'code_grader_cwd') { return true; } return !path.isAbsolute(reference.displayPath); diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 738924275..ef4eb3773 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -107,6 +107,7 @@ export async function runCodeGraders( const executeCodeGrader = async (graderConfig: Record, task: GraderTask) => { const { testId, resultsDir, responseText, inputData } = task; const graderName = graderConfig.name as string; + const graderType = typeof graderConfig.type === 'string' ? graderConfig.type : 'script'; const messages = [{ role: 'assistant' as const, content: responseText }]; const trace = buildTraceFromMessages({ input: inputData.input, @@ -157,7 +158,7 @@ export async function runCodeGraders( await writeFile( join(resultsDir, `${graderName}.json`), - `${JSON.stringify({ name: graderName, type: 'code-grader', score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`, + `${JSON.stringify({ name: graderName, type: graderType, score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`, 'utf8', ); } catch (error) { @@ -167,7 +168,7 @@ export async function runCodeGraders( await writeFile( join(resultsDir, `${graderName}.json`), - `${JSON.stringify({ name: graderName, type: 'code-grader', score: 0, weight: graderConfig.weight ?? 
1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`, + `${JSON.stringify({ name: graderName, type: graderType, score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`, 'utf8', ); } diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 72ee29955..efd07824c 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -22,7 +22,7 @@ import { readFile } from 'node:fs/promises'; import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join, relative, resolve } from 'node:path'; -import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core'; +import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core'; /** Assertion types that can be graded deterministically without external scripts or LLMs. */ const BUILTIN_ASSERTION_TYPES = new Set([ @@ -252,15 +252,15 @@ async function writeGraderConfigs( let hasLlmGraders = false; for (const assertion of assertions) { - if (assertion.type === 'code-grader') { + if (assertion.type === 'script' || assertion.type === 'code-grader') { if (!hasCodeGraders) { await mkdir(codeGradersDir, { recursive: true }); hasCodeGraders = true; } - const config = assertion as CodeGraderConfig; + const config = assertion as ScriptGraderConfig; await writeJson(join(codeGradersDir, `${config.name}.json`), { name: config.name, - type: 'code-grader', + type: 'script', command: config.command, cwd: config.resolvedCwd ?? config.cwd ?? evalDir, weight: config.weight ?? 
1.0, diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 99672c5bf..8437f732d 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -18,7 +18,7 @@ import { tmpdir } from 'node:os'; import { dirname, join, relative, resolve } from 'node:path'; import { deriveCategory, loadTestSuite } from '@agentv/core'; -import type { CodeGraderConfig, GraderConfig, LlmGraderConfig } from '@agentv/core'; +import type { GraderConfig, LlmGraderConfig, ScriptGraderConfig } from '@agentv/core'; import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts'; import { buildDefaultRunDir } from '../eval/result-layout.js'; @@ -439,14 +439,15 @@ async function writeGraderConfigs( let hasLlmGraders = false; for (const assertion of assertions) { - if (assertion.type === 'code-grader') { + if (assertion.type === 'script' || assertion.type === 'code-grader') { if (!hasCodeGraders) { await mkdir(codeGradersDir, { recursive: true }); hasCodeGraders = true; } - const config = assertion as CodeGraderConfig; + const config = assertion as ScriptGraderConfig; await writeJson(join(codeGradersDir, `${config.name}.json`), { name: config.name, + type: 'script', command: config.command, cwd: config.resolvedCwd ?? config.cwd ?? evalDir, weight: config.weight ?? 
1.0, diff --git a/apps/cli/test/commands/eval/task-bundle.test.ts b/apps/cli/test/commands/eval/task-bundle.test.ts index c44b34988..9d7aeb4be 100644 --- a/apps/cli/test/commands/eval/task-bundle.test.ts +++ b/apps/cli/test/commands/eval/task-bundle.test.ts @@ -72,7 +72,7 @@ describe('materializeTaskBundle', () => { graderName: 'quality', }, { - kind: 'code_grader_command', + kind: 'script_grader_command', displayPath: scriptPath, resolvedPath: scriptPath, graderName: 'quality', diff --git a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx index 80c20f7a0..1e2d66724 100644 --- a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx +++ b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx @@ -54,7 +54,7 @@ tests: assertions: - name: decision-check - type: code-grader + type: script command: [bun, run, ./scripts/check-output.ts] cwd: . @@ -82,7 +82,7 @@ tests: assertions: - name: decision-check - type: code-grader + type: script command: [bun, run, ./scripts/check-output.ts] cwd: . ``` @@ -141,7 +141,7 @@ AgentV extracts tool calls directly from `output[].tool_calls[]` for `tool_traje ## Grader Implementation -Each test has its own grader that validates the batch runner output. The grader receives the standard `code_grader` input via stdin. +Each test has its own grader that validates the batch runner output. The grader receives the standard `script` input via stdin. 
**Input (stdin):** ```json diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx index 4431a1a57..baf8040b9 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx @@ -24,14 +24,12 @@ tests: | Field | Required | Description | |-------|----------|-------------| | `id` | Yes | Unique identifier for the test | -| `criteria` | Conditional | Description of what a correct response should contain. Required only when the case has no `expected_output` or `assertions` | | `input` | Yes | Input sent to the target (string, object, or message array) | -| `expected_output` | No | Expected response for comparison (string, object, or message array) | +| `expected_output` | No | Passive reference response available to graders (string, object, or message array) | +| `assertions` / `assert` | Yes | Per-test graders; plain strings become `g-eval` rubric checks | | `execution` | No | Per-case grader/default overrides such as `skip_defaults`; target selection belongs in top-level `target` or CLI `--target` | | `workspace` | No | Per-case workspace config (overrides suite-level) | | `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts | -| `rubrics` | No | Structured evaluation criteria | -| `assertions` | No | Per-test graders | ## Input @@ -41,7 +39,7 @@ The simplest form is a string, which expands to a single user message: input: What is 15 + 27? ``` -Structured object input also expands to a single user message while preserving the object for code graders and batch runners: +Structured object input also expands to a single user message while preserving the object for script graders and batch runners: ```yaml input: @@ -71,8 +69,8 @@ Optional reference response for comparison by graders. 
Write `expected_output` as a golden answer or reference response the target could have produced, not as a rubric or "the agent should..." criteria list. `expected_output` is passive reference data: it is stored on the case and passed to graders, but it does not -choose a grader by itself when `assertions` is present. Add explicit assertion -strings, `llm-grader`, `code-grader`, `field-accuracy`, or another +choose a grader by itself. Add explicit assertion +strings, `llm-grader`, `script`, `field-accuracy`, or another reference-aware grader when you want the reference answer evaluated. A string expands to a single assistant message: @@ -98,10 +96,10 @@ eval suites, or tags/filters for target-specific cases. ```yaml tests: - id: complex-case - criteria: Provides detailed explanation input: Explain quicksort algorithm assertions: + - Provides a detailed explanation - name: depth_check type: llm-grader prompt: ./graders/depth.md @@ -117,16 +115,17 @@ assertions: tests: - id: normal-case - criteria: Returns correct answer input: What is 2+2? + assertions: + - Returns the correct answer # Gets latency_check from root-level assertions - id: special-case - criteria: Handles edge case input: Handle this edge case execution: skip_defaults: true assertions: + - Handles the edge case - name: custom_eval type: llm-grader # Does NOT get latency_check @@ -144,16 +143,18 @@ workspace: tests: - id: case-1 - criteria: Should work input: Do something + assertions: + - Completes the requested task workspace: hooks: before_all: command: ["bun", "run", "custom-setup.ts"] - id: case-2 - criteria: Should also work input: Do something else + assertions: + - Completes the requested task # Inherits suite-level hooks.before_all ``` @@ -287,17 +288,17 @@ All deterministic assertions support these optional fields: ```yaml tests: - id: no-competitors - criteria: Response must not mention any competitor input: "Describe our product advantages." 
assertions: + - Response must not mention any competitor - type: contains-any value: ["CompetitorA", "CompetitorB", "CompetitorC"] negate: true - id: required-inputs - criteria: Agent asks for missing rule codes input: "Process customs entry for country BE." assertions: + - Agent asks for missing rule codes - name: asks-for-rule-codes type: icontains-any value: ["rule code", "rule codes"] @@ -311,13 +312,12 @@ Assertion graders auto-generate a `name` when one is not provided (e.g., `contai ### Advanced Rubric Assertions -Use `type: rubrics` with a `criteria` array only when you need weights, +Use `type: g-eval` with a `criteria` array only when you need weights, required flags, or score ranges: ```yaml tests: - id: denied-party - criteria: Must identify denied party input: - role: user content: Screen "Acme Corp" against denied parties list @@ -328,7 +328,7 @@ tests: - type: contains value: "DENIED" required: true - - type: rubrics + - type: g-eval criteria: - id: accuracy outcome: Correctly identifies the denied party @@ -352,7 +352,7 @@ assertions: - type: contains value: "DENIED" required: true # must pass (>= 0.8) - - type: rubrics + - type: g-eval required: true min_score: 0.6 # must score at least 0.6 criteria: @@ -373,24 +373,22 @@ Required gates are evaluated after all graders run. If any required grader falls ## How Reference Fields and `assertions` Interact -The `criteria` and `expected_output` fields are **data fields** that describe what the -response should accomplish. They are not graders themselves — how they get used depends -on whether `assertions` is present. +`expected_output` is reference data, not a grader. It is stored on the case and +provided to graders that know how to use it, but it does not create an LLM +grading call by itself. Put the grading contract in `assertions` or `assert`. 
-### No `assertions` — implicit LLM grader - -When a test has no `assertions` field, a default `llm-grader` grader runs automatically -and uses the case context, including `criteria` and `expected_output` when present: +Plain assertion strings are the default shape for semantic checks: ```yaml tests: - id: simple-eval - criteria: Assistant correctly explains the bug and proposes a fix input: "Debug this function..." - # No assertions → default llm-grader evaluates against criteria + assertions: + - Assistant correctly explains the bug and proposes a fix ``` -Suite-level `preprocessors` also apply to this implicit grader. That matters when the agent output is a `ContentFile` block rather than plain text: +Suite-level `preprocessors` apply to explicit LLM graders. That matters when the +agent output is a `ContentFile` block rather than plain text: ```yaml preprocessors: @@ -399,16 +397,15 @@ preprocessors: tests: - id: spreadsheet-eval - criteria: Output includes the revenue rows input: Generate the spreadsheet report + assertions: + - Output includes the revenue rows ``` -### `assertions` present — explicit graders only - -When `assertions` is defined, only the declared graders run. No implicit grader is added -because `criteria` or `expected_output` exists. Graders that are declared (such as -plain rubric strings, `llm-grader`, `code-grader`, or `rubrics`) receive the case -context, including `criteria` and `expected_output`, as input automatically. +When `assertions` is defined, only the declared graders run. No implicit grader is +added because `expected_output` exists. Declared graders such as plain rubric +strings, `llm-grader`, `script`, or `g-eval` receive the case context, including +`expected_output`, as input automatically. 
This means a case with `expected_output` and only deterministic assertions evaluates only those deterministic assertions: @@ -424,7 +421,7 @@ tests: ``` For contract-style evals where assertion strings express every semantic check, -omit `criteria`: +keep those checks in `assertions`: ```yaml tests: @@ -440,21 +437,11 @@ tests: - The answer avoids preserving one-off observations as durable guidance. ``` -If `assertions` contains only deterministic graders (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: - -``` -Warning: Test 'my-test': criteria is defined but no grader in assertions -will evaluate it. Add a rubric assertion string or another grader to assertions, -or remove criteria if it is documentation-only. -``` - -To use `criteria` alongside deterministic checks, add a rubric assertion string -or another grader explicitly: +To combine deterministic checks with semantic checks, add both explicitly: ```yaml tests: - id: mixed-eval - criteria: Response is helpful and mentions the fix input: "Debug this function..." assertions: - Explains why the bug happens @@ -471,9 +458,9 @@ preprocessors: tests: - id: mixed-eval - criteria: Response is helpful and mentions the fix input: "Debug this function..." 
assertions: + - Response is helpful and mentions the fix - type: llm-grader # use explicit form for custom preprocessors preprocessors: - type: xlsx @@ -489,11 +476,12 @@ Pass additional context through the `metadata` field: ```yaml tests: - id: code-gen - criteria: Generates valid Python metadata: language: python difficulty: medium input: Write a function to sort a list + assertions: + - Generates valid Python ``` `metadata` is passed to workspace lifecycle hooks as `case_metadata`, preserved diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index c53d61e4f..88a3bfc45 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -239,7 +239,7 @@ tests: ``` `assertions` supports rubric shorthand strings, deterministic assertion types -(`contains`, `regex`, `is_json`, `equals`), `rubrics`, LLM graders, and code +(`contains`, `regex`, `is_json`, `equals`), `g-eval`, LLM graders, and code graders. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for per-test assertions usage. diff --git a/apps/web/src/content/docs/docs/evaluation/examples.mdx b/apps/web/src/content/docs/docs/evaluation/examples.mdx index 9d69a737c..3d19bfea7 100644 --- a/apps/web/src/content/docs/docs/evaluation/examples.mdx +++ b/apps/web/src/content/docs/docs/evaluation/examples.mdx @@ -69,7 +69,7 @@ tests: ## Multi-Grader -Combine a code grader and an LLM grader on the same test: +Combine a script grader and an LLM grader on the same test: ```yaml description: JSON generation with validation @@ -81,7 +81,7 @@ tests: assertions: - name: json_format_validator - type: code-grader + type: script command: [uv, run, validate_json.py] cwd: ./graders - name: content_evaluator @@ -310,7 +310,7 @@ tests: assertions: - name: decision-check - type: code-grader + type: script command: [bun, run, ./scripts/check-batch-cli-output.ts] cwd: . 
@@ -343,7 +343,7 @@ tests: assertions: - name: decision-check - type: code-grader + type: script command: [bun, run, ./scripts/check-batch-cli-output.ts] cwd: . ``` diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index 2fb186e6f..acebb043f 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -22,11 +22,11 @@ tests: - States time complexity ``` -All strings are collected into a single rubrics grader automatically. +All strings are collected into a single g-eval grader automatically. ### Full form for advanced options -Use `type: rubrics` explicitly when you need weights, required flags, or score ranges: +Use `type: g-eval` explicitly when you need weights, required flags, or score ranges: ```yaml tests: @@ -34,7 +34,7 @@ tests: criteria: Explain how quicksort works input: Explain quicksort algorithm assertions: - - type: rubrics + - type: g-eval criteria: - Mentions divide-and-conquer approach - Explains partition step @@ -47,7 +47,7 @@ For fine-grained control, use rubric objects with weights and requirements: ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - id: core-concept outcome: Explains divide-and-conquer @@ -74,7 +74,7 @@ assertions: | `score_ranges` | — | Score range definitions (analytic mode) | :::note -Use `min_score` for analytic rubric gating. The only 0–10 values in authored rubrics are `score_ranges` bands and grader outputs. +Use `min_score` for analytic rubric gating. The only 0–10 values in authored g-eval graders are `score_ranges` bands and grader outputs. 
::: ### Criterion Operators @@ -83,7 +83,7 @@ Use `operator` when the criterion outcome should be interpreted with a specific ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - id: supported-revenue operator: correctness @@ -103,7 +103,7 @@ For quality gradients instead of binary pass/fail, use score ranges: ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - id: accuracy outcome: Provides correct answer @@ -170,12 +170,12 @@ tests: criteria: Generates correct, clean Python code input: Write a fibonacci function assertions: - - type: rubrics + - type: g-eval criteria: - Returns correct values for n=0,1,2,10 - Uses meaningful variable names - Includes docstring - name: syntax_check - type: code-grader + type: script command: [./validators/check_python.py] ``` diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b13595cfd..1b5b9caad 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -42,7 +42,7 @@ Each `scores[]` entry includes per-grader timing: } ``` -The `duration_ms`, `started_at`, and `ended_at` fields are present on every grader result (including `code-grader`), enabling per-grader bottleneck analysis. +The `duration_ms`, `started_at`, and `ended_at` fields are present on every grader result (including `script`), enabling per-grader bottleneck analysis. ## Common Options @@ -419,7 +419,7 @@ fixture so AgentV still runs graders against real or frozen candidate output. ## Run a Single Assertion -Run a code-grader assertion in isolation without executing a full eval suite: +Run a script assertion in isolation without executing a full eval suite: ```bash agentv eval assert --agent-output --agent-input @@ -441,7 +441,7 @@ The `--file` option reads a JSON file with `{ "output": "...", "input": "..." 
}` **Exit codes:** 0 if score >= 0.5 (pass), 1 if score < 0.5 (fail). -This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `assertions` instructions for code graders so external grading agents can execute them directly. +This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `assertions` instructions for script graders so external grading agents can execute them directly. ## Offline Grading diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx index 10698004b..4b2f75222 100644 --- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx @@ -9,7 +9,7 @@ YAML remains AgentV's canonical, portable eval format. The SDK surfaces below ar AgentV currently provides two npm packages for programmatic use: -- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and code graders +- **`@agentv/sdk`** — user-facing SDK for `evaluate()`, YAML-aligned eval authoring, custom assertions, and script graders - **`@agentv/core`** — core implementation package and typed configuration ## Installation @@ -140,7 +140,7 @@ export default defineEval({ graders.exact('{"message":"Hello"}', { name: 'exact-json', minScore: 1 }), graders.regex(/"message"\s*:/, { name: 'message-key' }), graders.json({ name: 'valid-json', required: true }), - graders.rubrics(['Greets the user'], { name: 'rubric-review' }), + graders.gEval(['Greets the user'], { name: 'rubric-review' }), graders.llmGrader({ name: 'llm-review', prompt: 'Grade whether the answer is useful.', @@ -153,7 +153,7 @@ export default defineEval({ }); ``` -The catalog covers `contains`, `equals`/`exact`, `regex`, `is-json`/`json`, `rubrics`, `llm-grader`, and `code-grader`. 
CamelCase SDK options such as `minScore`, `maxSteps`, and rubric `scoreRanges` lower to `min_score`, `max_steps`, and `score_ranges` when AgentV loads or serializes the suite. +The catalog covers `contains`, `equals`/`exact`, `regex`, `is-json`/`json`, `g-eval`, `llm-grader`, and `script`. CamelCase SDK options such as `minScore`, `maxSteps`, and rubric `scoreRanges` lower to `min_score`, `max_steps`, and `score_ranges` when AgentV loads or serializes the suite. ## AgentV-Native Helper Factories @@ -262,7 +262,7 @@ assertions: value: "Hello" ``` -## Code Graders +## Script Graders Use `defineCodeGrader` from `@agentv/sdk` for full control over scoring with an explicit assertions array: @@ -294,7 +294,7 @@ it('links to the dashboard', () => { ```yaml assertions: - name: vitest-welcome-banner - type: code-grader + type: script command: [agentv, eval, graders/welcome-banner.test.ts] ``` @@ -311,9 +311,9 @@ export default defineWorkspaceGrader(async ({ workspace }) => [ ]); ``` -`defineCodeGrader`, `defineVitestWorkspaceGrader`, and `defineWorkspaceGrader` custom scripts are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. Plain Vitest verifier files can use `command: [agentv, eval, graders/check.test.ts]` without a custom wrapper; use `agentv eval vitest` when you need adapter flags. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name. +`defineCodeGrader`, `defineVitestWorkspaceGrader`, and `defineWorkspaceGrader` custom scripts are referenced in YAML with `type: script` and `command: [bun, run, grader.ts]`. Plain Vitest verifier files can use `command: [agentv, eval, graders/check.test.ts]` without a custom wrapper; use `agentv eval vitest` when you need adapter flags. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name. 
-For detailed patterns, input/output contracts, and language-agnostic examples, see [Code Graders](/docs/graders/code-graders/). +For detailed patterns, input/output contracts, and language-agnostic examples, see [Script Graders](/docs/graders/code-graders/). ## Wire Format vs SDK Format diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx index b844759d4..59fefede3 100644 --- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx @@ -55,7 +55,7 @@ tests: assertions: - name: math_check - type: code-grader + type: script command: [./validators/check_math.py] ``` diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx index 4c775491a..ea328cc84 100644 --- a/apps/web/src/content/docs/docs/graders/code-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx @@ -1,22 +1,21 @@ --- -title: Code Graders -description: Deterministic code graders in Python or TypeScript +title: Script Graders +description: Deterministic script graders in Python or TypeScript sidebar: order: 1 --- -Code graders are scripts that evaluate agent responses deterministically. Write them in any language — Python, TypeScript, Node, or any executable. +Script graders are scripts that evaluate agent responses deterministically. Write them in any language — Python, TypeScript, Node, or any executable. ## Contract -Code graders receive eval context via stdin JSON and return a result via stdout. +Script graders receive eval context via stdin JSON and return a result via stdout. **Input (stdin, raw wire format):** ```json { "input": [{ "role": "user", "content": "What is 15 + 27?" 
}], "input_files": [], - "criteria": "Correctly calculates 15 + 27 = 42", "output": "The answer is 42.", "expected_output": [{ "role": "assistant", "content": "42" }], "messages": [{ "role": "assistant", "content": "The answer is 42." }], @@ -86,7 +85,7 @@ fi ```yaml assertions: - - type: code-grader + - type: script command: [bash, scripts/check-pages.sh] ``` @@ -94,7 +93,7 @@ Silent one-liners work too — stdout is optional: ```yaml assertions: - - type: code-grader + - type: script command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"] ``` @@ -129,7 +128,7 @@ print(json.dumps({ The repo-local helper in `examples/features/sdk-python/` wraps the same contract for that example checkout: ```python -from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader +from agentv_py.grader import Assertion, CodeGraderResult, define_script def evaluate(context): @@ -146,7 +145,7 @@ def evaluate(context): ) if __name__ == "__main__": - define_code_grader(evaluate) + define_script(evaluate) ``` Deprecated wire aliases like `output_text`, `input_text`, `reference_answer`, and `expected_output_text` are not accepted by the Python helper. @@ -181,7 +180,7 @@ console.log(JSON.stringify({ ```yaml assertions: - name: my_validator - type: code-grader + type: script command: [./validators/check_answer.py] ``` @@ -238,12 +237,12 @@ describe('welcome banner', () => { }); ``` -Then use AgentV's built-in Vitest adapter as the `code-grader` command. The adapter copies verifier files into a temporary workspace-local path when needed, runs Vitest in `workspace_path`, reads the JSON reporter output, and maps each test outcome to an AgentV assertion: +Then use AgentV's built-in Vitest adapter as the `script` command. 
The adapter copies verifier files into a temporary workspace-local path when needed, runs Vitest in `workspace_path`, reads the JSON reporter output, and maps each test outcome to an AgentV assertion: ```yaml assertions: - name: vitest-welcome-banner - type: code-grader + type: script command: [agentv, eval, graders/welcome-banner.test.ts] ``` @@ -271,7 +270,7 @@ Prefer Vitest verifiers when the checks naturally fit `expect(...)`. Use `define ## Target Access -Code graders can call an LLM through a target proxy for metrics that require multiple LLM calls (contextual precision, semantic similarity, etc.). +Script graders can call an LLM through a target proxy for metrics that require multiple LLM calls (contextual precision, semantic similarity, etc.). ### Configuration @@ -280,7 +279,7 @@ Add a `target` block to the grader config: ```yaml assertions: - name: contextual-precision - type: code-grader + type: script command: [bun, scripts/contextual-precision.ts] target: max_calls: 10 # Default: 50 @@ -324,7 +323,7 @@ Use `target.invokeBatch(requests)` for multiple calls in parallel. ## Advanced Input Fields -Beyond the basic fields (`input`, `output`, `expected_output`, `criteria`), code graders receive additional structured context: +Beyond the basic fields (`input`, `output`, `expected_output`), script graders receive additional structured context: | Field | Type | Description | |-------|------|-------------| @@ -366,7 +365,7 @@ Use `expected_output` for reference answers and `output` for the actual final an ## Workspace Access -When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.repos`, or lifecycle hooks), code graders receive the prepared workspace path in two ways: +When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.repos`, or lifecycle hooks), script graders receive the prepared workspace path in two ways: 1. **JSON payload**: `workspace_path` field in the stdin input 2. 
**Environment variable**: `AGENTV_WORKSPACE_PATH` @@ -426,11 +425,11 @@ target: my_agent tests: - id: implement-feature - criteria: Agent implements the feature correctly input: "Implement the TODO functions in src/index.ts" assertions: + - Agent implements the feature correctly - name: functional-check - type: code-grader + type: script command: [bun, scripts/functional-check.ts] ``` @@ -465,7 +464,7 @@ The command: 3. Prints the grader's JSON result to stdout 4. Exits 0 if score >= 0.5, exit 1 otherwise -This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `agentv eval assert` instructions for code graders so external grading agents can run them directly. +This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `agentv eval assert` instructions for script graders so external grading agents can run them directly. ### With stdin pipe diff --git a/apps/web/src/content/docs/docs/graders/composite.mdx b/apps/web/src/content/docs/docs/graders/composite.mdx index a755c7367..ac92bce6d 100644 --- a/apps/web/src/content/docs/docs/graders/composite.mdx +++ b/apps/web/src/content/docs/docs/graders/composite.mdx @@ -20,7 +20,7 @@ assertions: type: llm-grader prompt: ./prompts/check1.md - name: evaluator_2 - type: code-grader + type: script command: [uv, run, check2.py] aggregator: type: weighted_average @@ -32,7 +32,7 @@ assertions: Each sub-grader runs independently, then the aggregator combines their results. Use `assertions` for composite members. `graders` is still accepted for backward compatibility. -If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across graders. Use `composite` when you need a custom aggregation strategy (`threshold`, `code-grader`, `llm-grader`) or nested grader groups. +If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across graders. 
Use `composite` when you need a custom aggregation strategy (`threshold`, `script`, `llm-grader`) or nested grader groups. ## Aggregator Types @@ -100,7 +100,7 @@ Because this is an average, the final score is the fraction of passing children ### OR Logic (Strict) -For a strict OR, add a custom code-grader aggregator and return `1.0` when any child score passes. +For a strict OR, add a custom script aggregator and return `1.0` when any child score passes. Composite aggregator execution accepts either a direct script path or a shell command. The `bun run` form is the recommended pattern: @@ -110,7 +110,7 @@ assertions: - name: strict_or type: composite aggregator: - type: code-grader + type: script path: bun run ../scripts/or-aggregator.js assertions: - name: mentions-paris @@ -138,13 +138,13 @@ console.log( ); ``` -### Code Grader Aggregator +### Script Grader Aggregator Run a custom command to decide the final score based on all grader results: ```yaml aggregator: - type: code-grader + type: script path: bun run ./scripts/safety-gate.js cwd: ./graders # optional working directory ``` @@ -187,7 +187,7 @@ Inside the prompt file, use the `{{EVALUATOR_RESULTS_JSON}}` variable to inject ### Safety Gate -Block outputs that fail safety even if quality is high. A code grader aggregator can enforce hard gates: +Block outputs that fail safety even if quality is high. A script grader aggregator can enforce hard gates: ```yaml tests: @@ -207,7 +207,7 @@ tests: type: llm-grader prompt: ./prompts/quality-check.md aggregator: - type: code-grader + type: script path: ./scripts/safety-gate.js ``` @@ -225,7 +225,7 @@ Assign different importance to each evaluation dimension: type: llm-grader prompt: ./prompts/correctness.md - name: style - type: code-grader + type: script command: [uv, run, style_checker.py] - name: security type: llm-grader @@ -313,7 +313,7 @@ Assertions from sub-graders are prefixed with the grader name (e.g., `[safety]`) ## Best Practices 1. 
**Name graders clearly** -- names appear in results and debugging output, so use descriptive labels like `safety` or `correctness` rather than `eval_1`. -2. **Use safety gates for critical checks** -- do not let high quality scores override safety failures. A code grader aggregator can enforce hard gates. +2. **Use safety gates for critical checks** -- do not let high quality scores override safety failures. A script grader aggregator can enforce hard gates. 3. **Balance weights thoughtfully** -- consider which aspects matter most for your use case and assign weights accordingly. 4. **Keep nesting shallow** -- deep nesting makes debugging harder. Two levels of composites is usually sufficient. 5. **Test aggregators independently** -- verify custom aggregation logic with unit tests before wiring it into a composite grader. diff --git a/apps/web/src/content/docs/docs/graders/custom-assertions.mdx b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx index bd453db76..11061a152 100644 --- a/apps/web/src/content/docs/docs/graders/custom-assertions.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx @@ -14,11 +14,11 @@ AgentV provides two SDK functions for custom evaluation logic: | Function | Best For | Discovery | |----------|----------|-----------| | `defineAssertion()` | Pass/fail checks, reusable assertion types | Convention-based (`.agentv/assertions/`) | -| `defineCodeGrader()` | Full scoring control with explicit assertions array | Referenced via `type: code-grader` + `command:` | +| `defineCodeGrader()` | Full scoring control with explicit assertions array | Referenced via `type: script` + `command:` | **Use `defineAssertion()`** when you want a named assertion type that can be referenced across eval files without specifying a command path. It uses a simplified result contract focused on `pass` and optional `score`. 
-**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the grader is a one-off grader tied to a specific eval. See [Code Graders](/docs/graders/code-graders/) for details. +**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the grader is a one-off grader tied to a specific eval. See [Script Graders](/docs/graders/code-graders/) for details. Both functions handle stdin/stdout JSON parsing, snake_case-to-camelCase conversion, Zod validation, and error handling automatically. @@ -111,7 +111,7 @@ The handler must return an `AssertionScore` object: ## Context Available to Assertions -The handler receives an `AssertionContext` with the same fields as a code grader: +The handler receives an `AssertionContext` with the same fields as a script grader: | Field | Type | Description | |-------|------|-------------| @@ -224,19 +224,19 @@ target: default tests: - id: greeting-response - criteria: Agent gives a multi-word greeting input: "Say hello and introduce yourself" expected_output: "Hello! I'm an AI assistant here to help you." assertions: + - Agent gives a multi-word greeting - type: contains value: "Hello" - type: word-count - id: short-answer - criteria: Agent gives a short but valid response input: "What is 2+2?" expected_output: "The answer is 4." 
assertions: + - Agent gives a short but valid response - type: contains value: "4" - type: word-count diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx index 568d5b989..ebac41f1a 100644 --- a/apps/web/src/content/docs/docs/graders/custom-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx @@ -11,9 +11,9 @@ AgentV supports multiple grader types that can be combined for comprehensive eva | Type | Description | Use Case | |------|-------------|----------| -| `code_grader` | Deterministic command (Python/TS/any) | Exact matching, format validation, programmatic checks | -| `llm_grader` | LLM-based evaluation with custom prompt | Semantic evaluation, nuance, subjective quality | -| `rubrics` | Structured rubric grader via `assertions` | Multi-criterion grading with weights | +| `script` | Deterministic command (Python/TS/any) | Exact matching, format validation, programmatic checks | +| `llm-grader` | LLM-based evaluation with custom prompt | Semantic evaluation, nuance, subjective quality | +| `g-eval` | Structured rubric grader via `assertions` | Multi-criterion grading with weights | ## Referencing Graders @@ -39,11 +39,11 @@ tests: ```yaml tests: - id: test-1 - criteria: Returns valid JSON input: Generate a JSON config assertions: + - Returns valid JSON - name: json_check - type: code-grader + type: script command: [./validators/check_json.py] ``` @@ -54,14 +54,13 @@ Use multiple graders on the same case for comprehensive scoring: ```yaml tests: - id: code-generation - criteria: Generates correct Python code input: Write a sorting function assertions: - Code is syntactically valid - Handles edge cases such as empty lists and single-element lists - Uses an appropriate algorithm - name: syntax_check - type: code-grader + type: script command: [./validators/check_syntax.py] - name: quality_review type: llm-grader @@ -82,8 +81,8 @@ If any grader has `required: true` 
and scores below its required threshold, the ## Best Practices - **Use plain assertion strings first for semantic checks** — AgentV treats them as rubric criteria -- **Use code graders for deterministic checks** — exact value matching, format validation, schema compliance +- **Use script graders for deterministic checks** — exact value matching, format validation, schema compliance - **Use LLM graders for semantic evaluation** — meaning, quality, helpfulness -- **Use rubrics for structured multi-criteria grading** — when you need weighted, itemized scoring +- **Use `g-eval` for structured multi-criteria grading** — when you need weighted, itemized scoring - **Combine grader types** for comprehensive coverage -- **Test code graders locally** before running full evaluations +- **Test script graders locally** before running full evaluations diff --git a/apps/web/src/content/docs/docs/graders/execution-metrics.mdx b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx index 7b12d3f69..e3abb1c3f 100644 --- a/apps/web/src/content/docs/docs/graders/execution-metrics.mdx +++ b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx @@ -112,7 +112,7 @@ Fails if total token usage exceeds the threshold. |----------|----------------------| | Check multiple metrics at once | `execution_metrics` | | Simple single-threshold check | `latency`, `cost`, or `token_usage` | -| Complex custom formulas | `code_grader` with custom command | +| Complex custom formulas | `script` with custom command | ## Combining with Other Graders diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index b828d240f..7f7d72d17 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -7,19 +7,22 @@ sidebar: LLM graders use a language model to evaluate agent responses against custom criteria defined in a prompt file. 
-## Default Grader +## Explicit LLM Graders -When a test defines `criteria` but has **no `assertions` field**, a default `llm-grader` runs automatically. The built-in prompt evaluates the response against your `criteria` and `expected_output`: +Put semantic grading requirements in `assertions` or `assert`. Plain strings are +handled by the built-in `g-eval` rubric grader. Use `type: llm-grader` when you +need a custom prompt, target, or grader-specific preprocessing: ```yaml tests: - id: simple-eval - criteria: Correctly explains the bug and proposes a fix input: "Debug this function..." - # No assertions needed — default llm-grader evaluates against criteria + assertions: + - Correctly explains the bug and proposes a fix ``` -When `assertions` **is** present, no default grader is added. To use an LLM grader alongside other graders, declare it explicitly. See [How criteria and assertions interact](/docs/evaluation/eval-cases/#how-criteria-and-assertions-interact). +`expected_output` is passive reference data; it is available to graders but does +not create an LLM grading call by itself. See [How reference fields and assertions interact](/docs/evaluation/eval-cases/#how-reference-fields-and-assertions-interact). 
## Configuration @@ -71,7 +74,7 @@ Score the response from 0.0 to 1.0 based on: | `output` | Candidate answer text | | `metadata` | Test metadata as formatted JSON | | `metadata_json` | Test metadata as compact JSON | -| `rubrics` | LLM-grader rubric items as formatted JSON | +| `g-eval` | LLM-grader rubric items as formatted JSON | | `rubrics_json` | LLM-grader rubric items as compact JSON | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | | `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | @@ -99,7 +102,7 @@ tests: - name: dexter_semantic type: llm-grader prompt: file://prompts/dexter-grader.md - rubrics: + g-eval: - operator: correctness criteria: Uses the provided ticker and company. ``` @@ -196,9 +199,9 @@ preprocessors: tests: - id: spreadsheet-output - criteria: Output includes the revenue rows input: Generate the spreadsheet report assertions: + - Output includes the revenue rows - name: spreadsheet-check type: llm-grader prompt: | @@ -215,8 +218,6 @@ Resolution order: - if no preprocessor matches, AgentV falls back to a UTF-8 text read - if the fallback read looks binary or invalid, the grader receives a warning note instead of failing the test run -The implicit default `llm-grader` also inherits suite-level `preprocessors`, so you can omit `assertions` and still preprocess file outputs before grading. - See [`examples/features/preprocessors/`](../../../../examples/features/preprocessors/) for a runnable example with a file-producing target and a custom preprocessor script. 
## Available Context Fields diff --git a/apps/web/src/content/docs/docs/graders/python-helpers.mdx b/apps/web/src/content/docs/docs/graders/python-helpers.mdx index 6394785db..b997c7817 100644 --- a/apps/web/src/content/docs/docs/graders/python-helpers.mdx +++ b/apps/web/src/content/docs/docs/graders/python-helpers.mdx @@ -1,6 +1,6 @@ --- title: Repo-Local Python Helpers -description: Example-local Python helpers for canonical AgentV code-graders and eval authoring +description: Example-local Python helpers for canonical AgentV script graders and eval authoring sidebar: order: 7 --- @@ -15,7 +15,7 @@ The helper lives in `examples/features/sdk-python/`. ## Scope -- `agentv_py.grader` wraps Python `code-grader` scripts over canonical `snake_case` fields. +- `agentv_py.grader` wraps Python `script` graders over canonical `snake_case` fields. - `agentv_py.evals` builds AgentV-shaped eval definitions and JSONL datasets. - `run_agentv_eval()` shells out to `agentv eval` or the repo source CLI. 
@@ -35,7 +35,7 @@ Use canonical fields instead: ## Example ```python -from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader +from agentv_py.grader import Assertion, CodeGraderResult, define_script def evaluate(context): @@ -54,7 +54,7 @@ def evaluate(context): if __name__ == "__main__": - define_code_grader(evaluate) + define_script(evaluate) ``` ## Authoring evals diff --git a/apps/web/src/content/docs/docs/graders/structured-data.mdx b/apps/web/src/content/docs/docs/graders/structured-data.mdx index 41af50f5a..b9338ba0e 100644 --- a/apps/web/src/content/docs/docs/graders/structured-data.mdx +++ b/apps/web/src/content/docs/docs/graders/structured-data.mdx @@ -54,7 +54,7 @@ assertions: | `date` | Compares dates after parsing | `formats` -- list of accepted date formats | | `numeric_tolerance` | Numeric compare within tolerance | `tolerance` -- absolute threshold; `relative: true` for relative tolerance | -For fuzzy string matching, use a `code_grader` grader (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`. +For fuzzy string matching, use a `script` grader (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`. ### Aggregation diff --git a/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx index b9e4bddfa..87ebfd354 100644 --- a/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx +++ b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx @@ -257,4 +257,4 @@ tests: 2. **Combine with other graders** — use tool trajectory for execution validation and LLM graders for output quality. 3. **Inspect traces first** with `--dump-traces` to understand agent behavior before writing graders. 4. **Use generous latency thresholds** to avoid flaky tests from timing variance. -5. **Use code graders for custom validation** — write custom tool validation scripts when built-in modes are insufficient. +5. 
**Use script graders for custom validation** — write custom tool validation scripts when built-in modes are insufficient. diff --git a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx index e9a9fe2d1..03dd91ba7 100644 --- a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based | Concern | AgentV grader | |---------|-----------------| -| Plan quality & coherence | `rubrics` | -| Workspace-aware auditing | `rubrics` with `required: true` criteria | +| Plan quality & coherence | `g-eval` | +| Workspace-aware auditing | `g-eval` with `required: true` criteria | ```yaml # Layer 1: Reasoning — verify the agent's plan makes sense @@ -24,7 +24,7 @@ assertions: - Agent formed a coherent plan before acting - Agent selected appropriate tools for the task - name: workspace-audit - type: rubrics + type: g-eval criteria: - id: plan-before-act outcome: Agent formed a plan before making changes @@ -43,7 +43,7 @@ Covers tool call correctness, argument validity, execution path, and redundancy. | Tool sequence | `tool_trajectory` (`in_order`, `exact`) | | Minimum tool usage | `tool_trajectory` (`any_order`) | | Argument correctness | `tool_trajectory` with `args` matching | -| Custom validation logic | `code_grader` | +| Custom validation logic | `script` | ```yaml # Layer 2: Action — verify the agent called the right tools @@ -72,7 +72,7 @@ Covers task completion, output correctness, step efficiency, latency, and cost. 
| Concern | AgentV grader | |---------|-----------------| -| Output correctness | `rubrics`, `equals`, `contains`, `regex` | +| Output correctness | `g-eval`, `equals`, `contains`, `regex` | | Structured data accuracy | `field_accuracy` | | Efficiency budgets | `execution_metrics` | | Multi-signal rollup | `composite` | @@ -102,8 +102,8 @@ Covers prompt injection resilience, policy adherence, bias, and content safety. | Concern | AgentV grader | |---------|-----------------| -| Content safety | `rubrics` | -| Policy enforcement | `code_grader` with policy command | +| Content safety | `g-eval` | +| Policy enforcement | `script` with policy command | | "Must NOT" assertions | Any grader with `negate: true` | ```yaml diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx index 87241b4dd..fcb3a4a7d 100644 --- a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx +++ b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx @@ -32,7 +32,7 @@ Use this split when deciding where a benchmark key belongs: | `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. | | `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, and default grader behavior. Concurrency is an operator/run setting from `--workers` or project config. | | `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. | -| `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. | +| `assertions` | Yes | Runs deterministic, LLM, composite, or script graders. | | Top-level `name`, `version`, `tags`, `license`, `requires` | Informational | Identifies and categorizes the suite. 
| | `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and extension context; in-process custom assertions can also read it. | @@ -74,7 +74,7 @@ Benchmark task packs map cleanly onto AgentV fields at authoring time: | Source checkout | `workspace.repos[].repo` and `workspace.repos[].commit` | | Per-case setup | `extensions: ["file://scripts/setup.mjs:beforeEach"]` reading `case_metadata` | | Gold answer | `expected_output` when the answer is passive reference data | -| Active verification | `assertions`, especially `code-grader` for commands or artifact checks | +| Active verification | `assertions`, especially `script` for commands or artifact checks | | Provenance | `tests[].metadata` with source pins, generator rows, and curation labels | | Bulky task files | Optional `tests: ./cases/` with per-case directories and supporting files | @@ -113,7 +113,7 @@ extensions: assertions: - name: focused-tests - type: code-grader + type: script command: ["python", "./graders/run-focused-tests.py"] required: true @@ -136,7 +136,7 @@ In this example, `workspace.repos[].commit` is the actual checkout. The matching `metadata.source_commit` is audit data that gets recorded with the case and is available to extensions. `apply-test-patch.mjs` can read `case_metadata.test_patch` and `case_metadata.fail_to_pass_tests`, then apply -the patch and write the selected test list into the workspace. The code grader +the patch and write the selected test list into the workspace. The script grader can read that workspace file through its `workspace_path` payload. Repo acquisition remains outside the eval; use registered projects or `git_cache.mirrors` when a local machine needs faster large-repo setup. 
See @@ -167,7 +167,7 @@ target: codex assertions: - name: tests-pass - type: code-grader + type: script command: ["python", "./graders/run-tests.py"] required: true ``` @@ -220,7 +220,7 @@ letting a parent eval compare targets, repeat policy, and gates consistently. Generated datasets often need stable row provenance more than workspace setup. Keep the generated row identity in metadata, use `expected_output` for the gold -answer, and score with rubrics or an LLM/code grader. +answer, and score with rubrics or an LLM/script grader. ```yaml name: finance-research-generated @@ -334,7 +334,7 @@ script. - Do not duplicate operational checkout state only in metadata. Put the real checkout under `workspace.repos`. - Keep `metadata` snake_case because it crosses process and result boundaries. -- Prefer `expected_output` for passive gold answers and `code-grader` for active +- Prefer `expected_output` for passive gold answers and `script` for active commands, file checks, or generated artifact validation. - Prefer case directories over long inline YAML only for bulky source inputs; the generated run folder remains the portable artifact contract. diff --git a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx index 368ffd26d..fe9953cdc 100644 --- a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx +++ b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx @@ -47,7 +47,7 @@ Trigger quality evaluates whether the right skill is activated for the right pro |-----------|------------------|-----------------| | **Question** | "Does it help?" | "Does it activate?" 
| | **Signal type** | Deterministic-ish | Noisy / statistical | -| **Test method** | Fixed assertions, rubrics, graders | Repeated trials, train/test splits | +| **Test method** | Fixed assertions, g-eval, graders | Repeated trials, train/test splits | | **What you tune** | Agent logic, prompts, tool use | Skill descriptions, trigger metadata | | **Failure mode** | Wrong output | Wrong routing | | **Optimization** | Pass/fail per test case | Accuracy rate over a sample | @@ -64,7 +64,7 @@ AgentV's eval tooling is designed for **execution quality**: - **`EVAL.yaml`** — define test cases with inputs, expected outputs, and assertions - **`evals.json`** — lightweight skill evaluation format (prompt/expected-output pairs) - **`agentv eval`** — execute evaluations and collect results -- **Graders** — `llm-grader`, `code-grader`, `tool-trajectory`, `rubrics`, `contains`, `regex`, and others all measure execution behavior +- **Graders** — `llm-grader`, `script`, `tool-trajectory`, `g-eval`, `contains`, `regex`, and others all measure execution behavior These tools assume the skill is already loaded and invoked. They measure what happens *after* routing, not the routing decision itself. diff --git a/apps/web/src/content/docs/docs/guides/human-review.mdx b/apps/web/src/content/docs/docs/guides/human-review.mdx index 26a9ad601..50dcf8b6e 100644 --- a/apps/web/src/content/docs/docs/guides/human-review.mdx +++ b/apps/web/src/content/docs/docs/guides/human-review.mdx @@ -27,7 +27,7 @@ Skip the review step for routine CI gate runs where you only need pass/fail. 
| **False positive** | A `contains` check passes on a coincidental substring match | | **False negative** | An LLM grader penalizes a correct answer that uses different phrasing | | **Qualitative regression** | Scores stay the same but tone, formatting, or helpfulness degrades | -| **Grader miscalibration** | A code grader is too strict on whitespace; a rubric is too lenient on accuracy | +| **Grader miscalibration** | A script grader is too strict on whitespace; a rubric is too lenient on accuracy | | **Flaky results** | The same test produces wildly different scores across runs | ## How to review @@ -92,7 +92,7 @@ The `feedback.json` file is a structured annotation of a single eval run. It rec "verdict": "needs_improvement", "notes": "Missing coverage of multi-document queries.", "evaluator_overrides": { - "code-grader:format-check": "Too strict — penalized valid output with trailing newline", + "script:format-check": "Too strict — penalized valid output with trailing newline", "llm-grader:quality": "Score 0.6 seems fair, answer was incomplete" }, "workspace_notes": "Workspace had stale cached files from previous run — may have affected retrieval results." @@ -137,14 +137,14 @@ The `feedback.json` file is a structured annotation of a single eval run. 
It rec ### Grader overrides (workspace evaluations) -For workspace evaluations with multiple graders (code graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific grader results: +For workspace evaluations with multiple graders (script graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific grader results: ```json { "test_id": "test-refactor-api", "verdict": "needs_improvement", "evaluator_overrides": { - "code-grader:test-pass": "Tests pass but the refactored code has a subtle race condition the tests don't cover", + "script:test-pass": "Tests pass but the refactored code has a subtle race condition the tests don't cover", "llm-grader:quality": "Score 0.9 is too high — the agent left dead code behind", "tool-trajectory:efficiency": "Used 12 tool calls where 5 would suffice, but the result is correct" }, diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx index 0bea029db..41253c202 100644 --- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx @@ -212,7 +212,7 @@ Keep your baseline stable across iterations. 
Only re-run the baseline when the t ## Graduating to EVAL.yaml -When `evals.json` becomes limiting — you need workspace isolation, code graders, tool trajectory checks, or multi-turn conversations — graduate to EVAL.yaml: +When `evals.json` becomes limiting — you need workspace isolation, script graders, tool trajectory checks, or multi-turn conversations — graduate to EVAL.yaml: ```bash agentv convert evals.json -o eval.yaml @@ -241,7 +241,7 @@ tests: After converting, you can: - Replace `llm-grader` assertions with faster deterministic graders (`contains`, `regex`, `equals`) - Add `workspace` configuration for file-system isolation -- Use `code-grader` for custom scoring logic +- Use `script` for custom scoring logic - Define `tool-trajectory` assertions to check tool usage patterns See [Skill Evals (evals.json)](/docs/integrations/agent-skills-evals/) for the full field mapping and side-by-side comparison. @@ -258,7 +258,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your | `summary.json` (read) | `/summary.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape | | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows | | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison | -| Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. | +| Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, script graders, etc. | **Key takeaway:** You do not need to rewrite your `evals.json`. AgentV reads it directly and adds a richer evaluation engine on top. @@ -310,7 +310,7 @@ Start simple and add complexity only when the evaluation results demand it: 1. **Start with `evals.json`** — 5-10 test cases, natural-language assertions 2. 
**Add deterministic checks** — when you find assertions that can be exact (`contains`, `regex`) -3. **Graduate to EVAL.yaml** — when you need workspace isolation or code graders +3. **Graduate to EVAL.yaml** — when you need workspace isolation or script graders 4. **Add tool trajectory checks** — when tool usage patterns matter 5. **Use rubrics** — when you need weighted, structured scoring criteria diff --git a/apps/web/src/content/docs/docs/index.mdx b/apps/web/src/content/docs/docs/index.mdx index 5855bd2b6..10844cd03 100644 --- a/apps/web/src/content/docs/docs/index.mdx +++ b/apps/web/src/content/docs/docs/index.mdx @@ -5,7 +5,7 @@ sidebar: order: 1 --- -AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic code graders + customizable LLM graders, all version-controlled in Git. +AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents locally with multi-objective scoring (correctness, latency, cost, safety) from YAML specifications. Deterministic script graders + customizable LLM graders, all version-controlled in Git. ## Why AgentV? @@ -38,7 +38,7 @@ AgentV is a CLI-first AI agent evaluation framework. 
It evaluates your agents lo - **Eval files** — YAML or JSONL definitions of test cases - **Tests** — Individual test entries with input messages and expected outcomes - **Targets** — The agent or LLM provider being evaluated -- **Graders** — Code graders (Python/TypeScript) or LLM graders that score responses +- **Graders** — Script graders (Python/TypeScript) or LLM graders that score responses - **Rubrics** — Structured criteria with weights for grading - **Results** — JSONL output with scores, reasoning, and execution traces @@ -50,7 +50,7 @@ Use this topic map when you are an AI agent trying to decide which primitive or | --- | --- | --- | | Create a first eval | [Quickstart](/docs/getting-started/quickstart/) → [Eval files](/docs/evaluation/eval-files/) | Defines the smallest runnable YAML shape before adding advanced fields. | | Run or resume evals | [Running evals](/docs/evaluation/running-evals/) → [WIP checkpoints](/docs/tools/wip-checkpoints/) | Covers `agentv eval`, concurrency, `--resume`, `--rerun-failed`, and remote partial-run recovery. | -| Choose graders | [Rubrics](/docs/evaluation/rubrics/) → [Code graders](/docs/graders/code-graders/) → [LLM graders](/docs/graders/llm-graders/) | Keeps deterministic checks, rubric scoring, and LLM judgment separate. | +| Choose graders | [Rubrics](/docs/evaluation/rubrics/) → [Script graders](/docs/graders/code-graders/) → [LLM graders](/docs/graders/llm-graders/) | Keeps deterministic checks, rubric scoring, and LLM judgment separate. | | Evaluate tool use or agents | [Tool trajectory](/docs/graders/tool-trajectory/) → [Coding agents](/docs/targets/coding-agents/) → [CLI provider](/docs/targets/cli-provider/) | Shows how targets, transcripts, and tool-call assertions compose. 
| | Share and inspect results | [Result artifact contract](/docs/reference/result-artifacts/) → [Results](/docs/tools/results/) → [Dashboard](/docs/tools/dashboard/) | Explains canonical run bundles, local artifacts, reports, remote result repositories, and Dashboard review flows. | | Compare runs | [Compare](/docs/tools/compare/) → [Dashboard Analytics](/docs/tools/dashboard/#analytics) | Use CLI metrics for automation and Dashboard analytics for interactive inspection. | diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx index fe8f0d175..3d583fdad 100644 --- a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx +++ b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx @@ -152,7 +152,7 @@ The generated YAML includes comments about available AgentV features you can use # Converted from Agent Skills evals.json # AgentV features you can add: # - type: is_json, contains, regex for deterministic graders -# - type: code-grader for custom scoring scripts +# - type: script for custom scoring scripts # - Multi-turn conversations via input message arrays # - Composite graders with weighted scoring # - Workspace isolation with repos and hooks diff --git a/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx b/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx index 48d69c0b7..1b2ae8159 100644 --- a/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx +++ b/apps/web/src/content/docs/docs/integrations/autoevals-integration.mdx @@ -1,6 +1,6 @@ --- title: Autoevals Integration -description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as code-grader graders in AgentV. +description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as script graders in AgentV. 
sidebar: order: 3 --- @@ -13,7 +13,7 @@ sidebar: - Works standalone — no Braintrust platform account required - Uses any OpenAI-compatible endpoint for LLM-based scorers -- Integrates with AgentV via the `code-grader` type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout +- Integrates with AgentV via the `script` grader type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout ## Installation @@ -50,7 +50,7 @@ All LLM-based scorers return a `score` (0–1) and `metadata.rationale` explaini ## TypeScript Example -Use the `Factuality` scorer as an AgentV `code-grader` to verify answer correctness. +Use the `Factuality` scorer as an AgentV `script` grader to verify answer correctness. **EVAL.yaml:** @@ -63,7 +63,7 @@ tests: expected_output: "Paris is the capital of France." assertions: - name: factuality - type: code-grader + type: script command: ["bun", "run", "graders/factuality.ts"] ``` @@ -101,7 +101,7 @@ console.log( ); ``` -The code grader reads the canonical AgentV stdin payload (`input`, `expected_output`, `output`), maps those fields to autoevals parameters (`input`, `output`, `expected`), runs the scorer, and writes the AgentV result format (with `assertions` array) to stdout. +The script grader reads the canonical AgentV stdin payload (`input`, `expected_output`, `output`), maps those fields to autoevals parameters (`input`, `output`, `expected`), runs the scorer, and writes the AgentV result format (with `assertions` array) to stdout. ## Python Example @@ -118,7 +118,7 @@ tests: expected_output: "The paper found that transformer models outperform RNNs on long-range tasks." 
assertions: - name: faithfulness - type: code-grader + type: script command: ["python", "graders/faithfulness.py"] ``` @@ -202,7 +202,7 @@ const result = await Factuality({ ## RAG Evaluation Suite -Combine multiple autoevals scorers in a single code grader for comprehensive RAG evaluation. +Combine multiple autoevals scorers in a single script grader for comprehensive RAG evaluation. **EVAL.yaml:** @@ -215,7 +215,7 @@ tests: expected_output: "Exercise improves cardiovascular health, mental well-being, and longevity." assertions: - name: rag-quality - type: code-grader + type: script command: ["bun", "run", "graders/rag-suite.ts"] weight: 1.0 ``` diff --git a/apps/web/src/content/docs/docs/reference/comparison.mdx b/apps/web/src/content/docs/docs/reference/comparison.mdx index a354160c1..a91911931 100644 --- a/apps/web/src/content/docs/docs/reference/comparison.mdx +++ b/apps/web/src/content/docs/docs/reference/comparison.mdx @@ -15,7 +15,7 @@ AgentV is the **evaluation layer** in the AI agent lifecycle. It works alongside ### AgentV — Evaluate -Offline evaluation and testing. Run eval cases against agents, score with deterministic code graders + LLM judges, detect regressions, gate CI/CD pipelines. Everything lives in Git. +Offline evaluation and testing. Run eval cases against agents, score with deterministic script graders + LLM judges, detect regressions, gate CI/CD pipelines. Everything lives in Git. ``` agentv eval evals/my-agent.yaml diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx index 77acc974f..76e04c3f7 100644 --- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx @@ -89,7 +89,7 @@ reserved for rebuildable local state and are skipped by run discovery. | `result.json` | Compact per-attempt manifest for one attempt directory, including AgentV `execution_status` and `verdict`. 
| Loading one attempt without scanning the whole run index. | | `grading.json` | Grader outputs, assertions, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. | | `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. | -| `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and code graders still receive the same full diff through `file_changes`. | +| `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and script graders still receive the same full diff through `file_changes`. | | `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. | | `transcript.json` | AgentV-normalized transcript/timeline document with canonical `tool_name` values and `transcript_summary`. | Portable human review, transcript-aware graders, and tool-trajectory analysis. | | `transcript-raw.jsonl` | Native provider or harness evidence when available. | Parser debugging, forensic review, and preserving source bytes without making provider schemas public AgentV fields. 
| diff --git a/apps/web/src/content/docs/docs/tools/convert.mdx b/apps/web/src/content/docs/docs/tools/convert.mdx index 1a754967a..ba91b090a 100644 --- a/apps/web/src/content/docs/docs/tools/convert.mdx +++ b/apps/web/src/content/docs/docs/tools/convert.mdx @@ -37,7 +37,7 @@ Converts an [Agent Skills `evals.json`](/docs/integrations/agent-skills-evals) f - Maps `expected_output` → `expected_output` - Maps `assertions` → `assertions` graders (llm-grader) - Resolves `files[]` paths relative to the evals.json directory -- Adds TODO comments for AgentV-specific features (workspace setup, code graders, rubrics) +- Adds TODO comments for AgentV-specific features (workspace setup, script graders, rubrics) This is a one-way conversion — use it as a starting point, then enhance the generated YAML with AgentV features. diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index 834b0680c..276a3775f 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -204,7 +204,7 @@ Each instance becomes an EVAL.yaml with: - `input` — the problem statement - `workspace.docker.image` — the pre-built SWE-bench Docker image (`ghcr.io/epoch-research/swe-bench.eval.x86_64.:latest`) - `workspace.repos[].base_commit` — the commit to reset to before the agent runs -- `assertions` — `code-grader` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container +- `assertions` — `script` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container Run an imported SWE-bench eval against any coding agent target: diff --git a/apps/web/src/content/docs/docs/tools/prepare.mdx b/apps/web/src/content/docs/docs/tools/prepare.mdx index ac07cddcc..b9f2a8dbb 100644 --- a/apps/web/src/content/docs/docs/tools/prepare.mdx +++ b/apps/web/src/content/docs/docs/tools/prepare.mdx @@ -64,7 +64,7 @@ Supported `--trace` inputs: | `agentv.trace.v1` JSON or 
JSONL | Explicit trace replay/export files | | AgentV transcript JSONL | `agentv import claude`, `agentv import codex`, or `agentv import copilot` output | -Single-record trace files are accepted directly. Multi-record files are matched by `test_id` and target. The selected trace is projected into AgentV's normal `trace` and `messages` grader context, so `tool-trajectory`, execution-metrics, and code graders receive the same shape they see during eval runs. +Single-record trace files are accepted directly. Multi-record files are matched by `test_id` and target. The selected trace is projected into AgentV's normal `trace` and `messages` grader context, so `tool-trajectory`, execution-metrics, and script graders receive the same shape they see during eval runs. Use `--response` when the final answer text should be graded independently of the trace. If `--response` is omitted and the trace contains an assistant message with content, AgentV uses the last assistant message as the candidate answer. diff --git a/packages/core/src/evaluation/graders/composite.ts b/packages/core/src/evaluation/graders/composite.ts index 66c88fe2b..df89613b1 100644 --- a/packages/core/src/evaluation/graders/composite.ts +++ b/packages/core/src/evaluation/graders/composite.ts @@ -71,6 +71,7 @@ export class CompositeGrader implements Grader { const aggregator = this.config.aggregator; switch (aggregator.type) { + case 'script': case 'code-grader': return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? 
this.cwd); case 'llm-grader': diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 27633b8ef..6194e4d50 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -13,7 +13,7 @@ import type { Message, Provider, ProviderResponse, ProviderTool } from '../provi import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; -import type { AssertionEntry, JsonObject, RubricItem } from '../types.js'; +import type { AssertionEntry, GraderConfig, JsonObject, RubricItem } from '../types.js'; import { formatRubricOperatorGuidance, formatRubricOperatorLabel } from './rubric-operators.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Grader } from './types.js'; @@ -164,7 +164,7 @@ function buildTemplateVariables(context: EvaluationContext): Record 0 ? context.promptInputs.question : context.evalCase.question; - const rubrics = context.evaluator?.type === 'llm-grader' ? 
context.evaluator.rubrics : undefined; + const rubrics = getRubrics(context.evaluator); return { [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(), @@ -180,6 +180,18 @@ function buildTemplateVariables(context: EvaluationContext): Record { + return ( + config?.type === 'llm-grader' || config?.type === 'g-eval' || config?.type === 'llm-rubric' + ); +} + function resolveContentBasePath(context: EvaluationContext): string | undefined { if (context.workspacePath) { return context.workspacePath; @@ -242,7 +254,11 @@ export class LlmGrader implements Grader { // LLM mode: structured JSON evaluation const config = preparedContext.evaluator; - if (config?.type === 'llm-grader' && config.rubrics && config.rubrics.length > 0) { + if ( + (config?.type === 'llm-grader' || config?.type === 'g-eval') && + config.rubrics && + config.rubrics.length > 0 + ) { return this.evaluateWithRubrics(preparedContext, graderProvider, config.rubrics); } @@ -251,7 +267,7 @@ export class LlmGrader implements Grader { private async prepareContext(context: EvaluationContext): Promise { const config = context.evaluator; - if (config?.type !== 'llm-grader' || !context.output) { + if (!isLlmBackedWithPreprocessors(config) || !context.output) { return context; } @@ -505,7 +521,7 @@ export class LlmGrader implements Grader { const userPrompt = this.buildAgentUserPrompt(context); const config = context.evaluator; - const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; + const rubrics = getRubrics(config); const fsTools = createFilesystemTools(workspacePath); @@ -625,7 +641,7 @@ export class LlmGrader implements Grader { } const config = context.evaluator; - const rubrics = config?.type === 'llm-grader' ? 
config.rubrics : undefined; + const rubrics = getRubrics(config); const details: JsonObject = { mode: modeLabel, @@ -669,7 +685,7 @@ export class LlmGrader implements Grader { */ private buildAgentSystemPrompt(context: EvaluationContext): string { const config = context.evaluator; - const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; + const rubrics = getRubrics(config); const parts: string[] = [ 'You are an expert grader with access to the workspace filesystem.', @@ -705,7 +721,7 @@ export class LlmGrader implements Grader { } const config = context.evaluator; - const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; + const rubrics = getRubrics(config); const parts: string[] = [ 'Evaluate the candidate answer by investigating the workspace.', @@ -763,7 +779,7 @@ export class LlmGrader implements Grader { : context.evalCase.question; const config = context.evaluator; - const rubrics = config?.type === 'llm-grader' ? config.rubrics : undefined; + const rubrics = getRubrics(config); const template = context.graderTemplateOverride ?? 
this.graderTemplate; if (template) { diff --git a/packages/core/src/evaluation/graders/promptfoo-assertions.ts b/packages/core/src/evaluation/graders/promptfoo-assertions.ts new file mode 100644 index 000000000..6847984c7 --- /dev/null +++ b/packages/core/src/evaluation/graders/promptfoo-assertions.ts @@ -0,0 +1,372 @@ +import { execFileWithStdin } from '../../runtime/exec.js'; +import { serializeSnakeCaseBoundaryPayload } from '../case-conversion.js'; +import type { + AssertSetGraderConfig, + AssertionEntry, + JsonObject, + JsonValue, + ScriptAssertionGraderConfig, + SimilarGraderConfig, +} from '../types.js'; +import { clampScore } from './scoring.js'; +import type { EvaluationContext, EvaluationScore, Grader } from './types.js'; + +type ScriptResult = + | boolean + | number + | { + readonly pass?: boolean; + readonly score?: number; + readonly reason?: string; + readonly assertions?: readonly AssertionEntry[]; + readonly details?: JsonObject; + }; + +function buildAssertionContext(context: EvaluationContext): Record { + return { + criteria: context.evalCase.criteria, + expectedOutput: context.evalCase.expected_output, + input: context.evalCase.input, + metadata: context.evalCase.metadata ?? null, + trace: context.trace ?? null, + tokenUsage: context.tokenUsage ?? null, + costUsd: context.costUsd ?? null, + durationMs: context.durationMs ?? null, + fileChanges: context.fileChanges ?? null, + workspacePath: context.workspacePath ?? null, + dependencyResults: context.dependencyResults ?? null, + }; +} + +function normalizeScriptResult( + result: ScriptResult, + fallbackText: string, + threshold?: number, +): EvaluationScore { + const passThreshold = threshold ?? Number.EPSILON; + if (typeof result === 'boolean') { + return { + score: result ? 1 : 0, + verdict: result ? 'pass' : 'fail', + assertions: [{ text: result ? 
'Assertion passed' : fallbackText, passed: result }], + expectedAspectCount: 1, + }; + } + + if (typeof result === 'number') { + const score = clampScore(result); + const passed = score >= passThreshold; + return { + score, + verdict: passed ? 'pass' : 'fail', + assertions: [{ text: passed ? 'Assertion passed' : fallbackText, passed }], + expectedAspectCount: 1, + }; + } + + const score = + typeof result.score === 'number' + ? clampScore(result.score) + : result.pass === true + ? 1 + : result.pass === false + ? 0 + : 0; + const passed = result.pass ?? score >= passThreshold; + const assertions = + result.assertions && result.assertions.length > 0 + ? result.assertions + : [ + { + text: result.reason ?? (passed ? 'Assertion passed' : fallbackText), + passed, + }, + ]; + return { + score, + verdict: passed ? 'pass' : 'fail', + assertions, + expectedAspectCount: assertions.length || 1, + ...(result.details ? { details: result.details } : {}), + }; +} + +function buildFunctionBody(code: string): string { + const trimmed = code.trim().replace(/;+\s*$/, ''); + if (trimmed.includes('\n') || /\breturn\b/.test(trimmed)) { + return trimmed; + } + const lastSemi = trimmed.lastIndexOf(';'); + if (/^(const|let|var)\s/.test(trimmed) && lastSemi >= 0) { + return `${trimmed.slice(0, lastSemi + 1)} return ${trimmed.slice(lastSemi + 1).trim()}`; + } + return `return ${trimmed}`; +} + +export class JavascriptAssertionGrader implements Grader { + readonly kind = 'javascript'; + + constructor(private readonly config: ScriptAssertionGraderConfig) {} + + async evaluate(context: EvaluationContext): Promise { + try { + const fn = new Function('output', 'context', buildFunctionBody(this.config.value)); + const result = (await fn(context.candidate, buildAssertionContext(context))) as ScriptResult; + return normalizeScriptResult( + result, + 'Javascript assertion returned a failing result', + this.config.threshold, + ); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + assertions: [{ text: `Javascript assertion failed: ${message}`, passed: false }], + expectedAspectCount: 1, + }; + } + } +} + +function buildPythonProgram(code: string): string { + const isMultiline = code.includes('\n'); + const body = isMultiline + ? code + .split('\n') + .map((line) => ` ${line}`) + .join('\n') + : ` return ${code}`; + + return `import json +import sys + +payload = json.load(sys.stdin) + +def main(output, context): +${body} + +result = main(payload.get("output", ""), payload.get("context", {})) +print(json.dumps(result)) +`; +} + +export class PythonAssertionGrader implements Grader { + readonly kind = 'python'; + + constructor( + private readonly config: ScriptAssertionGraderConfig, + private readonly timeoutMs?: number, + ) {} + + async evaluate(context: EvaluationContext): Promise { + const payload = JSON.stringify({ + output: context.candidate, + context: serializeSnakeCaseBoundaryPayload(buildAssertionContext(context)), + }); + try { + const result = await execFileWithStdin( + ['python3', '-c', buildPythonProgram(this.config.value)], + payload, + { + timeoutMs: this.timeoutMs, + }, + ); + if (result.exitCode !== 0) { + throw new Error(result.stderr.trim() || `python exited with code ${result.exitCode}`); + } + const parsed = JSON.parse(result.stdout.trim()) as ScriptResult; + return normalizeScriptResult( + parsed, + 'Python assertion returned a failing result', + this.config.threshold, + ); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + assertions: [{ text: `Python assertion failed: ${message}`, passed: false }], + expectedAspectCount: 1, + }; + } + } +} + +export class WebhookAssertionGrader implements Grader { + readonly kind = 'webhook'; + + constructor(private readonly config: ScriptAssertionGraderConfig) {} + + async evaluate(context: EvaluationContext): Promise { + try { + const response = await fetch(this.config.value, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + output: context.candidate, + context: serializeSnakeCaseBoundaryPayload(buildAssertionContext(context)), + }), + }); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const result = (await response.json()) as ScriptResult; + return normalizeScriptResult( + result, + 'Webhook assertion returned a failing result', + this.config.threshold, + ); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { + score: 0, + verdict: 'fail', + assertions: [{ text: `Webhook assertion failed: ${message}`, passed: false }], + expectedAspectCount: 1, + }; + } + } +} + +export class AssertSetGrader implements Grader { + readonly kind = 'assert-set'; + + constructor( + private readonly config: AssertSetGraderConfig, + private readonly createChild: ( + config: AssertSetGraderConfig['assertions'][number], + ) => Promise, + ) {} + + async evaluate(context: EvaluationContext): Promise { + const scores = []; + for (const childConfig of this.config.assertions) { + const child = await this.createChild(childConfig); + const result = await child.evaluate(context); + scores.push({ + name: childConfig.name, + type: childConfig.type, + score: result.score, + weight: childConfig.weight ?? 
1, + verdict: result.verdict, + assertions: result.assertions, + graderRawRequest: result.graderRawRequest, + scores: result.scores, + details: result.details, + tokenUsage: result.tokenUsage, + }); + } + + const totalWeight = scores.reduce((sum, score) => sum + (score.weight ?? 1), 0) || 1; + const score = + scores.reduce((sum, item) => sum + item.score * (item.weight ?? 1), 0) / totalWeight; + const threshold = this.config.threshold ?? 1; + const passed = score >= threshold; + return { + score, + verdict: passed ? 'pass' : 'fail', + assertions: scores.flatMap((item) => item.assertions), + expectedAspectCount: scores.reduce((sum, item) => sum + item.assertions.length, 0) || 1, + scores, + details: { threshold }, + }; + } +} + +function getEmbeddingConfig(config: SimilarGraderConfig): JsonObject | undefined { + const provider = typeof config.provider === 'object' ? config.provider : undefined; + const nested = + config.config?.embedding_provider && typeof config.config.embedding_provider === 'object' + ? (config.config.embedding_provider as JsonObject) + : undefined; + return nested ?? provider ?? config.config; +} + +function asString(value: JsonValue | undefined): string | undefined { + return typeof value === 'string' && value.trim().length > 0 ? value : undefined; +} + +async function embedTexts( + config: SimilarGraderConfig, + texts: readonly string[], +): Promise { + const embedding = getEmbeddingConfig(config); + const model = asString(embedding?.model); + const rawBaseUrl = asString(embedding?.base_url) ?? asString(embedding?.endpoint); + if (!embedding || !model || !rawBaseUrl) { + throw new Error( + 'similar requires config.embedding_provider with OpenAI-compatible base_url and model', + ); + } + const apiKey = asString(embedding.api_key); + const baseUrl = rawBaseUrl.replace(/\/+$/, ''); + const response = await fetch(`${baseUrl}/embeddings`, { + method: 'POST', + headers: { + 'content-type': 'application/json', + ...(apiKey ? 
{ authorization: `Bearer ${apiKey}` } : {}), + }, + body: JSON.stringify({ model, input: texts }), + }); + if (!response.ok) { + throw new Error(`embedding provider returned HTTP ${response.status}`); + } + const json = (await response.json()) as { data?: readonly { embedding?: readonly number[] }[] }; + const embeddings = json.data?.map((item) => [...(item.embedding ?? [])]) ?? []; + if (embeddings.length !== texts.length || embeddings.some((item) => item.length === 0)) { + throw new Error('embedding provider returned an invalid embeddings payload'); + } + return embeddings; +} + +function cosine(a: readonly number[], b: readonly number[]): number { + let dot = 0; + let aMag = 0; + let bMag = 0; + for (let i = 0; i < Math.min(a.length, b.length); i += 1) { + dot += a[i] * b[i]; + aMag += a[i] * a[i]; + bMag += b[i] * b[i]; + } + if (aMag === 0 || bMag === 0) return 0; + return dot / (Math.sqrt(aMag) * Math.sqrt(bMag)); +} + +export class SimilarAssertionGrader implements Grader { + readonly kind = 'similar'; + + constructor(private readonly config: SimilarGraderConfig) {} + + async evaluate(context: EvaluationContext): Promise { + try { + const [expected, actual] = await embedTexts(this.config, [ + this.config.value, + context.candidate, + ]); + const similarity = clampScore((cosine(expected, actual) + 1) / 2); + const threshold = this.config.threshold ?? 0.75; + const passed = similarity >= threshold; + return { + score: similarity, + verdict: passed ? 'pass' : 'fail', + assertions: [ + { + text: `Embedding similarity ${similarity.toFixed(3)} ${passed ? '>=' : '<'} ${threshold}`, + passed, + }, + ], + expectedAspectCount: 1, + details: { threshold, metric: 'cosine' }, + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + assertions: [{ text: `Similar assertion failed: ${message}`, passed: false }], + expectedAspectCount: 1, + }; + } + } +} diff --git a/packages/core/src/evaluation/loaders/case-file-loader.ts b/packages/core/src/evaluation/loaders/case-file-loader.ts index ada3be8cd..c20103477 100644 --- a/packages/core/src/evaluation/loaders/case-file-loader.ts +++ b/packages/core/src/evaluation/loaders/case-file-loader.ts @@ -227,7 +227,7 @@ function parseAssertionFromString(expected: string, sourceFilePath: string): Jso } if (expected.startsWith('python:')) { return { - type: 'code-grader', + type: 'script', command: ['uv', 'run', 'python', expected.slice('python:'.length).trim()], }; } @@ -235,14 +235,14 @@ function parseAssertionFromString(expected: string, sourceFilePath: string): Jso const filePath = stripFileProtocol(expected).trim(); if (!filePath.endsWith('.py')) { throw new Error( - `Unsupported promptfoo __expected file assertion "${expected}". Only file://*.py code graders are supported.`, + `Unsupported promptfoo __expected file assertion "${expected}". Only file://*.py script graders are supported.`, ); } const commandPath = path.isAbsolute(filePath) ? 
filePath : path.resolve(path.dirname(sourceFilePath), filePath); return { - type: 'code-grader', + type: 'script', command: ['uv', 'run', 'python', commandPath], }; } diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 49283f09f..20181baea 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -48,12 +48,78 @@ function isDeprecatedJudgeType(type: string): boolean { return type === 'code-judge' || type === 'llm-judge'; } +const UNSUPPORTED_PROMPTFOO_ASSERTION_TYPES = new Set([ + 'agent-rubric', + 'answer-relevance', + 'bleu', + 'classifier', + 'contains-html', + 'contains-json', + 'contains-sql', + 'contains-xml', + 'context-faithfulness', + 'context-recall', + 'context-relevance', + 'conversation-relevance', + 'factuality', + 'finish-reason', + 'gleu', + 'guardrails', + 'is-html', + 'is-refusal', + 'is-sql', + 'is-valid-function-call', + 'is-valid-openai-function-call', + 'is-valid-openai-tools-call', + 'is-xml', + 'levenshtein', + 'meteor', + 'model-graded-closedqa', + 'model-graded-factuality', + 'moderation', + 'perplexity', + 'perplexity-score', + 'pi', + 'rouge-n', + 'ruby', + 'similar:cosine', + 'similar:dot', + 'similar:euclidean', + 'select-best', + 'human', + 'max-score', + 'tool-call-f1', + 'skill-used', + 'trajectory:goal-success', + 'trajectory:tool-args-match', + 'trajectory:step-count', + 'trajectory:tool-sequence', + 'trajectory:tool-used', + 'trace-error-spans', + 'trace-span-count', + 'trace-span-duration', + 'search-rubric', + 'word-count', +]); + +function assertSupportedPromptfooType(type: string, evalId: string, name?: string): void { + const baseType = type.startsWith('not-') ? type.slice(4) : type; + if (!UNSUPPORTED_PROMPTFOO_ASSERTION_TYPES.has(baseType)) { + return; + } + throw new Error( + `Unsupported promptfoo assertion type '${type}' in '${evalId}'` + + `${name ? 
` for evaluator '${name}'` : ''}. This type is future scope in AgentV and is not accepted as a custom assertion.`, + ); +} + /** * Parse evaluators from eval case configuration. */ export async function parseGraders( rawEvalCase: JsonObject & { readonly execution?: JsonValue; + readonly assert?: JsonValue; readonly assertions?: JsonValue; readonly evaluators?: JsonValue; }, @@ -65,17 +131,19 @@ export async function parseGraders( const execution = rawEvalCase.execution; const executionObject = isJsonObject(execution) ? execution : undefined; - // Case-level graders priority: assertions > legacy execution/top-level assertion lists + // Case-level graders priority: assert > assertions > legacy execution/top-level assertion lists const caseEvaluators = + rawEvalCase.assert ?? rawEvalCase.assertions ?? + (executionObject ? executionObject.assert : undefined) ?? (executionObject ? executionObject.evaluators : undefined) ?? // deprecated: use assertions rawEvalCase.evaluators; // deprecated: use assertions - // Root-level default graders: assertions > legacy execution assertion list + // Root-level default graders: assert > assertions > legacy execution assertion list const skipDefaults = executionObject?.skip_defaults === true; const rootEvaluators = skipDefaults ? undefined - : (globalExecution?.assertions ?? globalExecution?.evaluators); // deprecated: use assertions + : (globalExecution?.assert ?? globalExecution?.assertions ?? 
globalExecution?.evaluators); // deprecated: use assertions // Parse case-level evaluators const parsedCase = await parseGraderList( @@ -247,6 +315,7 @@ async function expandGraderEntries( export async function collectAssertionTemplateSourceReferences( rawEvalCase: JsonObject & { readonly execution?: JsonValue; + readonly assert?: JsonValue; readonly assertions?: JsonValue; readonly evaluators?: JsonValue; }, @@ -257,13 +326,15 @@ export async function collectAssertionTemplateSourceReferences( const execution = rawEvalCase.execution; const executionObject = isJsonObject(execution) ? execution : undefined; const caseEvaluators = + rawEvalCase.assert ?? rawEvalCase.assertions ?? + (executionObject ? executionObject.assert : undefined) ?? (executionObject ? executionObject.evaluators : undefined) ?? rawEvalCase.evaluators; const skipDefaults = executionObject?.skip_defaults === true; const rootEvaluators = skipDefaults ? undefined - : (globalExecution?.assertions ?? globalExecution?.evaluators); + : (globalExecution?.assert ?? globalExecution?.assertions ?? globalExecution?.evaluators); return [ ...(await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId)), @@ -364,7 +435,7 @@ async function collectAssertionTemplateReferencesFromObject( includeContext: IncludeContext, ): Promise { const references: EvalSourceReference[] = []; - for (const key of ['assertions', 'evaluators'] as const) { + for (const key of ['assert', 'assertions', 'evaluators'] as const) { references.push( ...(await collectAssertionTemplateReferencesFromValue( value[key], @@ -426,7 +497,7 @@ async function parseGraderList( // e.g. [contains, "crit1", "crit2", "crit3"] → contains(w=1) + rubrics(w=3) // → each of the 4 visible assertions counts equally. 
result[placeholderIndex] = { - type: 'rubrics', + type: 'g-eval', criteria: strings, weight: strings.length, }; @@ -457,6 +528,10 @@ async function parseGraderList( continue; } + if (typeof typeValue === 'string') { + assertSupportedPromptfooType(typeValue, evalId, rawName); + } + // Unknown types are treated as custom assertion types (resolved via registry discovery) const isCustomType = typeof typeValue === 'string' && !isGraderKind(typeValue); if (typeof typeValue !== 'string') { @@ -515,11 +590,64 @@ async function parseGraderList( continue; } - if (typeValue === 'code-grader') { + if (typeValue === 'assert-set') { + const rawMembers = rawEvaluator.assert ?? rawEvaluator.assertions; + if (!Array.isArray(rawMembers)) { + logWarning(`Skipping assert-set evaluator '${name}' in '${evalId}': missing assert array`); + continue; + } + + const parsedMembers = await parseGraderList( + rawMembers as JsonValue, + searchRoots, + `${evalId}:${name}`, + defaultPreprocessors, + ); + if (!parsedMembers || parsedMembers.length === 0) { + logWarning( + `Skipping assert-set evaluator '${name}' in '${evalId}': no valid child assertions`, + ); + continue; + } + + const threshold = + typeof rawEvaluator.threshold === 'number' && + rawEvaluator.threshold >= 0 && + rawEvaluator.threshold <= 1 + ? rawEvaluator.threshold + : undefined; + const weight = validateWeight(rawEvaluator.weight, name, evalId); + const { required, min_score } = parseRequiredAndMinScore( + rawEvaluator.required, + (rawEvaluator as Record).min_score as JsonValue | undefined, + name, + evalId, + ); + evaluators.push({ + name, + type: 'assert-set', + assertions: parsedMembers, + ...(threshold !== undefined ? { threshold } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? 
{ negate } : {}), + }); + continue; + } + + if (typeValue === 'code-grader' || typeValue === 'script') { + const isLegacyCodeGrader = typeValue === 'code-grader'; + if (isLegacyCodeGrader) { + logWarning( + `Evaluator '${name}' in '${evalId}': 'code-grader' is deprecated. Use 'script' instead.`, + ); + } + const displayType = 'script'; let command: string[] | undefined; if (rawEvaluator.script !== undefined) { throw new Error( - `Grader '${name}' in '${evalId}': 'script' has been removed. Use 'command' instead.`, + `Grader '${name}' in '${evalId}': 'script' field has been removed. Use 'command' instead.`, ); } const rawCommand = rawEvaluator.command; @@ -528,19 +656,19 @@ async function parseGraderList( const trimmed = rawCommand.trim(); if (trimmed.length === 0) { throw new Error( - `Invalid code-grader command for evaluator '${name}' in '${evalId}': command cannot be empty`, + `Invalid ${displayType} command for evaluator '${name}' in '${evalId}': command cannot be empty`, ); } command = parseCommandToArgv(trimmed); } else { command = asStringArray( rawCommand, - `code-grader command for evaluator '${name}' in '${evalId}'`, + `${displayType} command for evaluator '${name}' in '${evalId}'`, ); } if (!command) { - logWarning(`Skipping code-grader evaluator '${name}' in '${evalId}': missing command`); + logWarning(`Skipping ${displayType} evaluator '${name}' in '${evalId}': missing command`); continue; } @@ -556,7 +684,7 @@ async function parseGraderList( resolvedCwd = path.resolve(resolved.resolvedPath); } else { logWarning( - `Code-grader evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`, + `${displayType} evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`, resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : undefined, @@ -625,7 +753,7 @@ async function parseGraderList( evaluators.push({ name, - type: 'code-grader', + type: 'script', command, ...(resolvedScriptPath ? 
{ resolvedScriptPath } : {}), cwd, @@ -642,8 +770,8 @@ async function parseGraderList( } if (typeValue === 'composite') { - // Accept assertions > evaluators (deprecated) - const rawMembers = rawEvaluator.assertions ?? rawEvaluator.evaluators; // evaluators deprecated + // Accept assert > assertions > evaluators (deprecated) + const rawMembers = rawEvaluator.assert ?? rawEvaluator.assertions ?? rawEvaluator.evaluators; // evaluators deprecated if (!Array.isArray(rawMembers)) { logWarning( `Skipping composite evaluator '${name}' in '${evalId}': missing assertions (or evaluators) array`, @@ -675,6 +803,7 @@ async function parseGraderList( } if ( normalizedAggregatorType !== 'weighted_average' && + normalizedAggregatorType !== 'script' && normalizedAggregatorType !== 'code-grader' && normalizedAggregatorType !== 'llm-grader' && normalizedAggregatorType !== 'threshold' @@ -710,7 +839,7 @@ async function parseGraderList( continue; } - // Parse member evaluator (reuse existing logic for code, llm-grader, code-grader) + // Parse member evaluator (reuse existing logic for code, llm-grader, script) const memberConfigs = await parseGraders( { evaluators: [rawMember] }, undefined, @@ -749,11 +878,19 @@ async function parseGraderList( type: 'weighted_average', ...(Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}), }; - } else if (normalizedAggregatorType === 'code-grader') { + } else if ( + normalizedAggregatorType === 'script' || + normalizedAggregatorType === 'code-grader' + ) { + if (normalizedAggregatorType === 'code-grader') { + logWarning( + `Composite evaluator '${name}' in '${evalId}': aggregator type 'code-grader' is deprecated. 
Use 'script' instead.`, + ); + } const aggregatorPath = asString(rawAggregator.path); if (!aggregatorPath) { logWarning( - `Skipping composite evaluator '${name}' in '${evalId}': code-grader aggregator missing path`, + `Skipping composite evaluator '${name}' in '${evalId}': script aggregator missing path`, ); continue; } @@ -761,7 +898,7 @@ async function parseGraderList( // Set cwd to eval file directory (first search root) // Paths are resolved relative to this directory aggregator = { - type: 'code-grader', + type: 'script', path: aggregatorPath, cwd: searchRoots[0], }; @@ -1311,6 +1448,79 @@ async function parseGraderList( continue; } + if (typeValue === 'javascript' || typeValue === 'python' || typeValue === 'webhook') { + const value = asString(rawEvaluator.value); + if (!value || value.trim().length === 0) { + logWarning(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`); + continue; + } + const threshold = + typeof rawEvaluator.threshold === 'number' && + rawEvaluator.threshold >= 0 && + rawEvaluator.threshold <= 1 + ? rawEvaluator.threshold + : undefined; + const weight = validateWeight(rawEvaluator.weight, name, evalId); + const { required, min_score } = parseRequiredAndMinScore( + rawEvaluator.required, + (rawEvaluator as Record).min_score as JsonValue | undefined, + name, + evalId, + ); + const config = isJsonObject(rawEvaluator.config) ? rawEvaluator.config : undefined; + evaluators.push({ + name, + type: typeValue, + value, + ...(threshold !== undefined ? { threshold } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? { negate } : {}), + ...(config !== undefined ? 
{ config } : {}), + }); + continue; + } + + if (typeValue === 'similar') { + const value = asString(rawEvaluator.value); + if (!value || value.trim().length === 0) { + logWarning(`Skipping similar evaluator '${name}' in '${evalId}': missing value`); + continue; + } + const threshold = + typeof rawEvaluator.threshold === 'number' && + rawEvaluator.threshold >= 0 && + rawEvaluator.threshold <= 1 + ? rawEvaluator.threshold + : undefined; + const weight = validateWeight(rawEvaluator.weight, name, evalId); + const { required, min_score } = parseRequiredAndMinScore( + rawEvaluator.required, + (rawEvaluator as Record).min_score as JsonValue | undefined, + name, + evalId, + ); + const provider = + typeof rawEvaluator.provider === 'string' || isJsonObject(rawEvaluator.provider) + ? rawEvaluator.provider + : undefined; + const config = isJsonObject(rawEvaluator.config) ? rawEvaluator.config : undefined; + evaluators.push({ + name, + type: 'similar', + value, + ...(threshold !== undefined ? { threshold } : {}), + ...(provider !== undefined ? { provider } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? { negate } : {}), + ...(config !== undefined ? 
{ config } : {}), + }); + continue; + } + if (typeValue === 'contains') { const value = asString(rawEvaluator.value); if (!value) { @@ -1526,21 +1736,14 @@ async function parseGraderList( if (typeValue === 'rubrics') { const rawCriteria = rawEvaluator.criteria; - if (!Array.isArray(rawCriteria) || rawCriteria.length === 0) { + const normalizedCriteria = normalizeRubricCriteria(rawCriteria); + if (!normalizedCriteria || normalizedCriteria.length === 0) { logWarning( `Skipping rubrics evaluator '${name}' in '${evalId}': criteria must be a non-empty array`, ); continue; } - // Normalize string shorthands to objects before passing to parseRubricItems - const normalizedCriteria = rawCriteria.map((item, index) => { - if (typeof item === 'string') { - return { id: `rubric-${index + 1}`, outcome: item, weight: 1.0, required: true }; - } - return item; - }); - const parsedCriteria = parseRubricItems(normalizedCriteria, name, evalId); if (!parsedCriteria || parsedCriteria.length === 0) { logWarning(`Skipping rubrics evaluator '${name}' in '${evalId}': no valid criteria found`); @@ -1557,7 +1760,7 @@ async function parseGraderList( evaluators.push({ name, - type: 'llm-grader', + type: 'g-eval', rubrics: parsedCriteria, ...(graderTargetName ? { target: graderTargetName } : {}), ...(weight !== undefined ? { weight } : {}), @@ -1643,7 +1846,7 @@ async function parseGraderList( const rawRubrics = rawEvaluator.rubrics; const parsedRubrics = Array.isArray(rawRubrics) - ? parseRubricItems(rawRubrics, name, evalId) + ? parseRubricItems(normalizeRubricCriteria(rawRubrics) ?? [], name, evalId) : undefined; if (typeValue === 'rubric') { @@ -1667,7 +1870,7 @@ async function parseGraderList( // deprecated: `type: rubric` maps to `type: llm-grader` with `rubrics`. Use `type: rubrics` with `criteria` instead. evaluators.push({ name, - type: 'llm-grader', + type: 'g-eval', rubrics: parsedRubrics, ...(graderTargetName ? { target: graderTargetName } : {}), ...(weight !== undefined ? 
{ weight } : {}), @@ -1736,6 +1939,73 @@ async function parseGraderList( const llmTemperature = typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 ? rawTempLlm : undefined; + if (typeValue === 'g-eval') { + const rubricSource = + rawEvaluator.rubric_item ?? + rawEvaluator.rubricItem ?? + rawEvaluator.rubrics ?? + rawEvaluator.criteria ?? + rawEvaluator.value; + const normalizedCriteria = normalizeRubricCriteria(rubricSource, rawEvaluator); + const gEvalRubrics = normalizedCriteria + ? parseRubricItems(normalizedCriteria, name, evalId) + : undefined; + if (!gEvalRubrics || gEvalRubrics.length === 0) { + logWarning( + `Skipping g-eval evaluator '${name}' in '${evalId}': expected value, criteria, rubric_item, or rubrics`, + ); + continue; + } + + evaluators.push({ + name, + type: 'g-eval', + prompt, + promptPath, + ...(promptPath ? { resolvedPromptPath: promptPath } : {}), + ...(resolvedPromptScript ? { resolvedPromptScript } : {}), + rubrics: gEvalRubrics, + ...(graderTargetName ? { target: graderTargetName } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? { negate } : {}), + ...(finalConfig ? { config: finalConfig } : {}), + ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), + ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}), + ...(mergedPreprocessors ? { preprocessors: mergedPreprocessors } : {}), + }); + continue; + } + + if (typeValue === 'llm-rubric') { + const value = + typeof rawEvaluator.value === 'string' + ? rawEvaluator.value + : typeof rawEvaluator.criteria === 'string' + ? rawEvaluator.criteria + : undefined; + evaluators.push({ + name, + type: 'llm-rubric', + prompt, + promptPath, + ...(promptPath ? { resolvedPromptPath: promptPath } : {}), + ...(resolvedPromptScript ? { resolvedPromptScript } : {}), + ...(value !== undefined ? 
{ value } : {}), + ...(graderTargetName ? { target: graderTargetName } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? { negate } : {}), + ...(finalConfig ? { config: finalConfig } : {}), + ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), + ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}), + ...(mergedPreprocessors ? { preprocessors: mergedPreprocessors } : {}), + }); + continue; + } + evaluators.push({ name, type: 'llm-grader', @@ -2106,6 +2376,55 @@ function parseRubricOperator( return undefined; } +function normalizeRubricCriteria( + raw: unknown, + fallback?: JsonObject, +): readonly unknown[] | undefined { + if (typeof raw === 'string') { + return [ + { + id: 'rubric-1', + outcome: raw, + weight: typeof fallback?.weight === 'number' ? fallback.weight : 1, + required: typeof fallback?.required === 'boolean' ? fallback.required : true, + ...(typeof fallback?.min_score === 'number' ? { min_score: fallback.min_score } : {}), + ...(fallback?.score_ranges !== undefined ? { score_ranges: fallback.score_ranges } : {}), + ...(typeof fallback?.operator === 'string' ? { operator: fallback.operator } : {}), + }, + ]; + } + + if (Array.isArray(raw)) { + return raw.map((item, index) => { + if (typeof item === 'string') { + return { id: `rubric-${index + 1}`, outcome: item, weight: 1.0, required: true }; + } + return item; + }); + } + + if (isJsonObject(raw)) { + return [raw]; + } + + if (fallback?.score_ranges !== undefined) { + const outcome = + asString(fallback.criteria) ?? asString(fallback.value) ?? asString(fallback.outcome); + return [ + { + id: asString(fallback.id) ?? 'rubric-1', + ...(outcome ? { outcome } : {}), + score_ranges: fallback.score_ranges, + weight: typeof fallback.weight === 'number' ? fallback.weight : 1, + ...(typeof fallback.min_score === 'number' ? 
{ min_score: fallback.min_score } : {}), + ...(typeof fallback.operator === 'string' ? { operator: fallback.operator } : {}), + }, + ]; + } + + return undefined; +} + /** * Parse rubric items from raw YAML/JSON data. * Supports both checklist rubrics and score-range rubrics. @@ -2373,11 +2692,11 @@ function parseScoreRanges( * - String shorthand: "Must be polite" -> { id: "rubric-1", outcome: "Must be polite", weight: 1.0, required: true } * - Object form with outcome, weight, required, score_ranges, min_score * - * Returns an LlmGraderConfig to prepend to evaluators, or undefined if no valid rubrics. + * Returns a g-eval config to prepend to evaluators, or undefined if no valid rubrics. */ export function parseInlineRubrics( rawRubrics: readonly unknown[], -): import('../types.js').LlmGraderConfig | undefined { +): import('../types.js').GEvalGraderConfig | undefined { const rubricItems = rawRubrics .filter((r): r is JsonObject | string => isJsonObject(r) || typeof r === 'string') .map((rubric, index) => { @@ -2456,7 +2775,7 @@ export function parseInlineRubrics( return { name: 'rubrics', - type: 'llm-grader', + type: 'g-eval', rubrics: rubricItems, }; } diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 4572a6191..b02560eb9 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -50,12 +50,14 @@ type RawJsonlEvalCase = JsonObject & { readonly id?: JsonValue; readonly conversation_id?: JsonValue; readonly criteria?: JsonValue; - /** @deprecated Use `criteria` instead */ + /** @deprecated Use `assert` instead */ readonly expected_outcome?: JsonValue; readonly input?: JsonValue; readonly expected_output?: JsonValue; readonly execution?: JsonValue; readonly evaluators?: JsonValue; + readonly assert?: JsonValue; + readonly assertions?: JsonValue; readonly rubrics?: JsonValue; }; @@ -167,7 +169,7 @@ export async function 
loadTestsFromJsonl( sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName; // Global defaults from sidecar - const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 'llm-grader'; + const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar'); const globalExecution = sidecar.execution; if (verbose) { @@ -197,7 +199,7 @@ export async function loadTestsFromJsonl( outcome = asString(testCaseConfig.expected_outcome); if (outcome) { logWarning( - `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + `Test '${asString(testCaseConfig.id) ?? 'unknown'}': 'expected_outcome' has been removed. Use 'assert' instead.`, ); } } @@ -207,12 +209,37 @@ export async function loadTestsFromJsonl( // Resolve expected_output with shorthand support const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? []; - // A test is complete when it has id, input, and at least one of: criteria, expected_output, or assertions + const hasExplicitCaseGraders = + testCaseConfig.assert !== undefined || + testCaseConfig.assertions !== undefined || + testCaseConfig.evaluators !== undefined || + testCaseConfig.rubrics !== undefined; + const executionObject = isJsonObject(testCaseConfig.execution) + ? testCaseConfig.execution + : undefined; + const hasExplicitRootGraders = + executionObject?.skip_defaults === true + ? false + : globalExecution?.assert !== undefined || + globalExecution?.assertions !== undefined || + globalExecution?.evaluators !== undefined; + const graderCase = + outcome && !hasExplicitCaseGraders && !hasExplicitRootGraders + ? ({ ...testCaseConfig, assert: [outcome] } satisfies RawJsonlEvalCase) + : testCaseConfig; + + // A test is complete when it has id, input, and at least one of: criteria, + // expected_output, or assertions. 
Legacy test-level criteria is desugared to a + // bare-string assert above so it uses the canonical g-eval path instead of the + // implicit default LLM grader. const hasEvaluationSpec = - !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== undefined; + !!outcome || + expectedMessages.length > 0 || + graderCase.assert !== undefined || + graderCase.assertions !== undefined; if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) { logError( - `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`, + `Skipping incomplete test at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`, ); continue; } @@ -276,12 +303,7 @@ export async function loadTestsFromJsonl( const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator; let evaluators: Awaited>; try { - evaluators = await parseGraders( - testCaseConfig, - mergedExecution, - searchRoots, - id ?? 'unknown', - ); + evaluators = await parseGraders(graderCase, mergedExecution, searchRoots, id ?? 'unknown'); } catch (error) { // Skip entire test if evaluator validation fails const message = error instanceof Error ? 
error.message : String(error); diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 29518ee3a..947c41927 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2783,6 +2783,22 @@ async function runEvaluatorsForCase(options: { }); } + if (!evalCase.evaluator && (!evalCase.preprocessors || evalCase.preprocessors.length === 0)) { + return { + score: { + score: 1, + verdict: 'pass', + assertions: [ + { + text: 'No assertions declared; grading skipped', + passed: true, + }, + ], + expectedAspectCount: 1, + }, + }; + } + const evaluatorKind = evalCase.evaluator ?? 'llm-grader'; const activeEvaluator = evaluators[evaluatorKind] ?? evaluators['llm-grader']; if (!activeEvaluator) { diff --git a/packages/core/src/evaluation/registry/builtin-graders.ts b/packages/core/src/evaluation/registry/builtin-graders.ts index d3dbcf873..6bc4d691b 100644 --- a/packages/core/src/evaluation/registry/builtin-graders.ts +++ b/packages/core/src/evaluation/registry/builtin-graders.ts @@ -32,6 +32,13 @@ import { } from '../graders.js'; import { InlineAssertGrader } from '../graders/inline-assert.js'; import { containsTemplateVariables, resolveCustomPrompt } from '../graders/prompt-resolution.js'; +import { + AssertSetGrader, + JavascriptAssertionGrader, + PythonAssertionGrader, + SimilarAssertionGrader, + WebhookAssertionGrader, +} from '../graders/promptfoo-assertions.js'; import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryGraderConfig } from '../trace.js'; @@ -46,14 +53,20 @@ import type { EqualsGraderConfig, ExecutionMetricsGraderConfig, FieldAccuracyGraderConfig, + GEvalGraderConfig, GraderConfig, IcontainsAllGraderConfig, IcontainsAnyGraderConfig, IcontainsGraderConfig, IsJsonGraderConfig, LatencyGraderConfig, + LlmBackedGraderConfig, LlmGraderConfig, + LlmRubricGraderConfig, 
RegexGraderConfig, + ScriptAssertionGraderConfig, + ScriptGraderConfig, + SimilarGraderConfig, SkillTriggerGraderConfig, StartsWithGraderConfig, TokenUsageGraderConfig, @@ -79,7 +92,7 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); * - agentv provider: built-in AI SDK agent mode with filesystem tools */ export const llmGraderFactory: GraderFactoryFn = (config, context) => { - const c = config as LlmGraderConfig; + const c = config as LlmBackedGraderConfig; const { llmGrader, graderProvider, targetResolver, agentTimeoutMs } = context; let evaluator = llmGrader; @@ -111,7 +124,7 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => { } return { - kind: 'llm-grader', + kind: c.type, async evaluate(evalContext) { const customPrompt = await resolveCustomPrompt( c, @@ -146,6 +159,9 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => { let graderTemplateOverride: string | undefined; let evalCase = evalContext.evalCase; + if (c.type === 'llm-rubric' && c.value && !customPrompt) { + evalCase = { ...evalCase, criteria: c.value }; + } if (customPrompt) { if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) { graderTemplateOverride = customPrompt; @@ -165,9 +181,15 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => { }; }; -/** Factory for `code-grader` evaluators. */ +export const gEvalFactory: GraderFactoryFn = (config, context) => + llmGraderFactory(config as GEvalGraderConfig, context); + +export const llmRubricFactory: GraderFactoryFn = (config, context) => + llmGraderFactory(config as LlmRubricGraderConfig, context); + +/** Factory for subprocess-backed script evaluators. */ export const codeFactory: GraderFactoryFn = (config, context) => { - const c = config as CodeGraderConfig; + const c = config as ScriptGraderConfig | CodeGraderConfig; return new CodeGrader({ command: c.command, cwd: c.resolvedCwd ?? 
c.cwd, @@ -177,6 +199,24 @@ export const codeFactory: GraderFactoryFn = (config, context) => { }); }; +export const javascriptFactory: GraderFactoryFn = (config) => + new JavascriptAssertionGrader(config as ScriptAssertionGraderConfig); + +export const pythonFactory: GraderFactoryFn = (config, context) => + new PythonAssertionGrader(config as ScriptAssertionGraderConfig, context.agentTimeoutMs); + +export const webhookFactory: GraderFactoryFn = (config) => + new WebhookAssertionGrader(config as ScriptAssertionGraderConfig); + +export const similarFactory: GraderFactoryFn = (config) => + new SimilarAssertionGrader(config as SimilarGraderConfig); + +export const assertSetFactory: GraderFactoryFn = (config, context) => { + return new AssertSetGrader(config as import('../types.js').AssertSetGraderConfig, (child) => + context.registry.create(child, context), + ); +}; + /** Factory for `composite` evaluators. */ export const compositeFactory: GraderFactoryFn = (config, context) => { const c = config as CompositeGraderConfig; @@ -407,7 +447,10 @@ export function createBuiltinRegistry(): GraderRegistry { registry .register('llm-grader', llmGraderFactory) + .register('g-eval', gEvalFactory) + .register('llm-rubric', llmRubricFactory) .register('code-grader', codeFactory) + .register('script', codeFactory) .register('composite', compositeFactory) .register('tool-trajectory', toolTrajectoryFactory) .register('field-accuracy', fieldAccuracyFactory) @@ -416,6 +459,7 @@ export function createBuiltinRegistry(): GraderRegistry { .register('token-usage', tokenUsageFactory) .register('execution-metrics', executionMetricsFactory) .register('skill-trigger', skillTriggerFactory) + .register('assert-set', assertSetFactory) .register('contains', containsFactory) .register('contains-any', containsAnyFactory) .register('contains-all', containsAllFactory) @@ -427,6 +471,10 @@ export function createBuiltinRegistry(): GraderRegistry { .register('regex', regexFactory) .register('is-json', 
isJsonFactory) .register('equals', equalsFactory) + .register('javascript', javascriptFactory) + .register('python', pythonFactory) + .register('webhook', webhookFactory) + .register('similar', similarFactory) .register('inline-assert', (config) => { // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires any const fn = (config as any)[INLINE_ASSERT_FN] as diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 806d86146..f379a0c2d 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -165,6 +165,7 @@ export function isTestMessage(value: unknown): value is TestMessage { const GRADER_KIND_VALUES = [ 'code-grader', + 'script', 'llm-grader', 'rubric', 'composite', @@ -175,6 +176,9 @@ const GRADER_KIND_VALUES = [ 'token-usage', 'execution-metrics', 'skill-trigger', + 'assert-set', + 'g-eval', + 'llm-rubric', 'contains', 'contains-any', 'contains-all', @@ -186,6 +190,10 @@ const GRADER_KIND_VALUES = [ 'regex', 'is-json', 'equals', + 'javascript', + 'python', + 'webhook', + 'similar', 'rubrics', 'inline-assert', ] as const; @@ -387,9 +395,9 @@ export type WorkspaceConfig = { readonly env?: WorkspaceEnvConfig; }; -export type CodeGraderConfig = { +export type ScriptGraderConfig = { readonly name: string; - readonly type: 'code-grader'; + readonly type: 'script'; readonly command: readonly string[]; readonly resolvedScriptPath?: string; readonly cwd?: string; @@ -400,7 +408,7 @@ export type CodeGraderConfig = { readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; - /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */ + /** Pass-through configuration for the script (any unrecognized YAML properties) */ readonly config?: JsonObject; /** When present, enables target access via local proxy */ readonly target?: TargetAccessConfig; @@ -408,6 +416,11 @@ 
export type CodeGraderConfig = { readonly preprocessors?: readonly ContentPreprocessorConfig[]; }; +/** @deprecated Use ScriptGraderConfig with type: 'script'. */ +export type CodeGraderConfig = Omit & { + readonly type: 'code-grader'; +}; + /** * Executable prompt template configuration. * Matches code-grader pattern for consistency. @@ -457,6 +470,18 @@ export type LlmGraderConfig = { readonly preprocessors?: readonly ContentPreprocessorConfig[]; }; +export type GEvalGraderConfig = Omit & { + readonly type: 'g-eval'; +}; + +export type LlmRubricGraderConfig = Omit & { + readonly type: 'llm-rubric'; + /** Promptfoo-compatible free-form rubric text. */ + readonly value?: string; +}; + +export type LlmBackedGraderConfig = LlmGraderConfig | GEvalGraderConfig | LlmRubricGraderConfig; + /** * Score range definition for analytic rubric scoring. * Each range maps an integer score band (0-10) to an outcome description. @@ -510,6 +535,8 @@ export type RubricItem = { export type CompositeAggregatorConfig = | { readonly type: 'weighted_average'; readonly weights?: Record } + | { readonly type: 'script'; readonly path: string; readonly cwd?: string } + /** @deprecated Use the script aggregator type. 
*/ | { readonly type: 'code-grader'; readonly path: string; readonly cwd?: string } | { readonly type: 'llm-grader'; @@ -860,6 +887,42 @@ export type RubricsEvaluatorConfig = { readonly negate?: boolean; }; +export type ScriptAssertionGraderConfig = { + readonly name: string; + readonly type: 'javascript' | 'python' | 'webhook'; + readonly value: string; + readonly threshold?: number; + readonly weight?: number; + readonly required?: boolean; + readonly min_score?: number; + readonly negate?: boolean; + readonly config?: JsonObject; +}; + +export type SimilarGraderConfig = { + readonly name: string; + readonly type: 'similar'; + readonly value: string; + readonly threshold?: number; + readonly provider?: string | JsonObject; + readonly weight?: number; + readonly required?: boolean; + readonly min_score?: number; + readonly negate?: boolean; + readonly config?: JsonObject; +}; + +export type AssertSetGraderConfig = { + readonly name: string; + readonly type: 'assert-set'; + readonly assertions: readonly GraderConfig[]; + readonly threshold?: number; + readonly weight?: number; + readonly required?: boolean; + readonly min_score?: number; + readonly negate?: boolean; +}; + /** * Configuration for the skill-trigger evaluator. * Detects whether the agent invoked a named skill as its first tool call. @@ -895,8 +958,11 @@ export type InlineAssertEvaluatorConfig = { }; export type GraderConfig = ( + | ScriptGraderConfig | CodeGraderConfig | LlmGraderConfig + | GEvalGraderConfig + | LlmRubricGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig @@ -917,6 +983,9 @@ export type GraderConfig = ( | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig + | ScriptAssertionGraderConfig + | SimilarGraderConfig + | AssertSetGraderConfig | InlineAssertEvaluatorConfig ) & { /** Optional promptfoo-style named score key. Scoring aggregation support is layered separately. 
*/ @@ -935,7 +1004,11 @@ export interface EvalSourceReference { | 'input_file' | 'llm_grader_prompt' | 'prompt_script' + | 'script_grader_command' + | 'script_grader_cwd' + /** @deprecated New eval loads emit script_grader_command. */ | 'code_grader_command' + /** @deprecated New eval loads emit script_grader_cwd. */ | 'code_grader_cwd' | 'assertion_template' | 'preprocessor_command'; diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 021cece7d..b120a5745 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -106,7 +106,7 @@ const RubricCriterionSchema = z.union([z.string().min(1), RubricItemSchema]); // --- Type-specific evaluator schemas --- const CodeGraderSchema = EvaluatorCommonSchema.extend({ - type: z.enum(['code-grader', 'code_grader']), + type: z.literal('script'), command: z.union([z.string(), z.array(z.string())]), cwd: z.string().optional(), target: z.union([z.boolean(), z.object({ max_calls: z.number().optional() })]).optional(), @@ -143,7 +143,7 @@ const AggregatorSchema = z.discriminatedUnion('type', [ threshold: z.number().min(0).max(1), }), z.object({ - type: z.literal('code-grader'), + type: z.literal('script'), path: z.string(), cwd: z.string().optional(), }), @@ -158,6 +158,7 @@ const AggregatorSchema = z.discriminatedUnion('type', [ const CompositeSchema: z.ZodType = z.lazy(() => EvaluatorCommonSchema.extend({ type: z.literal('composite'), + assert: z.array(EvaluatorSchema).optional(), assertions: z.array(EvaluatorSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), aggregator: AggregatorSchema, @@ -264,8 +265,6 @@ const PromptfooAssertionSchema = EvaluatorCommonSchema.extend({ 'python', 'webhook', 'similar', - 'select-best', - 'human', 'contains', 'contains-any', 'contains-all', @@ -504,6 +503,7 @@ const ExecutionSchema = z.object({ target: 
z.string().optional(), targets: z.array(z.union([z.string(), EvalTargetRefSchema])).optional(), workers: z.never().optional(), + assert: z.array(AssertionItemSchema).optional(), assertions: z.array(AssertionItemSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), skip_defaults: z.boolean().optional(), @@ -585,7 +585,6 @@ const EvalTestSchema = z.object({ id: z.string().min(1).optional(), description: z.string().optional(), vars: JsonObjectSchema.optional(), - criteria: z.string().optional(), provider: EvalTargetSchema.optional(), providers: EvalTargetsSchema.optional(), prompts: PromptsSchema.optional(), diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 1c0cc3ef9..8de2978f0 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -232,7 +232,7 @@ const REMOVED_TEST_FIELDS = new Map([]); /** Deprecated test-level fields with migration hints. */ const DEPRECATED_TEST_FIELDS = new Map([ ['evaluator', "'evaluator' is deprecated. Use 'assertions' instead."], - ['expected_outcome', "'expected_outcome' is deprecated. Use 'criteria' instead."], + ['expected_outcome', "'expected_outcome' is deprecated. Use 'assert' instead."], ]); /** Name field pattern: lowercase alphanumeric with hyphens. 
*/ diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 12f25ed16..cf36040cf 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -204,6 +204,7 @@ type RawTestSuite = JsonObject & { readonly default_test?: JsonValue; readonly workspace?: JsonValue; readonly assertions?: JsonValue; + readonly assert?: JsonValue; readonly preprocessors?: JsonValue; readonly extensions?: JsonValue; readonly on_run_complete?: JsonValue; @@ -240,6 +241,7 @@ type RawEvalCase = JsonObject & { readonly run?: JsonValue; readonly evaluators?: JsonValue; readonly assertions?: JsonValue; + readonly assert?: JsonValue; readonly rubrics?: JsonValue; readonly workspace?: JsonValue; readonly metadata?: JsonValue; @@ -324,6 +326,9 @@ function interpolateRawEvalCase( ...(raw.expected_output !== undefined ? { expected_output: interpolateCaseField(raw.expected_output, vars, filters) } : {}), + ...(raw.assert !== undefined + ? { assert: interpolateCaseField(raw.assert, vars, filters) } + : {}), ...(raw.assertions !== undefined ? { assertions: interpolateCaseField(raw.assertions, vars, filters) } : {}), @@ -624,7 +629,7 @@ async function loadTestsFromParsedYamlValue( const suiteMetadataPayload = extractSuiteMetadataPayload(suite); const evalFileDir = path.dirname(absoluteTestPath); - const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 'llm-grader'; + const globalEvaluator = coerceEvaluator(suite.evaluator, 'global'); const suitePreprocessors = await parsePreprocessors( suite.preprocessors, searchRoots, @@ -679,9 +684,9 @@ async function loadTestsFromParsedYamlValue( readSuiteRuntimeBlock(suite, evalFilePath); // Build global execution context, including suite-level assertions (which is a sibling of execution) - const suiteAssertions = suite.assertions; + const suiteAssertions = suite.assert ?? 
suite.assertions; const globalExecution: JsonObject | undefined = - suiteAssertions !== undefined ? { assertions: suiteAssertions } : undefined; + suiteAssertions !== undefined ? { assert: suiteAssertions } : undefined; const results: EvalTest[] = []; @@ -712,7 +717,7 @@ async function loadTestsFromParsedYamlValue( outcome = asString(renderedCase.expected_outcome); if (outcome) { logWarning( - `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' is deprecated. Use 'criteria' instead.`, + `Test '${asString(renderedCase.id) ?? 'unknown'}': 'expected_outcome' has been removed. Use 'assert' instead.`, ); } } @@ -786,23 +791,42 @@ async function loadTestsFromParsedYamlValue( : undefined; const effectiveSuiteInputMessages = expandInputShorthand(effectiveSuiteInputValue); - // A test is complete when it has id, input, and at least one of: criteria, expected_output, assertions, or turns (conversation mode) + const hasExplicitCaseGraders = + renderedCase.assert !== undefined || + renderedCase.assertions !== undefined || + renderedCase.evaluators !== undefined || + renderedCase.rubrics !== undefined; + const hasExplicitRootGraders = + skipDefaults === true + ? false + : globalExecution?.assert !== undefined || + globalExecution?.assertions !== undefined || + globalExecution?.evaluators !== undefined; + const graderCase = + outcome && !hasExplicitCaseGraders && !hasExplicitRootGraders + ? ({ ...renderedCase, assert: [outcome] } satisfies RawEvalCase) + : renderedCase; + + // A test is complete when it has id, input, and at least one of: criteria, + // expected_output, assertions, or turns (conversation mode). Legacy test-level + // criteria is desugared to a bare-string assert above so it uses the canonical + // g-eval path instead of the implicit default LLM grader. 
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || - renderedCase.assertions !== undefined || + graderCase.assertions !== undefined || + graderCase.assert !== undefined || (Array.isArray(renderedCase.turns) && renderedCase.turns.length > 0); const hasInputMessages = testInputMessages.length > 0 || (effectiveSuiteInputMessages !== undefined && effectiveSuiteInputMessages.length > 0); if (!id || !hasEvaluationSpec || !hasInputMessages) { logError( - `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assertions/turns`, + `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input or PROMPT.md, and at least one of criteria/expected_output/assert/turns`, ); continue; } - // Prepend suite-level input to test input (respecting skip_defaults) // expected_output is optional - for outcome-only evaluation const hasExpectedMessages = expectedMessages.length > 0; @@ -870,7 +894,7 @@ async function loadTestsFromParsedYamlValue( let evaluators: Awaited>; try { evaluators = await parseGraders( - renderedCase, + graderCase, globalExecution, searchRoots, id ?? 'unknown', @@ -884,7 +908,7 @@ async function loadTestsFromParsedYamlValue( } const assertionTemplateReferences = await collectAssertionTemplateSourceReferences( - renderedCase, + graderCase, globalExecution, searchRoots, id ?? 'unknown', @@ -1891,10 +1915,10 @@ function collectSingleGraderSourceReferences( ): readonly EvalSourceReference[] { const references: EvalSourceReference[] = []; - if (evaluator.type === 'code-grader') { + if (evaluator.type === 'script' || evaluator.type === 'code-grader') { const command = evaluator.command ?? []; references.push({ - kind: 'code_grader_command', + kind: 'script_grader_command', displayPath: evaluator.resolvedScriptPath ?? command.join(' '), ...(evaluator.resolvedScriptPath ? 
{ resolvedPath: evaluator.resolvedScriptPath } : {}), graderName: evaluator.name, @@ -1902,7 +1926,7 @@ function collectSingleGraderSourceReferences( }); if (evaluator.resolvedCwd) { references.push({ - kind: 'code_grader_cwd', + kind: 'script_grader_cwd', displayPath: evaluator.cwd ?? evaluator.resolvedCwd, resolvedPath: evaluator.resolvedCwd, graderName: evaluator.name, @@ -1948,9 +1972,9 @@ function collectSingleGraderSourceReferences( for (const member of evaluator.assertions) { references.push(...collectSingleGraderSourceReferences(member)); } - if (evaluator.aggregator.type === 'code-grader') { + if (evaluator.aggregator.type === 'script' || evaluator.aggregator.type === 'code-grader') { references.push({ - kind: 'code_grader_command', + kind: 'script_grader_command', displayPath: evaluator.aggregator.path, resolvedPath: path.resolve(evaluator.aggregator.cwd ?? '', evaluator.aggregator.path), graderName: evaluator.name, diff --git a/packages/core/test/evaluation/code-grader-file-backed.test.ts b/packages/core/test/evaluation/code-grader-file-backed.test.ts index 459f60118..2106e9f05 100644 --- a/packages/core/test/evaluation/code-grader-file-backed.test.ts +++ b/packages/core/test/evaluation/code-grader-file-backed.test.ts @@ -49,6 +49,30 @@ async function createScoringGrader(dir: string): Promise { return [process.execPath, script]; } +async function createPayloadShapeGrader(dir: string): Promise { + const script = join(dir, 'payload-shape-grader.js'); + await writeFile( + script, + `const input = require('fs').readFileSync(0, 'utf8'); +const payload = JSON.parse(input); +console.log(JSON.stringify({ + score: payload.expected_output?.[0]?.content?.answer === 'Paris' && + payload.config?.mode === 'strict' && + payload.input?.[0]?.content === 'Test input' ? 
1 : 0, + assertions: [{ + text: 'structured stdin preserved', + passed: payload.expected_output?.[0]?.content?.answer === 'Paris' && + payload.config?.mode === 'strict' && + payload.input?.[0]?.content === 'Test input' + }], + details: { expected_output: payload.expected_output, config: payload.config, input: payload.input } +})); +`, + 'utf8', + ); + return [process.execPath, script]; +} + describe('CodeGrader file-backed output', () => { let tmpDir: string; @@ -113,4 +137,24 @@ describe('CodeGrader file-backed output', () => { // We can't inspect the payload directly, but the grader script should run without error expect(result.score).toBeGreaterThanOrEqual(0); }); + + it('preserves structured expected_output, input, and config in stdin', async () => { + const command = await createPayloadShapeGrader(tmpDir); + + const evaluator = new CodeGrader({ command, config: { mode: 'strict' } }); + const result = await evaluator.evaluate({ + evalCase: { + ...baseTestCase, + expected_output: [{ role: 'assistant', content: { answer: 'Paris' } }], + }, + candidate: 'answer', + output: [{ role: 'assistant' as const, content: 'answer' }], + }); + + expect(result.score).toBe(1); + expect(result.assertions).toEqual([{ text: 'structured stdin preserved', passed: true }]); + expect(result.details?.expected_output).toEqual([ + { role: 'assistant', content: { answer: 'Paris' } }, + ]); + }); }); diff --git a/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts b/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts new file mode 100644 index 000000000..e745d250c --- /dev/null +++ b/packages/core/test/evaluation/graders/promptfoo-assertions.test.ts @@ -0,0 +1,253 @@ +import { afterEach, describe, expect, it } from 'bun:test'; +import { type Server, createServer } from 'node:http'; + +import type { EvaluationContext } from '../../../src/evaluation/graders/types.js'; +import { createBuiltinRegistry } from '../../../src/evaluation/registry/builtin-graders.js'; 
+import type { GraderConfig } from '../../../src/evaluation/types.js'; + +const baseContext: EvaluationContext = { + evalCase: { + id: 'case-1', + question: 'Question', + input: [{ role: 'user', content: 'Question' }], + expected_output: [{ role: 'assistant', content: { answer: 'Paris' } }], + reference_answer: 'Paris', + file_paths: [], + criteria: 'Answer correctly', + }, + candidate: 'Paris is the capital of France.', + target: { name: 'mock', kind: 'mock', config: {} }, + provider: { + id: 'mock', + kind: 'mock', + targetName: 'mock', + async invoke() { + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }, + attempt: 1, + promptInputs: { question: 'Question' }, + now: new Date('2026-07-02T00:00:00Z'), +}; + +async function run(config: GraderConfig) { + const registry = createBuiltinRegistry(); + const grader = await registry.create(config, { + llmGrader: { + kind: 'llm-grader', + evaluate() { + throw new Error('not used'); + }, + }, + registry, + }); + return grader.evaluate(baseContext); +} + +describe('promptfoo-compatible built-in assertions', () => { + let server: Server | undefined; + + afterEach(async () => { + if (server) { + await new Promise((resolve) => server?.close(() => resolve())); + server = undefined; + } + }); + + it('runs javascript assertions in-process', async () => { + const result = await run({ + name: 'js', + type: 'javascript', + value: "output.includes('Paris') && context.expectedOutput[0].content.answer === 'Paris'", + }); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('uses assertion thresholds for numeric script results', async () => { + const result = await run({ + name: 'js-threshold', + type: 'javascript', + value: '0.5', + threshold: 0.75, + }); + + expect(result.score).toBe(0.5); + expect(result.verdict).toBe('fail'); + expect(result.assertions[0].passed).toBe(false); + }); + + it('fails numeric javascript score zero when no threshold is set', async () => { + const result = 
await run({ + name: 'js-zero', + type: 'javascript', + value: '0', + }); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + expect(result.assertions[0].passed).toBe(false); + }); + + it('honors explicit threshold zero for numeric javascript results', async () => { + const result = await run({ + name: 'js-zero-threshold', + type: 'javascript', + value: '0', + threshold: 0, + }); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('pass'); + expect(result.assertions[0].passed).toBe(true); + }); + + it('runs python assertions in a subprocess', async () => { + const result = await run({ + name: 'py', + type: 'python', + value: "'Paris' in output and context['expected_output'][0]['content']['answer'] == 'Paris'", + }); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails object python score zero when no pass flag or threshold is set', async () => { + const result = await run({ + name: 'py-zero', + type: 'python', + value: "{'score': 0, 'reason': 'zero score'}", + }); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + expect(result.assertions[0]).toEqual({ text: 'zero score', passed: false }); + }); + + it('runs webhook assertions against an HTTP endpoint', async () => { + const url = await new Promise((resolve) => { + server = createServer((req, res) => { + let body = ''; + req.on('data', (chunk) => { + body += chunk; + }); + req.on('end', () => { + const payload = JSON.parse(body) as { output: string }; + res.setHeader('content-type', 'application/json'); + res.end( + JSON.stringify({ + score: payload.output.includes('Paris') ? 
1 : 0, + assertions: [{ text: 'saw output', passed: payload.output.includes('Paris') }], + }), + ); + }); + }).listen(0, () => { + const address = server?.address(); + if (address && typeof address === 'object') { + resolve(`http://127.0.0.1:${address.port}`); + } + }); + }); + + const result = await run({ name: 'webhook', type: 'webhook', value: url }); + expect(result.score).toBe(1); + expect(result.assertions[0].text).toBe('saw output'); + }); + + it('fails webhook score zero when no pass flag or threshold is set', async () => { + const url = await new Promise((resolve) => { + server = createServer((req, res) => { + req.resume(); + req.on('end', () => { + res.setHeader('content-type', 'application/json'); + res.end(JSON.stringify({ score: 0, reason: 'zero score' })); + }); + }).listen(0, () => { + const address = server?.address(); + if (address && typeof address === 'object') { + resolve(`http://127.0.0.1:${address.port}`); + } + }); + }); + + const result = await run({ name: 'webhook-zero', type: 'webhook', value: url }); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + expect(result.assertions[0]).toEqual({ text: 'zero score', passed: false }); + }); + + it('aggregates nested assert-set children', async () => { + const result = await run({ + name: 'set', + type: 'assert-set', + threshold: 0.5, + assertions: [ + { name: 'contains', type: 'contains', value: 'Paris' }, + { name: 'starts', type: 'starts-with', value: 'Paris' }, + ], + }); + + expect(result.score).toBe(1); + expect(result.scores?.map((score) => score.type)).toEqual(['contains', 'starts-with']); + }); + + it('does not count zero-score script children as passing in composite thresholds', async () => { + const result = await run({ + name: 'gate', + type: 'composite', + assertions: [ + { name: 'js-zero', type: 'javascript', value: '0' }, + { name: 'contains', type: 'contains', value: 'Paris' }, + ], + aggregator: { type: 'threshold', threshold: 1 }, + }); + + 
expect(result.score).toBe(0.5); + expect(result.verdict).toBe('fail'); + expect(result.assertions[0]).toEqual({ + text: '1/2 evaluators passed (threshold: 1)', + passed: false, + }); + expect(result.scores?.[0]).toMatchObject({ + name: 'js-zero', + type: 'javascript', + score: 0, + verdict: 'fail', + }); + }); + + it('runs similar with an OpenAI-compatible embeddings provider', async () => { + const url = await new Promise((resolve) => { + server = createServer((req, res) => { + req.resume(); + req.on('end', () => { + res.setHeader('content-type', 'application/json'); + res.end( + JSON.stringify({ + data: [{ embedding: [1, 0, 0] }, { embedding: [1, 0, 0] }], + }), + ); + }); + }).listen(0, () => { + const address = server?.address(); + if (address && typeof address === 'object') { + resolve(`http://127.0.0.1:${address.port}`); + } + }); + }); + + const result = await run({ + name: 'similar', + type: 'similar', + value: 'Paris is the capital of France.', + threshold: 0.9, + config: { embedding_provider: { base_url: url, model: 'test-embedding' } }, + }); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); +}); diff --git a/packages/core/test/evaluation/loaders/case-file-loader.test.ts b/packages/core/test/evaluation/loaders/case-file-loader.test.ts index ddca6666e..828f558f1 100644 --- a/packages/core/test/evaluation/loaders/case-file-loader.test.ts +++ b/packages/core/test/evaluation/loaders/case-file-loader.test.ts @@ -136,15 +136,11 @@ describe('resolveFileReference', () => { expect(cases[0].assertions).toEqual([ { type: 'latency', threshold: 1000 }, { type: 'cost', budget: 0.01 }, - { type: 'code-grader', command: ['uv', 'run', 'python', graderPath] }, + { type: 'script', command: ['uv', 'run', 'python', graderPath] }, ]); const evaluators = await parseGraders(cases[0], undefined, [tempDir], 'csv-assertions'); - expect(evaluators.map((evaluator) => evaluator.type)).toEqual([ - 'latency', - 'cost', - 'code-grader', - ]); + 
expect(evaluators.map((evaluator) => evaluator.type)).toEqual(['latency', 'cost', 'script']); }); it('rejects unsupported promptfoo expected DSL forms clearly', async () => { diff --git a/packages/core/test/evaluation/loaders/grader-parser.test.ts b/packages/core/test/evaluation/loaders/grader-parser.test.ts index 6b8e2e9e2..a05f4bb25 100644 --- a/packages/core/test/evaluation/loaders/grader-parser.test.ts +++ b/packages/core/test/evaluation/loaders/grader-parser.test.ts @@ -10,6 +10,7 @@ import type { CompositeGraderConfig, ContainsGraderConfig, EqualsGraderConfig, + GEvalGraderConfig, IsJsonGraderConfig, LatencyGraderConfig, LlmGraderConfig, @@ -211,7 +212,7 @@ describe('parseGraders - deterministic assertion types', () => { expect(evaluators).toBeUndefined(); }); - it('parses type: rubrics with criteria as llm-grader', async () => { + it('parses type: rubrics with criteria as g-eval', async () => { const evaluators = await parseGraders( { evaluators: [ @@ -227,8 +228,8 @@ describe('parseGraders - deterministic assertion types', () => { 'test-1', ); expect(evaluators).toHaveLength(1); - expect(evaluators?.[0].type).toBe('llm-grader'); - expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(1); + expect(evaluators?.[0].type).toBe('g-eval'); + expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(1); }); it('parses multiple assertion types in one evaluators array', async () => { @@ -251,6 +252,75 @@ describe('parseGraders - deterministic assertion types', () => { expect(evaluators?.[2].type).toBe('is-json'); expect(evaluators?.[3].type).toBe('equals'); }); + + it('parses explicit g-eval criteria with score ranges', async () => { + const evaluators = await parseGraders( + { + assert: [ + { + name: 'quality', + type: 'g-eval', + rubric_item: { + id: 'quality', + outcome: 'Answer quality', + min_score: 0.8, + score_ranges: [ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], 
outcome: 'Strong' }, + ], + }, + }, + ], + }, + undefined, + [tempDir], + 'test-1', + ); + + const config = evaluators?.[0] as GEvalGraderConfig; + expect(config.type).toBe('g-eval'); + expect(config.rubrics?.[0]).toMatchObject({ + id: 'quality', + outcome: 'Answer quality', + min_score: 0.8, + score_ranges: [ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], outcome: 'Strong' }, + ], + }); + }); + + it('parses llm-rubric as free-form rubric text', async () => { + const evaluators = await parseGraders( + { + assert: [{ name: 'freeform', type: 'llm-rubric', value: 'Judge whether it is helpful' }], + }, + undefined, + [tempDir], + 'test-1', + ); + + expect(evaluators?.[0]).toMatchObject({ + name: 'freeform', + type: 'llm-rubric', + value: 'Judge whether it is helpful', + }); + }); + + it('rejects known unimplemented promptfoo assertion types', async () => { + await expect( + parseGraders( + { + assert: [{ name: 'bleu', type: 'bleu', value: 'reference' }], + }, + undefined, + [tempDir], + 'test-1', + ), + ).rejects.toThrow("Unsupported promptfoo assertion type 'bleu'"); + }); }); describe('parseGraders - tool-trajectory', () => { @@ -447,7 +517,7 @@ describe('parseGraders - tool-trajectory', () => { }); }); -describe('parseGraders - code-grader config pass-through', () => { +describe('parseGraders - script config pass-through', () => { let tempDir: string; beforeAll(async () => { @@ -466,7 +536,7 @@ describe('parseGraders - code-grader config pass-through', () => { evaluators: [ { name: 'fuzzy-matcher', - type: 'code-grader', + type: 'script', command: ['bun', 'run', './test_script.ts'], fields: [ { path: 'supplier.name', threshold: 0.85 }, @@ -482,7 +552,7 @@ describe('parseGraders - code-grader config pass-through', () => { expect(evaluators).toHaveLength(1); const config = evaluators?.[0] as CodeGraderConfig; - expect(config.type).toBe('code-grader'); + expect(config.type).toBe('script'); 
expect(config.name).toBe('fuzzy-matcher'); expect(config.config).toEqual({ fields: [ @@ -499,7 +569,7 @@ describe('parseGraders - code-grader config pass-through', () => { evaluators: [ { name: 'simple-grader', - type: 'code-grader', + type: 'script', command: ['bun', 'run', './test_script.ts'], }, ], @@ -509,7 +579,7 @@ describe('parseGraders - code-grader config pass-through', () => { expect(evaluators).toHaveLength(1); const config = evaluators?.[0] as CodeGraderConfig; - expect(config.type).toBe('code-grader'); + expect(config.type).toBe('script'); expect(config.config).toBeUndefined(); }); @@ -518,7 +588,7 @@ describe('parseGraders - code-grader config pass-through', () => { evaluators: [ { name: 'with-weight', - type: 'code-grader', + type: 'script', command: ['bun', 'run', './test_script.ts'], cwd: tempDir, weight: 2.0, @@ -550,7 +620,7 @@ describe('parseGraders - code-grader config pass-through', () => { evaluators: [ { name: 'shell-command', - type: 'code-grader', + type: 'script', command: './test_script.ts', }, ], @@ -574,7 +644,7 @@ describe('parseGraders - code-grader config pass-through', () => { evaluators: [ { name: 'legacy-script', - type: 'code-grader', + type: 'script', script: './test_script.ts', }, ], @@ -583,7 +653,7 @@ describe('parseGraders - code-grader config pass-through', () => { [tempDir], 'test-case', ), - ).rejects.toThrow(/'script' has been removed.*command/); + ).rejects.toThrow(/'script' field has been removed.*command/); }); }); @@ -609,7 +679,7 @@ describe('parseGraders - kebab-case type normalization', () => { expect((evaluators?.[0] as LlmGraderConfig).target).toBe('grader-low-cost-a'); }); - it('accepts code-grader kebab-case as canonical form', async () => { + it('normalizes legacy code-grader to script', async () => { const rawEvalCase = { evaluators: [ { @@ -623,7 +693,29 @@ describe('parseGraders - kebab-case type normalization', () => { const evaluators = await parseGraders(rawEvalCase, undefined, [tempDir], 'test-case'); 
expect(evaluators).toHaveLength(1); - expect(evaluators?.[0].type).toBe('code-grader'); + expect(evaluators?.[0].type).toBe('script'); + }); + + it('accepts script as the subprocess grader type', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'subprocess-check', + type: 'script', + command: ['bun', 'run', './test_script.ts'], + }, + ], + }; + + const evaluators = await parseGraders(rawEvalCase, undefined, [tempDir], 'test-case'); + + expect(evaluators).toHaveLength(1); + expect(evaluators?.[0].type).toBe('script'); + expect((evaluators?.[0] as CodeGraderConfig).command).toEqual([ + 'bun', + 'run', + './test_script.ts', + ]); }); it('accepts is-json kebab-case as canonical form', async () => { @@ -1320,17 +1412,17 @@ describe('parseGraders - assertions field', () => { expect(evaluators?.[0].type).toBe('contains'); }); - it('ignores the removed assertion field as evaluator input', async () => { - const removedKey = ['ass', 'ert'].join(''); + it('parses canonical assert field as evaluators', async () => { const evaluators = await parseGraders( { - [removedKey]: [{ type: 'contains', value: 'DENIED' }], + assert: [{ type: 'contains', value: 'DENIED' }], }, undefined, [tempDir], 'test-1', ); - expect(evaluators).toBeUndefined(); + expect(evaluators).toHaveLength(1); + expect(evaluators?.[0].type).toBe('contains'); }); it('assertions takes precedence over execution.evaluators', async () => { @@ -1349,6 +1441,23 @@ describe('parseGraders - assertions field', () => { expect(evaluators?.[0].type).toBe('contains'); }); + it('assert takes precedence over assertions and execution.evaluators', async () => { + const evaluators = await parseGraders( + { + assert: [{ type: 'contains', value: 'CANONICAL' }], + assertions: [{ type: 'contains', value: 'LEGACY' }], + execution: { + evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }], + }, + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + 
expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'CANONICAL' }); + }); + it('assertions takes precedence over top-level evaluators', async () => { const evaluators = await parseGraders( { @@ -1406,6 +1515,22 @@ describe('parseGraders - assertions field', () => { expect(evaluators?.[0].type).toBe('latency'); }); + it('falls back to execution.assert when case-level assertions are not present', async () => { + const evaluators = await parseGraders( + { + execution: { + assert: [{ type: 'contains', value: 'EXEC' }], + evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }], + }, + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'EXEC' }); + }); + it('suite-level assertions takes precedence over suite-level execution.evaluators', async () => { const evaluators = await parseGraders( {}, @@ -1432,6 +1557,20 @@ describe('parseGraders - assertions field', () => { expect(evaluators).toHaveLength(1); expect(evaluators?.[0].type).toBe('latency'); }); + + it('suite-level assert takes precedence over suite-level assertions', async () => { + const evaluators = await parseGraders( + {}, + { + assert: [{ type: 'contains', value: 'CANONICAL' }], + assertions: [{ type: 'contains', value: 'LEGACY' }], + }, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + expect(evaluators?.[0]).toMatchObject({ type: 'contains', value: 'CANONICAL' }); + }); }); describe('parseGraders - assertion templates', () => { @@ -1660,9 +1799,9 @@ describe('parseGraders - type: rubrics with criteria', () => { 'test-1', ); expect(evaluators).toHaveLength(1); - expect(evaluators?.[0].type).toBe('llm-grader'); - expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(2); - expect((evaluators?.[0] as LlmGraderConfig).weight).toBe(4.0); + expect(evaluators?.[0].type).toBe('g-eval'); + expect((evaluators?.[0] as 
GEvalGraderConfig).rubrics).toHaveLength(2); + expect((evaluators?.[0] as GEvalGraderConfig).weight).toBe(4.0); }); it('preserves optional rubric criterion operators', async () => { @@ -1831,9 +1970,9 @@ describe('parseGraders - type: rubrics with criteria', () => { ); expect(evaluators).toHaveLength(1); - const config = evaluators?.[0] as LlmGraderConfig; + const config = evaluators?.[0] as GEvalGraderConfig; expect(config.name).toBe('rubrics'); - expect(config.type).toBe('llm-grader'); + expect(config.type).toBe('g-eval'); expect(config.rubrics?.[0]?.min_score).toBe(0.8); expect(config.rubrics?.[0]?.score_ranges).toEqual([ { score_range: [0, 4], outcome: 'Weak' }, @@ -1934,13 +2073,13 @@ describe('parseGraders - required field', () => { expect(config.required).toBe(true); }); - it('parses required on code-grader evaluator', async () => { + it('parses required on script evaluator', async () => { const evaluators = await parseGraders( { evaluators: [ { name: 'code-check', - type: 'code-grader', + type: 'script', command: ['bun', 'run', './test_script.ts'], required: true, }, @@ -2059,6 +2198,31 @@ describe('parseGraders - composite assertions field', () => { expect(evaluators?.[0].type).toBe('composite'); }); + it('parses composite with canonical assert field', async () => { + const evaluators = await parseGraders( + { + assert: [ + { + name: 'combined', + type: 'composite', + assert: [ + { name: 'safety', type: 'llm-grader', prompt: './safety.md' }, + { name: 'quality', type: 'llm-grader', prompt: './quality.md' }, + ], + aggregator: { type: 'weighted_average' }, + }, + ], + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + const composite = evaluators?.[0] as CompositeGraderConfig; + expect(composite.type).toBe('composite'); + expect(composite.assertions).toHaveLength(2); + }); + it('composite still works with evaluators field (backward compat)', async () => { const evaluators = await parseGraders( { @@ -2105,6 +2269,30 @@ 
describe('parseGraders - composite assertions field', () => { expect(composite.assertions).toHaveLength(1); expect(composite.assertions[0].name).toBe('safety'); }); + + it('composite assert takes precedence over assertions and evaluators', async () => { + const evaluators = await parseGraders( + { + assert: [ + { + name: 'combined', + type: 'composite', + assert: [{ name: 'safety', type: 'llm-grader', prompt: './safety.md' }], + assertions: [{ name: 'legacy', type: 'llm-grader', prompt: './quality.md' }], + evaluators: [{ name: 'quality', type: 'llm-grader', prompt: './quality.md' }], + aggregator: { type: 'weighted_average' }, + }, + ], + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + const composite = evaluators?.[0] as CompositeGraderConfig; + expect(composite.assertions).toHaveLength(1); + expect(composite.assertions[0].name).toBe('safety'); + }); }); describe('parseGraders - string shorthand in assertions', () => { @@ -2124,13 +2312,13 @@ describe('parseGraders - string shorthand in assertions', () => { expect(evaluators).toHaveLength(1); const rubrics = evaluators?.[0]; - expect(rubrics?.type).toBe('llm-grader'); - expect((rubrics as LlmGraderConfig).rubrics).toHaveLength(3); - expect((rubrics as LlmGraderConfig).rubrics?.[0].outcome).toBe( + expect(rubrics?.type).toBe('g-eval'); + expect((rubrics as GEvalGraderConfig).rubrics).toHaveLength(3); + expect((rubrics as GEvalGraderConfig).rubrics?.[0].outcome).toBe( 'Mentions divide-and-conquer approach', ); - expect((rubrics as LlmGraderConfig).rubrics?.[1].outcome).toBe('Explains partition step'); - expect((rubrics as LlmGraderConfig).rubrics?.[2].outcome).toBe('States time complexity'); + expect((rubrics as GEvalGraderConfig).rubrics?.[1].outcome).toBe('Explains partition step'); + expect((rubrics as GEvalGraderConfig).rubrics?.[2].outcome).toBe('States time complexity'); }); it('groups strings into rubrics and preserves object evaluators', async () => { @@ -2149,9 +2337,9 
@@ describe('parseGraders - string shorthand in assertions', () => { expect(evaluators).toHaveLength(2); // First: rubrics (at position of first string) - expect(evaluators?.[0].type).toBe('llm-grader'); - expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(2); - expect((evaluators?.[0] as LlmGraderConfig).rubrics?.[0].outcome).toBe( + expect(evaluators?.[0].type).toBe('g-eval'); + expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(2); + expect((evaluators?.[0] as GEvalGraderConfig).rubrics?.[0].outcome).toBe( 'Mentions divide-and-conquer approach', ); // Second: the contains evaluator @@ -2170,9 +2358,9 @@ describe('parseGraders - string shorthand in assertions', () => { ); expect(evaluators).toHaveLength(1); - expect(evaluators?.[0].type).toBe('llm-grader'); - expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(1); - expect((evaluators?.[0] as LlmGraderConfig).rubrics?.[0].outcome).toBe( + expect(evaluators?.[0].type).toBe('g-eval'); + expect((evaluators?.[0] as GEvalGraderConfig).rubrics).toHaveLength(1); + expect((evaluators?.[0] as GEvalGraderConfig).rubrics?.[0].outcome).toBe( 'Response must be polite', ); }); @@ -2208,8 +2396,8 @@ describe('parseGraders - string shorthand in assertions', () => { ); expect(evaluators).toHaveLength(2); - const rubrics = evaluators?.[0] as LlmGraderConfig; - expect(rubrics.type).toBe('llm-grader'); + const rubrics = evaluators?.[0] as GEvalGraderConfig; + expect(rubrics.type).toBe('g-eval'); expect(rubrics.rubrics).toHaveLength(3); expect(rubrics.weight).toBe(3); expect(evaluators?.[1].type).toBe('contains'); diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts index e3f377943..2f0f3b5ae 100644 --- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts +++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts @@ -69,6 +69,23 @@ describe('loadTestsFromJsonl', () => { 
expect(cases[0].input).toHaveLength(1); expect(cases[0].input[0].role).toBe('user'); expect(cases[0].input[0].content).toBe('Query'); + expect(cases[0].assertions?.[0]?.type).toBe('g-eval'); + expect(cases[0].assertions?.[0]?.rubrics?.[0]?.outcome).toBe('Goal'); + }); + + it('keeps expected_output-only JSONL cases passive without implicit assertions', async () => { + const jsonlPath = path.join(tempDir, 'expected-output-only.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "input": "Query", "expected_output": "Reference answer"}\n', + ); + + const cases = await loadTestsFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].criteria).toBe(''); + expect(cases[0].expected_output[0].content).toBe('Reference answer'); + expect(cases[0].assertions).toBeUndefined(); }); it('parses multi-line JSONL', async () => { @@ -225,7 +242,7 @@ describe('loadTestsFromJsonl', () => { expect(cases).toHaveLength(1); expect(cases[0].assertions).toHaveLength(1); - expect(cases[0].assertions?.[0].type).toBe('llm-grader'); + expect(cases[0].assertions?.[0].type).toBe('g-eval'); const rubricEvaluator = cases[0].assertions?.[0] as { type: string; rubrics?: unknown[] }; expect(rubricEvaluator.rubrics).toHaveLength(2); }); @@ -259,11 +276,13 @@ describe('loadTestsFromJsonl', () => { expect(cases[0].assertions).toHaveLength(1); expect(cases[0].assertions?.[0]).toMatchObject({ name: 'rubrics', - type: 'llm-grader', + type: 'g-eval', rubrics: [ { id: 'quality', + outcome: 'Answer quality', min_score: 0.8, + weight: 1, score_ranges: [ { score_range: [0, 4], outcome: 'Weak' }, { score_range: [5, 7], outcome: 'Adequate' }, @@ -406,6 +425,27 @@ describe('loadTests with format detection', () => { expect(cases).toHaveLength(1); expect(cases[0].id).toBe('yaml-test'); + expect(cases[0].assertions?.[0]?.type).toBe('g-eval'); + expect(cases[0].assertions?.[0]?.rubrics?.[0]?.outcome).toBe('Goal'); + }); + + it('keeps expected_output-only YAML cases passive without 
implicit assertions', async () => { + const yamlPath = path.join(tempDir, 'expected-output-only.yaml'); + await writeFile( + yamlPath, + `tests: + - id: expected-only + input: Query + expected_output: Reference answer +`, + ); + + const cases = await loadTests(yamlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].criteria).toBe(''); + expect(cases[0].expected_output[0].content).toBe('Reference answer'); + expect(cases[0].assertions).toBeUndefined(); }); it('routes .yml to YAML parser', async () => { @@ -843,8 +883,8 @@ eval_cases: }); }); - describe('expected_outcome → criteria alias (YAML)', () => { - it('supports expected_outcome as deprecated alias for criteria', async () => { + describe('expected_outcome → assert compatibility (YAML)', () => { + it('supports expected_outcome as deprecated assertion shorthand', async () => { const yamlPath = path.join(tempDir, 'expected-outcome-alias.yaml'); await writeFile( yamlPath, @@ -885,8 +925,8 @@ eval_cases: }); }); - describe('expected_outcome → criteria alias (JSONL)', () => { - it('supports expected_outcome as deprecated alias for criteria', async () => { + describe('expected_outcome → assert compatibility (JSONL)', () => { + it('supports expected_outcome as deprecated assertion shorthand', async () => { const jsonlPath = path.join(tempDir, 'expected-outcome-alias.jsonl'); await writeFile( jsonlPath, diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4acf92dd1..385a965b4 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -2568,6 +2568,41 @@ describe('criteria with assertions runs only declared evaluators (#452)', () => expect(result.score).toBe(1); }); + it('does not run the default llm-grader for passive expected_output-only cases', async () => { + const provider = new SequenceProvider('mock', { + responses: [{ output: [{ role: 'assistant', content: 'hello 
world' }] }], + }); + const llmEvaluate = mock(() => { + throw new Error('default llm-grader should not run'); + }); + + const { evaluator: _evaluator, ...referenceOnlyCase } = criteriaTestCase; + const result = await runEvalCase({ + evalCase: { + ...referenceOnlyCase, + criteria: '', + expected_output: [{ role: 'assistant', content: 'hello world' }], + }, + provider, + target: { + ...baseTarget, + graderTarget: 'grader-target', + }, + evaluators: { + 'llm-grader': { + kind: 'llm-grader', + evaluate: llmEvaluate, + }, + }, + }); + + expect(llmEvaluate).not.toHaveBeenCalled(); + expect(result.score).toBe(1); + expect(result.assertions).toEqual([ + { text: 'No assertions declared; grading skipped', passed: true }, + ]); + }); + it('criteria is available as evalCase data for evaluators that consume it', async () => { const provider = new SequenceProvider('mock', { responses: [{ output: [{ role: 'assistant', content: 'hello world' }] }], diff --git a/packages/core/test/evaluation/rubric-operators-yaml.test.ts b/packages/core/test/evaluation/rubric-operators-yaml.test.ts index 44f36bbdf..1e03b2132 100644 --- a/packages/core/test/evaluation/rubric-operators-yaml.test.ts +++ b/packages/core/test/evaluation/rubric-operators-yaml.test.ts @@ -39,9 +39,9 @@ describe('rubric criterion operators', () => { const tests = await loadTests(path.join(dir, 'suite.eval.yaml'), dir); const evaluator = tests[0]?.assertions?.[0]; - expect(evaluator?.type).toBe('llm-grader'); - if (!evaluator || evaluator.type !== 'llm-grader') { - throw new Error('expected rubrics to normalize to llm-grader'); + expect(evaluator?.type).toBe('g-eval'); + if (!evaluator || evaluator.type !== 'g-eval') { + throw new Error('expected rubrics to normalize to g-eval'); } expect(evaluator.rubrics?.map((rubric) => rubric.operator)).toEqual([ diff --git a/packages/core/test/evaluation/source-traceability.test.ts b/packages/core/test/evaluation/source-traceability.test.ts index d47a4bd26..3e3a96722 100644 --- 
a/packages/core/test/evaluation/source-traceability.test.ts +++ b/packages/core/test/evaluation/source-traceability.test.ts @@ -85,12 +85,12 @@ tests: const kinds = source?.references.map((reference) => reference.kind).sort(); expect(kinds).toEqual([ 'assertion_template', - 'code_grader_command', - 'code_grader_cwd', 'input_file', 'llm_grader_prompt', 'preprocessor_command', 'prompt_script', + 'script_grader_command', + 'script_grader_cwd', ]); const promptFile = source?.references.find( @@ -100,7 +100,7 @@ tests: expect(promptFile?.resolvedPath).toBe(path.join(tempDir, 'graders', 'prompt.md')); const codeCommand = source?.references.find( - (reference) => reference.kind === 'code_grader_command', + (reference) => reference.kind === 'script_grader_command', ); expect(codeCommand?.command).toEqual(['bun', 'graders/code.ts']); expect(codeCommand?.resolvedPath).toBe(path.join(tempDir, 'graders', 'code.ts')); diff --git a/packages/core/test/evaluation/validation/eval-file-schema.test.ts b/packages/core/test/evaluation/validation/eval-file-schema.test.ts index 5f9a83b11..eee77a3d0 100644 --- a/packages/core/test/evaluation/validation/eval-file-schema.test.ts +++ b/packages/core/test/evaluation/validation/eval-file-schema.test.ts @@ -215,7 +215,15 @@ describe('EvalFileSchema input shorthand', () => { value: ['Identifies user impact', 'Avoids unsupported claims'], score_ranges: [{ score_range: [0, 10], outcome: 'overall quality' }], }, + { + type: 'composite', + assert: [{ type: 'contains', value: 'safe' }], + aggregator: { type: 'weighted_average' }, + }, ], + execution: { + assert: [{ type: 'contains', value: 'Looks' }], + }, }, ], scenarios: [ diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index 63e54f97f..dab22e157 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -2120,7 
+2120,7 @@ tests: warnings.some( (e) => e.message.includes("'expected_outcome' is deprecated") && - e.message.includes("'criteria'"), + e.message.includes("'assert'"), ), ).toBe(true); }); diff --git a/packages/sdk/src/assertion.ts b/packages/sdk/src/assertion.ts index 3e326a103..8eda2977b 100644 --- a/packages/sdk/src/assertion.ts +++ b/packages/sdk/src/assertion.ts @@ -38,7 +38,10 @@ export type AssertionContext = CodeGraderInput; export type AssertionType = // kebab-case (canonical internal form) | 'llm-grader' - | 'code-grader' + | 'g-eval' + | 'llm-rubric' + | 'script' + | 'assert-set' | 'rubrics' | 'composite' | 'tool-trajectory' @@ -59,6 +62,10 @@ export type AssertionType = | 'equals' | 'regex' | 'is-json' + | 'javascript' + | 'python' + | 'webhook' + | 'similar' // legacy snake_case aliases (still accepted) | 'llm_grader' | 'code_grader' diff --git a/packages/sdk/src/graders.ts b/packages/sdk/src/graders.ts index c5fc27083..9b0169e75 100644 --- a/packages/sdk/src/graders.ts +++ b/packages/sdk/src/graders.ts @@ -67,6 +67,18 @@ export interface RubricsGraderConfig extends EvalAssertionConfig, GraderCommonCo readonly criteria: readonly GraderRubricCriterion[]; } +export interface GEvalGraderConfig extends EvalAssertionConfig, GraderCommonConfig { + readonly type: 'g-eval'; + readonly criteria: readonly GraderRubricCriterion[]; + readonly target?: string; +} + +export interface LlmRubricGraderConfig extends EvalAssertionConfig, GraderCommonConfig { + readonly type: 'llm-rubric'; + readonly value: string; + readonly target?: string; +} + export interface GraderPromptScriptConfig { readonly command: readonly string[]; readonly config?: Readonly>; @@ -104,8 +116,8 @@ export interface CodeGraderOptions extends GraderHelperOptions { readonly preprocessors?: readonly EvalPreprocessor[]; } -export interface CodeGraderConfig extends EvalAssertionConfig, GraderCommonConfig { - readonly type: 'code-grader'; +export interface ScriptGraderConfig extends 
EvalAssertionConfig, GraderCommonConfig { + readonly type: 'script'; readonly command: GraderCommand; readonly cwd?: string; readonly target?: true | CodeGraderTargetOptions; @@ -113,14 +125,19 @@ export interface CodeGraderConfig extends EvalAssertionConfig, GraderCommonConfi readonly preprocessors?: readonly EvalPreprocessor[]; } +/** @deprecated Use ScriptGraderConfig with type: 'script'. */ +export type CodeGraderConfig = ScriptGraderConfig; + export type GraderHelperConfig = | ContainsGraderConfig | EqualsGraderConfig | RegexGraderConfig | IsJsonGraderConfig | RubricsGraderConfig + | GEvalGraderConfig + | LlmRubricGraderConfig | LlmGraderConfig - | CodeGraderConfig; + | ScriptGraderConfig; function withCommon( config: T, @@ -180,6 +197,34 @@ export function rubricsGrader( return withCommon({ type: 'rubrics', criteria }, options); } +export function gEvalGrader( + criteria: readonly GraderRubricCriterion[], + options: GraderHelperOptions & { readonly target?: string } = {}, +): GEvalGraderConfig { + return withCommon( + { + type: 'g-eval', + criteria, + ...(options.target !== undefined ? { target: options.target } : {}), + }, + options, + ); +} + +export function llmRubricGrader( + value: string, + options: GraderHelperOptions & { readonly target?: string } = {}, +): LlmRubricGraderConfig { + return withCommon( + { + type: 'llm-rubric', + value, + ...(options.target !== undefined ? 
{ target: options.target } : {}), + }, + options, + ); +} + export function llmGrader(options: LlmGraderOptions = {}): LlmGraderConfig { return withCommon( { @@ -199,10 +244,17 @@ export function llmGrader(options: LlmGraderOptions = {}): LlmGraderConfig { export function codeGrader( command: GraderCommand, options: CodeGraderOptions = {}, -): CodeGraderConfig { +): ScriptGraderConfig { + return scriptGrader(command, options); +} + +export function scriptGrader( + command: GraderCommand, + options: CodeGraderOptions = {}, +): ScriptGraderConfig { return withCommon( { - type: 'code-grader', + type: 'script', command, ...(options.cwd !== undefined ? { cwd: options.cwd } : {}), ...(options.target !== undefined ? { target: options.target } : {}), @@ -221,8 +273,12 @@ export const graders = Object.freeze({ isJson: isJsonGrader, json: jsonGrader, rubrics: rubricsGrader, + gEval: gEvalGrader, + llmRubric: llmRubricGrader, llmGrader, codeGrader, + script: scriptGrader, + scriptGrader, }); export type GraderCatalog = typeof graders; diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index c752a5ed7..7d4d0d992 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -160,17 +160,21 @@ export { containsGrader, equalsGrader, exactGrader, + gEvalGrader, graders, isJsonGrader, jsonGrader, llmGrader, + llmRubricGrader, regexGrader, rubricsGrader, + scriptGrader, type CodeGraderConfig, type CodeGraderOptions, type CodeGraderTargetOptions, type ContainsGraderConfig, type EqualsGraderConfig, + type GEvalGraderConfig, type GraderCatalog, type GraderCommand, type GraderCommonConfig, @@ -184,9 +188,11 @@ export { type IsJsonGraderConfig, type LlmGraderConfig, type LlmGraderOptions, + type LlmRubricGraderConfig, type RegexGraderConfig, type RegexGraderOptions, type RubricsGraderConfig, + type ScriptGraderConfig, } from './graders.js'; // Re-export target client diff --git a/packages/sdk/test/grader-helpers.test.ts 
b/packages/sdk/test/grader-helpers.test.ts index 791a1351c..655255d2a 100644 --- a/packages/sdk/test/grader-helpers.test.ts +++ b/packages/sdk/test/grader-helpers.test.ts @@ -12,6 +12,7 @@ import { llmGrader, regexGrader, rubricsGrader, + scriptGrader, serializeEvalYaml, toEvalYamlObject, } from '../src/index.js'; @@ -64,12 +65,16 @@ describe('grader helper config builders', () => { }), ).toEqual({ name: 'scripted-check', - type: 'code-grader', + type: 'script', command: ['bun', 'run', 'graders/check.ts'], cwd: 'graders', target: { maxCalls: 2 }, config: { mode: 'strict' }, }); + expect(scriptGrader(['bun', 'run', 'graders/check.ts'])).toEqual({ + type: 'script', + command: ['bun', 'run', 'graders/check.ts'], + }); }); it('composes inside defineEval and serializes to canonical AgentV YAML assertions', () => { @@ -112,7 +117,7 @@ describe('grader helper config builders', () => { }, ], }), - graders.codeGrader(['bun', 'run', 'graders/check.ts'], { + graders.script(['bun', 'run', 'graders/check.ts'], { name: 'scripted-check', target: { maxCalls: 2 }, minScore: 0.5, @@ -163,7 +168,7 @@ describe('grader helper config builders', () => { }, { name: 'scripted-check', - type: 'code-grader', + type: 'script', command: ['bun', 'run', 'graders/check.ts'], target: { max_calls: 2 }, min_score: 0.5, @@ -174,7 +179,7 @@ describe('grader helper config builders', () => { expect(yaml).toContain('assertions:'); expect(yaml).toContain('type: llm-grader'); - expect(yaml).toContain('type: code-grader'); + expect(yaml).toContain('type: script'); expect(yaml).toContain('max_steps: 2'); expect(yaml).toContain('max_calls: 2'); expect(yaml).toContain('min_score: 0.8'); diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md index 7449431d2..f725beda0 100644 --- a/skills-data/agentv-eval-writer/SKILL.md +++ b/skills-data/agentv-eval-writer/SKILL.md @@ -3,7 +3,7 @@ name: agentv-eval-writer description: >- Write, edit, review, and validate AgentV EVAL.yaml 
/ .eval.yaml evaluation files. Use when asked to create new eval files, update or fix existing ones, add or remove test cases, - configure graders (`llm-grader`, `code-grader`, `rubrics`), review whether an eval is correct or complete, + configure graders (`g-eval`, `llm-rubric`, `llm-grader`, `script`), review whether an eval is correct or complete, convert between EVAL.yaml and evals.json using `agentv convert`, or generate eval test cases from chat transcripts (markdown conversation or JSON messages). Do NOT use for creating SKILL.md files, writing skill definitions, or running evals — @@ -37,8 +37,8 @@ Use `@agentv/sdk` for TypeScript helper imports. Do not use `@agentv/eval` for n ## Authoring Checklist -- If `assertions` already state the grading contract, omit `criteria` instead of duplicating the same rubric twice. -- Prefer plain assertion strings for semantic checks when the default LLM rubric grader can judge them. Use multiple named `type: llm-grader` blocks only for custom prompts, custom grader targets, or intentionally separate grader panels. +- Put grading criteria in `assertions`/`assert`, not in test-level `criteria`. Plain assertion strings become a `g-eval` rubric grader. +- Prefer plain assertion strings for semantic checks when the default rubric grader can judge them. Use `type: llm-rubric` for structured criteria, `type: llm-grader` for custom prompts/targets, and `type: script` when grading must execute code. - Write `expected_output` as a golden/reference answer the target could have produced. Do not write criteria, scoring instructions, or "the agent should..." rubric prose there. - For historical or repo-state evals, materialize the repo under `workspace.repos[]` pinned to the commit under test. Mentioning a SHA only in prompt prose is not enough because the agent needs an actual checkout to inspect. 
@@ -60,7 +60,7 @@ agentv convert evals.json agentv eval evals.json ``` -The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates). +The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`g-eval` rubric checks), and resolves `files[]` paths. The generated YAML includes TODO comments for AgentV features to add (workspace setup, script graders, rubrics, required gates). After converting, enhance the YAML with AgentV-specific capabilities shown below. @@ -133,10 +133,9 @@ tests: | Field | Required | Description | |-------|----------|-------------| | `id` | yes | Unique identifier | -| `criteria` | conditional | What the response should accomplish; required only when no `expected_output` or `assertions` are present | | `input` | yes | Input to the agent (string/object shorthand or full message array) | | `expected_output` | no | Gold-standard reference answer (string shorthand or full message array) | -| `assertions` | no | Graders: deterministic checks, rubrics, and LLM/code graders | +| `assertions` / `assert` | yes | Graders: deterministic checks, rubrics, LLM graders, script graders, or plain-string `g-eval` checks | | `execution` | no | Per-case grader/default overrides such as `skip_defaults`; target selection belongs in top-level `target` or CLI `--target` | | `workspace` | no | Per-case workspace config (overrides suite-level) | | `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts | @@ -186,7 +185,8 @@ tests: ./cases.yaml # cases.yaml — each test only needs its own query # - id: test-1 -# criteria: ... +# assertions: +# - ... # input: "User question here" ``` @@ -207,7 +207,7 @@ The external file can be YAML (array of test objects) or JSONL. 
## Assertions Field -`assertions` defines graders at the suite level or per-test level. It is the canonical field for all graders: +`assertions` (or `assert`) defines graders at the suite level or per-test level. It is the canonical field for all graders: ```yaml # Mix exact checks with rubric shorthand when both matter. @@ -230,9 +230,11 @@ tests: Plain strings in `assertions` are rubric criteria and are the preferred shape for qualitative agent behavior. Use deterministic assertions (`contains`, `regex`, -`is-json`, `equals`) only for exact machine-verifiable outputs, and code graders +`is-json`, `equals`) only for exact machine-verifiable outputs, and script graders when the check must inspect files, run commands, or validate structured state. -Do not add a separate `criteria` field that just repeats these assertion strings. +Do not add a separate test-level `criteria` field. Legacy evals that still use +`criteria` without explicit assertions are loaded as a plain-string assertion for +compatibility, but new evals should author the assertion directly. For repo-state evals, combine a pinned checkout, a golden answer, and assertion shorthand: @@ -262,33 +264,11 @@ tests: - The answer preserves the historical commit SHA as context. ``` -## How `criteria` and `assertions` Interact +## Assertions and Reference Data -`criteria` is a **data field** — it describes what the response should accomplish. It is **not** a grader. How it gets evaluated depends on whether `assertions` is present: - -| Scenario | What happens | Warning? | -|----------|-------------|----------| -| `criteria` + **no `assertions`** | Implicit `llm-grader` runs automatically against `criteria` | No | -| `criteria` + **`assertions` with only deterministic graders** (contains, regex, etc.) | Only declared graders run. `criteria` is **not evaluated**. 
| Yes — warns that no grader will consume criteria | -| `criteria` + **`assertions` with rubric shorthand or a grader** (plain strings, `llm-grader`, `code-grader`, `rubrics`) | Declared graders run. Graders receive `criteria` as input. | No | - -### No assertions → implicit llm-grader - -The simplest path. `criteria` is automatically evaluated by the default `llm-grader`: - -```yaml -tests: - - id: simple-eval - criteria: Assistant correctly explains the bug and proposes a fix - input: "Debug this function..." - # No assertions → default llm-grader evaluates against criteria -``` - -### assertions present → no implicit grader - -When `assertions` is defined, **only the declared graders run**. For semantic -checks, add plain rubric strings. If you need a custom LLM prompt or grader -target, declare `llm-grader` explicitly: +When `assertions` or `assert` is defined, **only the declared graders run**. For +semantic checks, add plain rubric strings. If you need a custom LLM prompt or +grader target, declare `llm-grader` explicitly: ```yaml tests: @@ -300,22 +280,30 @@ tests: value: "fix" ``` -**Common mistake:** defining `criteria` with only deterministic graders. The criteria will be ignored and a warning is emitted: +`expected_output` is passive reference data. It is available to graders through +`{{expected_output}}` and the script stdin payload, but it does not create an +implicit LLM grading call by itself. + +**Common mistake:** putting rubric prose in `expected_output` instead of an +assertion: ```yaml tests: - id: bad-example - criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no grader in assertions input: "What is 2+2?" - assertions: - - type: contains - value: "4" - # Warning: criteria is defined but no grader in assertions will evaluate it. + expected_output: The assistant should explain why the answer is 4. # reference answer field, not a grader ``` -If plain assertion strings fully express the semantic contract, leave `criteria` -out. 
Keep `criteria` for the implicit-grader path or for non-duplicative context -that a declared grader actually needs. +Write this as: + +```yaml +tests: + - id: good-example + input: "What is 2+2?" + expected_output: "4" + assertions: + - The answer is 4 and explains the arithmetic briefly +``` ## Required Gates @@ -326,7 +314,7 @@ assertions: - type: contains value: "DENIED" required: true # must score >= 0.8 (default) - - type: rubrics + - type: g-eval required: true min_score: 0.6 # must score >= 0.6 (custom threshold) criteria: @@ -413,26 +401,26 @@ See https://agentv.dev/targets/configuration/#repository-lifecycle Configure via `assertions` array. Multiple graders produce a weighted average score. -### code-grader +### script ```yaml - name: format_check - type: code-grader + type: script command: [uv, run, validate.py] cwd: ./scripts # optional working directory target: {} # optional: enable LLM target proxy (max_calls: 50) ``` Contract: stdin JSON -> stdout JSON `{score, assertions: [{text, passed, evidence?}], reasoning}` -Raw stdin uses snake_case and includes: `criteria`, `input`, `expected_output`, `output` (final answer string), `messages`, `trace`, `trace_summary`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config` +Raw stdin uses snake_case and includes: `input`, `expected_output`, `output` (final answer string), `messages`, `trace`, `trace_summary`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config` SDK handlers receive the same payload in camelCase: `expectedOutput`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`, `fileChanges`, `workspacePath`. When a workspace is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace). 
For deterministic workspace checks that fit normal Vitest `expect(...)` tests, prefer a plain verifier file and the built-in adapter: ```yaml - name: welcome_banner - type: code-grader + type: script command: [agentv, eval, graders/welcome-banner.test.ts] ``` AgentV infers the Vitest adapter for `*.test.ts`, `*.spec.ts`, and Vercel-style `EVAL.ts` files. Use the explicit `agentv eval vitest` subcommand only when you need adapter flags such as `--cwd`, `--in-workspace`, or `--vitest-command`. -See docs at https://agentv.dev/graders/code-graders/ +See the Script Graders docs for the full stdin/stdout contract. ### llm-grader ```yaml @@ -557,15 +545,16 @@ Binary check: does output exactly equal the value (both trimmed)? ``` Binary check: is the output valid JSON? -### rubrics +### g-eval / llm-rubric ```yaml - Correctly identifies the denied party - Provides clear reasoning ``` LLM-judged structured evaluation. Plain strings are the preferred shorthand. -Use `type: rubrics` only when you need weighted criteria, `required: false`, -`min_score`, or score ranges. Criteria items support `id`, `outcome`, `weight`, -and `required` fields. +Use `type: g-eval` when you need weighted criteria, `required: false`, +`min_score`, or score ranges. Use `type: llm-rubric` for a single structured +rubric item with the same LLM rubric semantics. Criteria items support `id`, +`outcome`, `weight`, and `required` fields. Use optional `operator: correctness` for positive support checks or `operator: contradiction` for guard criteria where omission is acceptable but incompatible claims fail. See `references/rubric-grader.md` for score-range mode and scoring formula. @@ -663,7 +652,7 @@ export default defineEval({ }); ``` -The `graders` catalog returns ordinary `assertions` entries such as `type: is-json`, `type: regex`, `type: llm-grader`, and `type: code-grader`. 
`defineEval()` lowers camelCase TypeScript fields such as `expectedOutput`, `inputFiles`, and `maxSteps` to canonical snake_case YAML/runtime keys. +The `graders` catalog returns ordinary `assertions` entries such as `type: is-json`, `type: regex`, `type: llm-grader`, and `type: script`. `defineEval()` lowers camelCase TypeScript fields such as `expectedOutput`, `inputFiles`, and `maxSteps` to canonical snake_case YAML/runtime keys. If adapting Braintrust `scores` or DeepEval metrics, write small AgentV helper factories that return `graders.*` configs: @@ -714,7 +703,7 @@ export default defineCodeGrader(({ output, trace }) => { }); ``` -`defineAssertion()` files go in `.agentv/assertions/` and are referenced by filename as `type: <name>`. `defineCodeGrader()` scripts are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. Plain Vitest workspace verifier files can use `command: [agentv, eval, graders/check.test.ts]`. +`defineAssertion()` files go in `.agentv/assertions/` and are referenced by filename as `type: <name>`. `defineCodeGrader()` scripts are referenced in YAML with `type: script` and `command: [bun, run, grader.ts]`. Plain Vitest workspace verifier files can use `command: [agentv, eval, graders/check.test.ts]`. ### Convention-Based Discovery @@ -798,14 +787,14 @@ After running evals, perform a human review before iterating. Create `feedback.j "test_id": "test-id", "verdict": "acceptable | needs_improvement | incorrect | flaky", "notes": "Why this verdict", - "evaluator_overrides": { "code-grader:name": "Override note" }, + "evaluator_overrides": { "script:name": "Override note" }, "workspace_notes": "Workspace state observations" } ] } ``` -Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state.
+Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "script grader was too strict"). Use `workspace_notes` for observations about workspace state. Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run. diff --git a/skills-data/agentv-eval-writer/references/custom-evaluators.md b/skills-data/agentv-eval-writer/references/custom-evaluators.md index 1ed851720..d058bb733 100644 --- a/skills-data/agentv-eval-writer/references/custom-evaluators.md +++ b/skills-data/agentv-eval-writer/references/custom-evaluators.md @@ -6,7 +6,6 @@ ```json { - "criteria": "string", "input_files": ["path"], "input": [{"role": "user", "content": "..."}], "expected_output": [{"role": "assistant", "content": "..."}], @@ -69,7 +68,7 @@ import { - `.invokeBatch(requests)` - Batch LLM calls - `definePromptTemplate(fn)` - Wraps prompt generation function - Raw stdin uses `snake_case`; SDK handlers receive `camelCase` - - Context fields: `input`, `expectedOutput`, `output`, `messages`, `criteria`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime` + - Context fields: `input`, `expectedOutput`, `output`, `messages`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime` For Python, the repo-local helper example in `examples/features/sdk-python/` keeps canonical `snake_case` fields and rejects deprecated wire aliases like `output_text`, `input_text`, and `reference_answer`. It is not a separate Python runner or a promised published package; generated evals still run through the AgentV CLI. @@ -145,10 +144,13 @@ if __name__ == "__main__": #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/sdk'; -export default defineCodeGrader(({ output, criteria }) => { +export default defineCodeGrader(({ output, expectedOutput }) => { const candidate = output ?? 
''; + const expected = expectedOutput + ?.map((message) => (typeof message.content === 'string' ? message.content : '')) + .join('\n') ?? ''; const assertions: Array<{ text: string; passed: boolean }> = []; - if (candidate.includes(criteria)) { + if (expected.length > 0 && candidate.includes(expected)) { assertions.push({ text: 'Matches expected outcome', passed: true }); } else { assertions.push({ text: 'Does not match expected outcome', passed: false }); @@ -167,7 +169,6 @@ Derived from test fields (users never author these directly): | Variable | Source | |----------|--------| -| `criteria` | Test `criteria` field | | `input` | Full resolved input array (JSON) | | `expected_output` | Full resolved expected array (JSON) | | `output` | Final answer / scored result string | diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index 7f0511f1c..9b0400054 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -890,9 +890,6 @@ "properties": {}, "additionalProperties": {} }, - "criteria": { - "type": "string" - }, "provider": { "anyOf": [ { @@ -2298,7 +2295,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -2603,8 +2600,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -2896,6 +2891,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, "assertions": { "type": "array", "items": {} @@ -2944,7 +2943,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" @@ -3621,6 +3620,21 @@ "workers": { "not": {} }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } 
+ ] + } + }, "assertions": { "type": "array", "items": { @@ -3667,7 +3681,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -3972,8 +3986,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -4265,6 +4277,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, "assertions": { "type": "array", "items": {} @@ -4313,7 +4329,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" @@ -5617,9 +5633,6 @@ "properties": {}, "additionalProperties": {} }, - "criteria": { - "type": "string" - }, "provider": { "anyOf": [ { @@ -7025,7 +7038,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -7330,8 +7343,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -7623,6 +7634,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, "assertions": { "type": "array", "items": {} @@ -7671,7 +7686,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" @@ -8348,6 +8363,21 @@ "workers": { "not": {} }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, "assertions": { "type": "array", "items": { @@ -8394,7 +8424,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -8699,8 +8729,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -8992,6 +9020,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, 
"assertions": { "type": "array", "items": {} @@ -9040,7 +9072,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" @@ -13979,9 +14011,6 @@ "properties": {}, "additionalProperties": {} }, - "criteria": { - "type": "string" - }, "provider": { "anyOf": [ { @@ -15387,7 +15416,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -15692,8 +15721,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -15985,6 +16012,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, "assertions": { "type": "array", "items": {} @@ -16033,7 +16064,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" @@ -16710,6 +16741,21 @@ "workers": { "not": {} }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, "assertions": { "type": "array", "items": { @@ -16756,7 +16802,7 @@ }, "type": { "type": "string", - "enum": ["code-grader", "code_grader"] + "const": "script" }, "command": { "anyOf": [ @@ -17061,8 +17107,6 @@ "python", "webhook", "similar", - "select-best", - "human", "contains", "contains-any", "contains-all", @@ -17354,6 +17398,10 @@ "type": "string", "const": "composite" }, + "assert": { + "type": "array", + "items": {} + }, "assertions": { "type": "array", "items": {} @@ -17402,7 +17450,7 @@ "properties": { "type": { "type": "string", - "const": "code-grader" + "const": "script" }, "path": { "type": "string" diff --git a/skills-data/agentv-eval-writer/references/rubric-evaluator.md b/skills-data/agentv-eval-writer/references/rubric-evaluator.md index d0afd6225..821d6ae59 100644 --- a/skills-data/agentv-eval-writer/references/rubric-evaluator.md +++ 
b/skills-data/agentv-eval-writer/references/rubric-evaluator.md @@ -1,12 +1,12 @@ -# Rubric Grader +# Rubric Graders -Rubrics are defined as `assertions` entries with `type: rubrics`. They support binary checklist grading and score-range analytic grading. +Rubrics are defined as `assertions` entries with plain strings, `type: g-eval`, or `type: llm-rubric`. They support binary checklist grading and score-range analytic grading. ## Field Reference | Field | Type | Default | Description | |-------|------|---------|-------------| -| `type` | string | required | Must be `rubrics` | +| `type` | string | required | Use `g-eval` for grouped criteria or `llm-rubric` for a single structured rubric | | `criteria` | array | required | List of criterion strings or objects | | `required` | boolean or number | - | Gate: `true` requires score >= 0.8; a number (0–1) sets a custom threshold | @@ -33,14 +33,14 @@ assertions: - States time complexity ``` -Equivalent to the full form with `type: rubrics`. Use the full form only when you need weights, `required: false`, or `score_ranges`. +Equivalent to the full form with `type: g-eval`. Use the full form only when you need weights, `required: false`, or `score_ranges`. 
Mixed strings and objects are supported in `assertions` — strings are grouped into a single rubrics grader at the position of the first string: ```yaml assertions: - Mentions divide-and-conquer approach # grouped into rubrics - - type: code-grader + - type: script command: [check_syntax.py] - States time complexity # grouped into rubrics ``` @@ -49,7 +49,7 @@ assertions: ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - Mentions divide-and-conquer approach - id: complexity @@ -68,7 +68,7 @@ Use `operator` when outcome text should carry grading intent without embedding w ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - id: supported-fact operator: correctness @@ -87,7 +87,7 @@ Shorthand map format (recommended): ```yaml assertions: - - type: rubrics + - type: g-eval criteria: - id: correctness weight: 2.0