diff --git a/.agents/product-boundary.md b/.agents/product-boundary.md index f55b36c6c..b93614b84 100644 --- a/.agents/product-boundary.md +++ b/.agents/product-boundary.md @@ -80,6 +80,8 @@ Use public reference standards before inventing AgentV-specific contracts: - Hugging Face Datasets for dataset, split, record, and portable corpus conventions. - OpenInference for trace, span, tool-call, and model-observability semantics. +Research those references from local cloned repositories first when a clone is available, and use DeepWiki MCP for repository-level orientation or cross-repo questions. Broad web search is a fallback, not the default. If current public documentation matters for the decision, use official docs and record the exact source or commit alongside the conclusion. + Treat these as reference inputs, not dependencies. AgentV should adopt the shared lowest common denominator when it fits the repo-native artifact model, and document any intentional divergence in the relevant plan, ADR, or contract docs. ### 5. YAGNI - You Aren't Gonna Need It diff --git a/AGENTS.md b/AGENTS.md index 6ab4ece17..81beb0fb6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,6 +26,7 @@ Design guardrails: - Document composition patterns before inventing a new feature. - Match industry-standard lowest-common-denominator contracts when possible. - When designing AgentV contracts, check public reference standards such as Claude Skills, Vercel agent-eval, Hugging Face Datasets, and OpenInference before inventing AgentV-specific shapes. Use their shared lowest common denominator where it fits, and document any intentional divergence. +- For peer-framework research, prefer local cloned repositories and DeepWiki MCP over broad web search. If a public contract must be checked for currentness, use official docs and record the source or commit behind the conclusion. - Apply YAGNI aggressively and solve the current request with the smallest surface that works. 
- Keep extensions non-breaking unless a same-week unreleased surface should be hard-corrected. - Design for AI comprehension with self-describing modules, clear extension points, and no dead scaffolding. diff --git a/apps/cli/src/commands/eval/commands/bundle.ts b/apps/cli/src/commands/eval/commands/bundle.ts index a6c33b8f1..799a7e8e3 100644 --- a/apps/cli/src/commands/eval/commands/bundle.ts +++ b/apps/cli/src/commands/eval/commands/bundle.ts @@ -30,7 +30,7 @@ function unique(values: readonly string[]): readonly string[] { function targetReferenceNames(target: TargetDefinition): readonly string[] { const references: string[] = []; - for (const key of ['use_target', 'grader_target', 'judge_target'] as const) { + for (const key of ['use_target', 'grader_target'] as const) { const value = target[key]; if (typeof value === 'string' && value.trim().length > 0 && !value.includes('${{')) { references.push(value.trim()); diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts index 8b71b257a..89d458fc0 100644 --- a/apps/cli/src/commands/eval/task-bundle.ts +++ b/apps/cli/src/commands/eval/task-bundle.ts @@ -497,7 +497,7 @@ function buildEvalCase( function targetReferenceNames(target: TargetDefinition): readonly string[] { const references: string[] = []; - for (const key of ['use_target', 'grader_target', 'judge_target'] as const) { + for (const key of ['use_target', 'grader_target'] as const) { const value = target[key]; if (typeof value === 'string' && value.trim().length > 0 && !value.includes('${{')) { references.push(value.trim()); @@ -831,7 +831,7 @@ async function collectWorkspaceReferences( for (const hookName of ['before_all', 'before_each', 'after_each', 'after_all'] as const) { const hook = hooks[hookName]; - const command = hook?.command ?? 
hook?.script; + const command = hook?.command; if (!command || command.length === 0) { continue; } diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 8e63cdf29..72ee29955 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -293,7 +293,7 @@ async function writeGraderConfigs( weight: r.weight ?? 1.0, ...(r.score_ranges ? { score_range: r.score_ranges } : {}), ...(r.required !== undefined ? { required: r.required } : {}), - ...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}), + ...(r.min_score !== undefined ? { min_score: r.min_score } : {}), })); await writeJson(join(llmGradersDir, `${config.name}.json`), { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 89072679e..99672c5bf 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -476,7 +476,7 @@ async function writeGraderConfigs( weight: r.weight ?? 1.0, ...(r.score_ranges ? { score_range: r.score_ranges } : {}), ...(r.required !== undefined ? { required: r.required } : {}), - ...(r.required_min_score !== undefined ? { required_min_score: r.required_min_score } : {}), + ...(r.min_score !== undefined ? 
{ min_score: r.min_score } : {}), })); await writeJson(join(llmGradersDir, `${config.name}.json`), { diff --git a/apps/cli/src/commands/runs/rerun.ts b/apps/cli/src/commands/runs/rerun.ts index abab40290..864b9c6e3 100644 --- a/apps/cli/src/commands/runs/rerun.ts +++ b/apps/cli/src/commands/runs/rerun.ts @@ -141,7 +141,7 @@ function resolveWholeEnvReference(value: unknown): string | undefined { function referencedTargetNames(definition: Record): readonly string[] { const names: string[] = []; - for (const key of ['use_target', 'grader_target', 'judge_target'] as const) { + for (const key of ['use_target', 'grader_target'] as const) { const resolved = resolveWholeEnvReference(definition[key]); if (resolved && !resolved.includes('${{')) { names.push(resolved); diff --git a/apps/cli/src/templates/.agentv/targets.yaml b/apps/cli/src/templates/.agentv/targets.yaml index c95f7decc..ba2ad1350 100644 --- a/apps/cli/src/templates/.agentv/targets.yaml +++ b/apps/cli/src/templates/.agentv/targets.yaml @@ -1,6 +1,7 @@ # A list of all supported evaluation targets for the project. # Each target defines a provider and its specific configuration. # Actual values for paths/keys are stored in the local .env file. +# Agent and CLI targets use grader_target to reference an LLM target for scoring. 
targets: - name: default @@ -12,7 +13,7 @@ targets: - name: codex provider: codex - judge_target: azure-llm + grader_target: azure-llm # Uses the Codex CLI (defaults to `codex` on PATH) # executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path # args: # Optional additional CLI arguments @@ -29,7 +30,7 @@ targets: # Claude - Anthropic's Claude Agent SDK - name: claude provider: claude - judge_target: azure-llm + grader_target: azure-llm # Uses the @anthropic-ai/claude-agent-sdk # model: claude-sonnet-4-20250514 # Optional: override model # cwd: ${{ CLAUDE_WORKSPACE_DIR }} # Optional: working directory (defaults to process.cwd()) @@ -53,7 +54,7 @@ targets: - name: local_cli provider: cli - judge_target: azure-llm + grader_target: azure-llm # Passes the fully rendered prompt and any attached files to a local Python script # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped command: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE} diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 39513c02f..8ab705f0e 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -260,6 +260,48 @@ describe('buildGradingArtifact', () => { expect(grading.graders?.[1].score).toBe(0.7); }); + it('preserves multi-aspect grader assertions at top level and under the grader', () => { + const rubricAssertions = [ + { + text: '[accuracy] Answer matches the reference - Score: 8/10 (strong)', + passed: true, + evidence: 'The answer includes the expected facts.', + }, + { + text: '[citations] Answer cites the source - Score: 4/10 (weak)', + passed: false, + evidence: 'The answer does not cite a source.', + }, + ]; + const result = makeResult({ + assertions: rubricAssertions, + scores: [ + makeEvaluatorResult({ + name: 'rubric-review', + type: 'llm-grader', + score: 0.6, + assertions: 
rubricAssertions, + }), + ], + }); + + const grading = buildGradingArtifact(result); + + expect(grading.assertions).toEqual(rubricAssertions); + expect(grading.summary).toEqual({ + passed: 1, + failed: 1, + total: 2, + pass_rate: 0.5, + }); + expect(grading.graders?.[0]).toMatchObject({ + name: 'rubric-review', + type: 'llm-grader', + score: 0.6, + assertions: rubricAssertions, + }); + }); + it('keeps grading.json focused on grading evidence', () => { const result = makeResult({ error: 'Timeout exceeded' }); const grading = buildGradingArtifact(result); diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx index b36fd6b87..4431a1a57 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx @@ -345,7 +345,7 @@ Any grader in `assertions` can be marked as `required`. When a required grader f | Value | Behavior | |-------|----------| | `required: true` | Must score >= 0.8 (default threshold) to pass | -| `required: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) | +| `required: true` + `min_score: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) | ```yaml assertions: @@ -353,7 +353,8 @@ assertions: value: "DENIED" required: true # must pass (>= 0.8) - type: rubrics - required: 0.6 # must score at least 0.6 + required: true + min_score: 0.6 # must score at least 0.6 criteria: - id: quality outcome: Response is well-structured diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index ba9502ea3..2fb186e6f 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -74,7 +74,7 @@ assertions: | `score_ranges` | — | Score range definitions (analytic mode) | :::note -`required_min_score` (0–10 integer scale) is deprecated. 
Use `min_score` (0–1 scale) instead. For example, `required_min_score: 8` becomes `min_score: 0.8`. +Use `min_score` for analytic rubric gating. The only 0–10 values in authored rubrics are `score_ranges` bands and grader outputs. ::: ### Criterion Operators diff --git a/apps/web/src/content/docs/docs/graders/custom-graders.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx index 42e9d9865..568d5b989 100644 --- a/apps/web/src/content/docs/docs/graders/custom-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx @@ -77,7 +77,7 @@ final_score = sum(score_i * weight_i) / sum(weight_i) ``` If `weight` is omitted, it defaults to `1.0` (equal weighting). -If any grader has `required: true` (or `required: `) and scores below its required threshold, the overall test score is forced to `0`. +If any grader has `required: true` and scores below its required threshold, the overall test score is forced to `0`. Use `min_score` for a custom threshold. ## Best Practices diff --git a/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx b/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx index c80c07d3a..93a6e281a 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx @@ -119,13 +119,12 @@ For each materialized repo, AgentV resolves acquisition in this order: | Order | Source | How it is used | |-------|--------|----------------| -| 1 | Explicit resolver | `workspace.repos[].resolver` names a configured command resolver. If it returns `handled:false`, AgentV fails clearly instead of guessing. | -| 2 | Pattern resolver | The first non-`default` `repo_resolvers[]` entry whose `repos` pattern matches the repo URL or identity. If it returns `handled:false`, AgentV continues to the default resolver. | -| 3 | Default resolver | The resolver named `default`, if configured. 
It must not declare `repos`; it is the unconditional project default. If it returns `handled:false`, AgentV continues to the built-in git resolver. | -| 4 | Registered project | A project in `$AGENTV_HOME/projects.yaml` whose `origin` matches the repo identity. AgentV seeds its mirror cache from that local checkout, then clones the cache into the workspace and resets `origin` to the declared repo URL. | -| 5 | Configured mirror | A path listed under `git_cache.mirrors`. AgentV seeds its mirror cache from that checkout or bare mirror, then clones the cache into the workspace. | -| 6 | Mirror cache | An AgentV-owned bare cache under `$AGENTV_DATA_DIR/git-cache/`. Cache population is locked, cloned into a temporary path, verified, and atomically renamed before use. | -| 7 | Remote clone | The normalized clone URL from the eval's `repo` field. | +| 1 | Pattern resolver | The first non-`default` `repo_resolvers[]` entry whose `repos` pattern matches the repo URL or identity. If it returns `handled:false`, AgentV continues to the default resolver. | +| 2 | Default resolver | The resolver named `default`, if configured. It must not declare `repos`; it is the unconditional project default. If it returns `handled:false`, AgentV continues to the built-in git resolver. | +| 3 | Registered project | A project in `$AGENTV_HOME/projects.yaml` whose `origin` matches the repo identity. AgentV seeds its mirror cache from that local checkout, then clones the cache into the workspace and resets `origin` to the declared repo URL. | +| 4 | Configured mirror | A path listed under `git_cache.mirrors`. AgentV seeds its mirror cache from that checkout or bare mirror, then clones the cache into the workspace. | +| 5 | Mirror cache | An AgentV-owned bare cache under `$AGENTV_DATA_DIR/git-cache/`. Cache population is locked, cloned into a temporary path, verified, and atomically renamed before use. | +| 6 | Remote clone | The normalized clone URL from the eval's `repo` field. 
| Workspace clones are independent from user-owned checkouts, configured mirrors, and resolver source directories. AgentV does not leave Git alternates pointing diff --git a/apps/web/src/content/docs/docs/targets/coding-agents.mdx b/apps/web/src/content/docs/docs/targets/coding-agents.mdx index 19d7aa290..a6fe0776f 100644 --- a/apps/web/src/content/docs/docs/targets/coding-agents.mdx +++ b/apps/web/src/content/docs/docs/targets/coding-agents.mdx @@ -5,7 +5,7 @@ sidebar: order: 3 --- -Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` (also accepts `judge_target` for backward compatibility) to run LLM-based graders. +Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` to run LLM-based graders. ## Prompt format @@ -73,39 +73,6 @@ targets: | `cwd` | No | Working directory | | `grader_target` | Yes | LLM target for evaluation | -## cc-mirror - -[cc-mirror](https://github.com/numman-ali/cc-mirror) creates isolated Claude Code variants that route through alternative providers (Z.ai, Kimi, MiniMax, OpenRouter, etc.). The `cc-mirror` provider alias resolves to `claude-cli` and auto-discovers the binary path from `~/.cc-mirror//variant.json`. - -```yaml -targets: - # Explicit variant with known executable - - name: claude-zai - provider: cc-mirror - executable: claude-zai - grader_target: azure-base - - # Auto-discover binary from variant.json - - name: my-kimi - provider: cc-mirror - grader_target: azure-base -``` - -| Field | Required | Description | -|-------|----------|-------------| -| `executable` | No | CLI binary name or path. When set, used directly (skips variant.json lookup). | -| `variant` | No | Variant name (directory under `~/.cc-mirror/`). Defaults to target `name`. Used to locate `variant.json` when `executable` is not set. 
| -| `cwd` | No | Working directory | -| `grader_target` | Yes | LLM target for evaluation | - -Setup a variant first, then reference it by name: - -```bash -npx cc-mirror quick --provider zai --name claude-zai --api-key "$Z_AI_API_KEY" -``` - -Since `cc-mirror` resolves to `claude-cli`, all Claude target fields (model, system_prompt, timeout_seconds, etc.) are also supported. - ## Codex CLI ```yaml @@ -131,7 +98,7 @@ targets: ```yaml targets: - name: copilot - provider: copilot + provider: copilot-cli model: gpt-5-mini grader_target: azure-base ``` @@ -140,7 +107,7 @@ targets: |-------|----------|-------------| | `model` | No | Model to use (defaults to copilot's default) | | `cwd` | No | Working directory | -| `subprovider` | No | OpenAI-compatible provider type for `copilot`, `copilot-cli`, or `copilot-sdk`, such as `openai` or `azure` | +| `subprovider` | No | OpenAI-compatible provider type for `copilot-cli` or `copilot-sdk`, such as `openai` or `azure` | | `base_url` | No | Provider base URL or Azure resource URL/name | | `api_key` | No | Provider API key. Prefer `${{ ENV_VAR }}` references. | | `bearer_token` | No | Provider bearer token. Prefer `${{ ENV_VAR }}` references. Takes precedence over `api_key` when set. | @@ -308,7 +275,7 @@ The VS Code provider uses a **subagent file-messaging architecture**. 
AgentV pro ```yaml targets: - name: copilot - provider: copilot + provider: copilot-cli executable: ${{ COPILOT_EXE }} grader_target: azure-base ``` diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index 88a728c5b..50de9994d 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -76,7 +76,7 @@ tests: ## Grader Target -Agent targets that need LLM-based evaluation specify a `grader_target` (also accepts `judge_target` for backward compatibility) — the LLM used to run LLM grader graders: +Agent targets that need LLM-based evaluation specify a `grader_target` — the LLM used to run LLM graders: ```yaml targets: @@ -171,7 +171,6 @@ workspace: |-------|-------------| | `repos[].path` | Directory within the workspace to clone into | | `repos[].repo` | Repository identity: full clone URL or GitHub `org/name` shorthand | -| `repos[].resolver` | Optional configured `repo_resolvers[].name` override | | `repos[].commit` | Branch, tag, or SHA to check out (default: `HEAD`) | | `repos[].base_commit` | Alias for `commit`, useful for SWE-bench-style datasets | | `repos[].ancestor` | Walk N commits back from the checked-out ref (e.g., `1` for parent) | diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx index 6a49a222b..de7ad3011 100644 --- a/apps/web/src/content/docs/docs/tools/dashboard.mdx +++ b/apps/web/src/content/docs/docs/tools/dashboard.mdx @@ -314,7 +314,7 @@ results: auto_push: false ``` -Project-local `.agentv/config.yaml` is for portable eval defaults such as `execution`, `eval_patterns`, and `dashboard`. Do not put `projects` in project-local config; AgentV warns and ignores it there. `results_by_project` is deprecated; use `projects[].results` in `$AGENTV_HOME/config.yaml`. 
+Project-local `.agentv/config.yaml` is for portable eval defaults such as `execution`, `eval_patterns`, and `dashboard`. Do not put `projects` in project-local config; AgentV warns and ignores it there. Put per-project results settings in `projects[].results` in `$AGENTV_HOME/config.yaml`. The project `repo` and the `results` block sync different repositories: diff --git a/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md b/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md index 7c2124e6a..e9b052e0a 100644 --- a/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md +++ b/docs/adr/0013-experiment-is-metadata-expressed-as-tags-experiment.md @@ -4,7 +4,7 @@ Date: 2026-07-01 ## Status -Accepted +Accepted, then **superseded** (eval-authoring portions) by [ADR 0016](0016-promptfoo-superset-eval-authoring-contract.md) as part of the promptfoo-superset restructure (2026-07-02). Extends [ADR 0009](0009-eval-path-result-identity-and-default-experiment.md) and builds on [ADR 0012](0012-finalize-run-artifact-layout.md), which established diff --git a/docs/adr/0013-stabilize-eval-authoring-contract.md b/docs/adr/0013-stabilize-eval-authoring-contract.md index fc019796f..0fbc601c0 100644 --- a/docs/adr/0013-stabilize-eval-authoring-contract.md +++ b/docs/adr/0013-stabilize-eval-authoring-contract.md @@ -4,7 +4,7 @@ Date: 2026-07-01 ## Status -Accepted +Accepted, then **superseded** (eval-authoring portions) by [ADR 0016](0016-promptfoo-superset-eval-authoring-contract.md) as part of the promptfoo-superset restructure (2026-07-02). 
Supersedes the eval-authoring placement portions of [ADR 0002](0002-keep-harbor-benchmark-execution-behind-runner-boundary.md), diff --git a/docs/adr/0015-multi-turn-conversation-execution-vs-evaluation.md b/docs/adr/0015-multi-turn-conversation-execution-vs-evaluation.md new file mode 100644 index 000000000..686492b7d --- /dev/null +++ b/docs/adr/0015-multi-turn-conversation-execution-vs-evaluation.md @@ -0,0 +1,97 @@ +# 15. Multi-turn: separate conversation execution from evaluation + +Date: 2026-07-02 + +## Status + +Accepted (2026-07-02). Part of the promptfoo-superset eval restructure — see +`docs/plans/promptfoo-aligned-eval-restructure.md` §3. Records a decision that +had no prior ADR (the `mode: conversation` contract lived only in code and in +issue #1053). + +## Context + +AgentV's multi-turn support (`mode: conversation`, `turns[]`, `aggregation`, +`on_turn_failure`, `window_size`) was designed under **agentv#1053** and +researched in `agentevals/agentevals-research/research/findings/multiturn-conversation-eval/` +against inspect-ai, google-adk, and ragas. Key findings: + +- **inspect-ai**: solver `state.completed` ≡ AgentV `on_turn_failure: stop`; + `await score(state)` inside a solver ≡ per-turn assertions gating + continuation; the research recommends treating **conversation execution and + conversation evaluation as separate concerns**. +- **google-adk**: schema separates scripted `conversation` from LLM-driven + `conversation_scenario` (mutually exclusive; different metric sets). +- **inspect-ai / ragas / promptfoo all lack per-conversation aggregation** — + they aggregate only across *epochs/samples*, never *within* one conversation. + This is why AgentV added a turn-`aggregation` policy. 
+ +Separately, this restructure adopts promptfoo's prompt-variable system, which +brings native conversation **execution** for free: chat-array prompts, the +built-in `_conversation` variable (each test row is a turn, prior completions +looped into the prompt), and session-based providers for stateful threads. + +`window_size` (verified in `orchestrator.ts` `buildWindowedHistory`): a sliding +window that, each turn, sends **all system messages + the last N turns** +(`N×2` non-system messages) to the provider and to the per-turn grader, instead +of the full accumulated history. Its purpose is context/cost control for long +conversations. + +## Decision + +1. **Split execution from evaluation** (per the inspect-ai finding). +2. **Conversation execution → promptfoo mechanisms**: `_conversation` + + chat-array prompts for stateless/rebuild-history turns; session providers for + stateful same-thread turns. AgentV does not keep a bespoke turn-driving + subsystem. +3. **Keep AgentV's evaluation layer** as a documented extension: per-turn + assertions, cross-turn **`aggregation`** (`mean` default, `min` = weakest + link, `max`), and **`on_turn_failure`** (`continue`/`stop`). Provenance: + `on_turn_failure` ← inspect-ai `state.completed`; per-conversation + aggregation is a deliberate gap-fill for a capability the surveyed frameworks + lack. Relying on promptfoo alone would reintroduce that gap. +4. **Drop `window_size`.** In the `_conversation` model the author controls how + much history to include directly in the prompt template (include all, slice + the last N, or summarize), so a dedicated schema field is redundant; it also + has no framework pedigree. This reverses the `window_size` portion of the + #1053 design. +5. **Turn-aggregation is a distinct axis from trial-aggregation.** Cross-turn + `aggregation` (within one conversation) is separate from the + `repeat`/pass@k reducer across samples/trials ([ADR 0017](0017-output-artifact-and-workspace-resolver-contract.md), + plan §4/§6). 
Both are retained on their respective axes; neither replaces the + other. + +## Migration: `window_size` → `_conversation` template + +`window_size: N` kept *system + the last N turns* (`buildWindowedHistory`: +all system messages + the last `N×2` non-system messages) when calling the +provider and the per-turn grader. In the `_conversation` model the author +expresses the same windowing in the prompt template: + +```njk +[ + { "role": "system", "content": "{{ system }}" }, {# always kept — outside the loop #} + {% for c in _conversation %} + {% if loop.revindex <= N %} {# keep last N turns == window_size: N #} + { "role": "user", "content": "{{ c.input }}" }, + { "role": "assistant", "content": "{{ c.output }}" }, + {% endif %} + {% endfor %} + { "role": "user", "content": "{{ question }}" } +] +``` + +- `window_size` unset (full history) → loop over all of `_conversation` (drop the `if`). +- `window_size: N` → `{% if loop.revindex <= N %}` (each `_conversation` entry is one turn, so this is the last N turns = last `N×2` messages, matching `buildWindowedHistory`). +- The template is strictly more expressive — it can also **summarize** prior turns (via a `nunjucks_filters` filter or a var) rather than only truncate, which `window_size` could not do. + +The codemod rewrites `window_size: N` scenarios into this template idiom. + +## Compatibility + +Major-version, hard deprecation (nothing in production). The one-shot codemod +removes `window_size` and reshapes authored `mode: conversation` scenarios so +that turn *execution* is expressed via `_conversation`/prompts while the +retained `turns` evaluation fields (per-turn `assert`, `aggregation`, +`on_turn_failure`) carry over. No prior ADR is superseded; this supersedes the +`window_size` intent recorded in issue #1053. 
diff --git a/docs/adr/0016-promptfoo-superset-eval-authoring-contract.md b/docs/adr/0016-promptfoo-superset-eval-authoring-contract.md new file mode 100644 index 000000000..010c69cdb --- /dev/null +++ b/docs/adr/0016-promptfoo-superset-eval-authoring-contract.md @@ -0,0 +1,113 @@ +# 16. promptfoo-superset eval authoring contract + +Date: 2026-07-02 + +## Status + +Accepted (2026-07-02). Anchor decision for the eval-authoring restructure — see +`docs/plans/promptfoo-aligned-eval-restructure.md` §1–§2, §11.1. **Supersedes the +eval-authoring portions of [ADR 0013 (stabilize eval authoring)](0013-stabilize-eval-authoring-contract.md) +and [ADR 0013 (experiment as tags.experiment)](0013-experiment-is-metadata-expressed-as-tags-experiment.md)**; +multi-turn is carved out to [ADR 0015](0015-multi-turn-conversation-execution-vs-evaluation.md); +the output/artifact contract to [ADR 0017](0017-output-artifact-and-workspace-resolver-contract.md). + +## Context + +AgentV's eval-authoring surface diverged from industry primitives. We are re-basing +it on promptfoo (the lowest-common-denominator eval config) so that **any promptfoo +config, mechanically snake_cased, is a valid AgentV eval with equivalent semantics**, +and AgentV extensions layer on top (repo-native workspaces, agentic judges, gate, +multi-turn). This ships as a **major version with hard deprecation** — nothing is in +production, so removed keys are deleted (not aliased) and a one-shot codemod migrates +existing files. + +## Decision + +Governing principle: **prefer promptfoo's name/shape where functionally equivalent; +keep AgentV's only where its semantics are genuinely better.** + +1. **`assert` is canonical** (per-test and `default_test`); `assertions` removed. + Promptfoo type names adopted (`contains`/`equals`/`regex`/`is-json`/`icontains`/ + `contains-all|any`/`starts-with`/`similar`/`latency`/`cost`/`webhook`/`javascript`/ + `python`/`assert-set`). `composite` removed → `assert-set`. +2. 
**LLM judge vocabulary follows semantics.** `g-eval` is the criteria/rubric + scoring type; AgentV's `rubrics` and bare-string `assert` entries desugar to a + grouped `g-eval` (N criteria, one judge flow) as an AgentV superset extension. + `llm-rubric` remains the promptfoo-compatible free-form rubric judge. Agentic + evidence-gathering judges stay an AgentV extension rather than being forced into + `llm-rubric`. Structured AgentV rubric criteria are preserved, not flattened + into a single text blob: criteria objects keep `weight`, `operator`, + `score_ranges`, and `min_score`. Artifact assertion rows are the generic + AgentV grader contract, not a `g-eval` special case: each grader returns + `assertions[]`, the orchestrator flattens those rows into + `grading.json.assertions[]`, and `grading.json.graders[].assertions[]` keeps + the per-grader breakdown. Deterministic graders usually emit one row, while + multi-aspect graders emit one row per authored check or result unit. Structured + `g-eval` criteria therefore populate one assertion row per criterion so the + Dashboard can show criterion-level evidence, using the same mechanism as code + graders, field accuracy, execution metrics, and tool trajectory. +3. **Grader execution**: `javascript` in-process (Bun `import`), `python` subprocess, + `code-grader` = the subprocess power tool (workspace-`cwd`, arbitrary language) — + `javascript` is NOT desugared to `code-grader`. +4. **`metric` is the named-score field** (nunjucks-templated); grader `name` becomes + display-only. Add `named_scores` + `derived_metrics`. +5. **`targets` is the canonical system-under-test** axis (promptfoo target/`ProviderOptions` + object shape + AgentV extensions). `provider`/`apiId` = the **backend** kind (never a + SUT). No runtime top-level `providers` alias (would overload the backend term); the + codemod/conversion remaps promptfoo `providers:` → `targets:`. +6. 
**Prompts + vars, not `input`**: adopt top-level `prompts` (string/chat-array/file/ + fn, nunjucks `{{vars}}`); collapse `tests[].input` into `prompts`+`vars`. `input_files` + survives as prompt content. +7. **Templating**: nunjucks for BOTH vars and env (promptfoo-native), via the `nunjucks` + package. `{{ var }}` = eval-time vars (array-var expansion, `nunjucks_filters`, autoescape + off, render-then-parse for chat arrays); `{{ env.VAR }}` = config-time env, rendered at + load-time before validation (defaults via `{{ env.VAR | default('x') }}`). One engine, + phase-separated by render pass + the `env` namespace — **no `${ENV}` sigil**. Replaces + `${{ ENV }}`. Rationale beyond superset-compat: `{{ env.VAR }}` **does not collide with + runtime shell `${VAR}`** — CLI-target commands can carry `$VAR`/`${VAR}` that must reach + the shell at runtime untouched; a `${ENV}` config sigil would clobber them. +8. **Optional test `id`**, layered identity: content identity = `test_id` (content hash, + derived when unauthored); governance/trend identity = an author `tag`/`metadata` key + (Dashboard keys comparison on this); display label = `description` → vars → `Test #n`. +9. **Keep AgentV where better**: first-class `expected_output` as passive golden/reference + answer data (DeepEval-aligned; not moved into `vars`, and not sent to target prompts + unless the author separately places it in `vars`); `repeat: { count, strategy, early_exit }` (map promptfoo + `repeat:int` → `count`+`pass_all`); executable `gate` release policy (alongside per-test + `threshold`); `imports`/`select`; `depends_on`. `experiment` is authored as `tags.experiment` — a plain tag with **no structural privilege** (not a bucket/field/storage path; not a privileged grouping key; tags alphabetical; default compare key is a user preference). `--experiment X` = sugar for `--tag experiment=X`. 
Its **value** is auto-defaulted to the eval/suite name when unset so runs are always groupable (ADR-0009 derivation) — a default value, not a privileged key (ADR-0017). +10. **Workspace repo provisioning is a declarative FIELD, not an extension.** + `workspace.repos: [{ path, repo, commit (base_commit alias), sparse?, ancestor? }]` is + declared per-test (overridable) / at suite level, and the **harness materializes it + (harness-owned resolver, ADR 0017) BEFORE any hook or the target runs.** The *common + case* is not a hand-rolled per-eval hook (ordering + reproducibility + declarative + provenance). **But acquisition is pluggable** (ADR 0017 pt5): custom acquisition is + first-class via a registered custom backend or a `beforeAll` escape hatch, and the + built-in acquisition may itself be a swappable plugin — this is the correction of an + earlier over-absolute "not an extension" claim; provenance stays a declarative field, + acquisition stays extensible. `isolation` (shared/pooled/fresh) is a `workspace` + config field, not a hook choice. + **Extensions are for pluggable non-provisioning setup only**: promptfoo lifecycle + (`beforeAll`/`afterAll`/`beforeEach`/`afterEach`), running *after* materialization — + e.g. `agentv:agent-rules` (stage skills/hooks/agents) and custom `file://` hooks. + Removed: `on_run_complete`, `preprocessors` (→ `extensions`). +11. **Scope**: `similar` ships with a configured embeddings provider, and `g-eval` ships + as the structured criteria/rubric judge. Exotic promptfoo assertions + (`context-*`/`moderation`/…) and `redteam` are **future scope** — + treated as unrecognized fields, not stubbed. Superset holds over the *implemented* + surface. + +Removed (hard): `assertions`, `composite`, `eval_cases`, `tests[].input`, +`workspace.hooks` (→ `extensions`), `on_run_complete`, `preprocessors`, `${{ ENV }}`, +top-level `budget_usd`, scalar top-level `threshold`, grader `name`-as-metric, the +`z.never()` rejection stubs. 
**Kept** as declarative fields: `workspace.repos` (provenance), +`workspace.isolation`, `workspace.docker`, `workspace.template`. + +## Consequences + +- Reverses ADR-0013 (`assertions`-only, no `assert`). Both 0013 files marked superseded. +- A one-shot codemod migrates existing eval files and hard-errors on removed keys with a + message pointing at the replacement. +- promptfoo authors get a near-drop-in contract (snake_case); AgentV keeps repo/agent + differentiation as documented extensions. +- FizzBuzz/SWE-bench-style test grading needs no new assertion primitive — a + workspace-`cwd` `code-grader` runs the tests (see ADR 0017 note on SWE-bench + `FAIL_TO_PASS`/`PASS_TO_PASS`). diff --git a/docs/adr/0017-output-artifact-and-workspace-resolver-contract.md b/docs/adr/0017-output-artifact-and-workspace-resolver-contract.md new file mode 100644 index 000000000..6bcd8e80d --- /dev/null +++ b/docs/adr/0017-output-artifact-and-workspace-resolver-contract.md @@ -0,0 +1,182 @@ +# 17. Output/artifact contract + workspace resolver (provenance vs acquisition) + +Date: 2026-07-02 + +## Status + +Accepted (2026-07-02). Part of the eval-authoring restructure — see +`docs/plans/promptfoo-aligned-eval-restructure.md` §6, §11. **Refines/supersedes +[ADR 0011 (result output artifact contract)](0011-result-output-artifact-contract.md) +and [ADR 0012 (finalize run artifact layout)](0012-finalize-run-artifact-layout.md)**; +extends [ADR 0008 (normalized transcript)](0008-normalized-transcript-artifact-contract.md). +Companion to [ADR 0016](0016-promptfoo-superset-eval-authoring-contract.md). + +## Context + +We reviewed the output formats of promptfoo, margin-lab, vercel-agent-eval, and +agentskills, and the workspace-acquisition models of SWE-bench, margin, Harbor, and +Inspect AI (`docs/plans/…` §6, §11.1). Two decisions follow: the canonical result +bundle, and how a workspace is acquired. + +## Decision — output/artifact contract (best-of-each, split, no DB) + +1. 
**Split bundle is the single source of truth** (`.agentv/results/<run_id>/`); NO + maintained consolidated single-file export (generate on demand if ever needed). + 3 of 4 references split; only promptfoo consolidates (for its DB/hosted model). +2. **Queryable aggregate ← margin-lab**: run-root `summary.json` is a rich `jq`-queryable + `Summary` (run_id, status breakdown, per-case **pass@k**, per-instance summaries, + usage, infra-failure taxonomy) — widen AgentV's current thin summary to this. Plus + `index.jsonl` (one row per case) for streaming/line queries. No database. +3. **Transcript + metrics ← vercel**: two-layer transcript (raw + normalized) with a + canonical cross-agent `tool_name` enum and precomputed `transcript_summary`, the + summary **inlined into each result row** for cheap trajectory/metrics assertions; + transcript referenced **by path**. +4. **Per-assertion grading ← agentskills**: `grading.json` = `assertion_results[{ text, + passed, evidence }]` + `summary` counts, PLUS AgentV's superset — top-level **string + `verdict` (`pass`|`fail`|`skip`)** + fractional **`score`** (not a boolean; needs skip + + fractional). These rows are the generic AgentV grader evidence channel: every + grader returns `assertions[]`; deterministic graders typically return one row, + while multi-aspect graders return one row per distinct criterion/aspect. The + artifact preserves both the flattened rows and each grader's nested rows. + Default judge = skeptical evidence-by-path (opt-out via + explicit `prompt`); judge pinning via `grader_target`. Evidence stays in + `grading.json`. +5. **Bundle layout / naming**: machine files move under per-run **`.internal/`** + (`index.jsonl`, `progress.json`, `events.jsonl`, `bundle.json`); run root stays clean + (`summary.json` + per-case dirs). Rename the reference field `manifest_path` → + `index_path` (file stays `index.jsonl`, JSONL for append/stream/query). Reserve + "manifest"/`bundle.json` for the frozen config. +6.
**Merge `timing.json` into `metrics.json`** (sections: duration/tokens/cost always; + execution/trajectory when a trace exists); drop `timing_path`, keep one `metrics_path`. +7. **Analytics = one pure `Build()`** (margin-lab shape) producing the `Summary` with + pass@k; add promptfoo-shaped `named_scores`/`derived_metrics` on rows. + +### Multi-suite runs — one run_id, categorize by suite AND tags/experiment +Confirms ADR-0009 + ADR-0012 (not a new decision): +- **One `<run_id>` (one timestamp) per CLI invocation**, across any number of suite YAMLs — all suites' cases live under the single `<run_id>/` bundle. **Never a separate timestamp/folder per suite.** `runtime_source.kind = multi_eval` records the multi-suite invocation. +- **Identity = `eval_path` + `test_id`** (uuid-suffixed dir), so overlapping `test_id`s across suites don't collide. `suite`/`name` are **display/grouping metadata, not routing** (ADR-0009). +- **Categorize by BOTH, orthogonally** (each `index.jsonl` row carries both): **`suite`** (+`eval_path`) = structural origin; **`tags`** (map, incl **`experiment`**) = semantic/campaign grouping. `experiment` = the run/campaign bucket; `suite` = the intra-run structural group; the Dashboard groups by any tag key, and suite is another grouping dimension. Reports filter/group by either axis. + +### Artifact filenames (locked — accuracy over cosmetic consistency) +- **`summary.json`** (run-root AND per-case) — the aggregate. Kept over margin's `results.json`: it's a *summary*, not the full results (those are the per-case dirs + `index.jsonl`); avoids the `results/<run_id>/results.json` stutter; symmetric at both levels (run aggregates cases, case aggregates samples); vercel-aligned. We match margin on the aggregate *concept/shape*, not the filename.
+- Per-sample triad (distinct, all kept): **`result.json`** (what happened), **`grading.json`** (verdict = `assertion_results`+`verdict`+`score`), **`metrics.json`** (duration+tokens+cost+execution/trajectory; the `timing.json` merge). +- **`grading.json`** kept (not `grades.json`) — source-consistent with agentskills (whose file is `grading.json`), and "grading" names the grading *result*. + +### Full results-tree layout (two levels — no per-run `.indexes`) +``` +.agentv/results/ + .indexes/ # CROSS-RUN derived catalogs (reserved, ADR-0012; rebuildable, not source of truth) + runs.jsonl # one row per RUN (run-level filtering/listing) + cases.jsonl # one row per (RUN x CASE) (case-level cross-run filter/trend) + .cache/ # CROSS-RUN caches (reserved) + / # one run bundle (one CLI invocation, incl multi-suite) + summary.json # queryable aggregate (root, human-facing) + /sample-1/ … # per-case detail + repeats (sample-N) + .internal/ # PER-RUN machine files + index.jsonl # one row per CASE (this run) — the per-run index lives HERE + progress.json events.jsonl bundle.json +``` +- Per-run index (rows = cases) = `/.internal/index.jsonl`; **no separate per-run `.indexes`** — `.internal` already holds it. Cross-run catalog (rows = runs) = `.agentv/results/.indexes/runs.jsonl`. Names signal scope: `.internal` = one bundle; `.indexes`/`.cache` = across runs. Both dot-prefixed (skipped by discovery). +- **Cross-run filtering needs `cases.jsonl`, not just `runs.jsonl`.** `runs.jsonl` (one row/run) answers "which runs match"; **case-level cross-run** queries ("every `fizzbuzz` across runs", "failing cases with tag X over last 10 runs", "trend of `test_id` T") need one row per (run x case) → `.indexes/cases.jsonl`, rebuilt by concatenating every `/.internal/index.jsonl` + run metadata. Join key for trends = the layered identity (content-hash `test_id` + author governance tag, ADR-0016 pt8). 
Both catalogs are derived/rebuildable; if JSONL scanning outgrows laptop scale, a rebuildable SQLite **view** is the escape hatch (optional adapter, never core — exploitbench pattern, Phoenix boundary intact). +- **margin-lab consistency & divergence:** matches on the *filesystem* substance — top-level queryable aggregate (`results.json`=`summary.json`), `internal/` machine folder (we dot-prefix `.internal/`), per-execution-unit dirs, one pure `Build()` for pass@k, `instance_key = test_id#sample_index`. **Divergences (deliberate):** (1) margin's *runner* uses a persistent **`RunStore` (in-memory / Postgres, NOT SQLite)** for scheduling + queries; **AgentV declines a store entirely** (laptop-first; resumability via `index.jsonl` + `--rerun-failed`). (2) hierarchical `/sample-N/` vs margin's flat `instances/#/`. (3) `timing`→`metrics` merge. The **rebuildable derived index/view** idea (JSONL `.indexes/`, optional SQLite escape hatch) is from **exploitbench** (`import`/`export` bijection), not margin — margin's store is the operational source during a run, not a filesystem-derived index. (Nuance: margin *can* rehydrate a run's completed-work state from its run-dir for **resume** — `LoadProgressSnapshot` + `loadSavedResumeBundle` + `carryForwardLocalCases` — but that's targeted carry-forward, not a general `import` that rebuilds the multi-run query DB from files; the memory store is ephemeral, the Postgres store persists independently. **AgentV follows exploitbench's model** — filesystem is source of truth, `.indexes/*.jsonl` are derived/rebuildable — with `--rerun-failed` reading `index.jsonl` from fs and no store to rehydrate.) +- **Dashboard default view is sensible, never odd/empty:** because `tags.experiment` is value-defaulted to the eval/suite name (always populated), the default view groups by `experiment` (real names, no "(none)" wall) or a recent-runs list; the grouping key is a user preference they can change, not the absence of a default. 
+ +### Run organization: cross-run index, repeat naming, experiment-as-tag +- **Cross-run index (rebuildable cache, not source of truth):** keep per-run `index.jsonl` (rows = cases); add a cross-run catalog `.agentv/results/.indexes/runs.jsonl` (already-reserved `.indexes/` namespace) — **one row per run** (run_id, timestamp, targets, `tags` incl experiment, aggregate pass@k). Derived by scanning `*/summary.json`, rebuildable, optional (Dashboard can glob summaries as fallback). JSONL (append per run), **not `index.json`**. +- **Repeat folder = `sample-N`, not `run-N`.** "run" is overloaded (`run_id` = the whole invocation). Rename `run-${attempt+1}` → `sample-1`, `sample-2`, … (matches margin `samples_per_case`/`sample_index`, pass@k, and AgentV's `repeat`; Inspect's `epoch` is the ML-jargon alt). Keep the metadata split: `sample_index` = repeats, `retry_index` = infra retries. +- **`experiment` has no *structural* privilege, but its *value* is auto-defaulted.** No storage dir (already `/`), no top-level field (`tags.experiment`), no special schema; tag keys sort **alphabetically**; the default grouping/compare **key** is a user preference (any tag — AgentV blesses none). `--experiment X` = sugar for `--tag experiment=X`. **The one convenience:** the harness auto-populates the `experiment` tag's **value** when unset, deriving it from the eval/suite name (ADR-0009: `--experiment` > authored `tags.experiment` > eval/suite name). So every run always has a meaningful `experiment` value and is groupable — without the author setting anything. This is a default *value*, not a privileged *key*. + +## Decision — workspace resolver (provenance vs acquisition) + +Cross-framework convergent rule (SWE-bench, Terminal-bench, margin, lm-eval, Inspect): +**the case declares WHAT (identity + pin); the harness resolves WHERE-FROM via a +selectable backend. 
Nobody puts acquisition in the task.** + +**Field (WHAT) and resolver (HOW) are orthogonal — both required, neither replaces the +other.** Analogy: `package.json` vs the package registry. `dependencies: {lodash: ^4}` is +the **field** (always declared); npm's registry/mirror/tarball resolution is **pluggable +acquisition** — you can point npm at a custom registry, but you don't delete `package.json`. +Likewise: the `workspace` field declares provenance; the resolver (built-in backends + +custom-backend plugin + `beforeAll` escape hatch) is the pluggable *how*. A custom backend +still reads the field to know which `repo`+`commit` to fetch. You only "don't need the +field" if you go full escape-hatch and forgo declarative provenance (not recommended). + +### Naming: `workspace` (durable, locked) +Chosen over alternatives for longevity — it names the *what* (a working directory), not the +*how*: CI-standard (`GITHUB_WORKSPACE`), used by margin-lab, git/Cargo/Bazel/VS Code. +Rejected: `sandbox` (Inspect — connotes an isolation boundary, which is a *property* → the +`isolation`/`docker` fields, not the concept); `environment` (overloaded with env vars); +`testbed` (SWE-bench jargon). + +### Final locked schema +```yaml +workspace: # suite-level default; tests[].workspace overrides per case + repos: # PROVENANCE only (what to materialize) + - path: ./CargoWise # where it lands in the workspace + repo: https://github.com/WiseTechGlobal/CargoWise.git # canonical identity (join key) + commit: 953adb9 # immutable SHA pin (base_commit accepted as input alias) + sparse: [src/X] # optional content selection + ancestor: 1 # optional (nth-ancestor pin) + isolation: fresh # fresh (default, safe) | pooled | shared + template: ./tmpl # optional local scaffold + docker: { image: ... } # optional container env +``` +**Never in this schema:** acquisition (resolver + backends → harness/machine config, keyed +on `repo`) and hooks (→ `extensions`). 
Keeping those out is what makes the schema durable — +new acquisition technology plugs in without touching it. `commit` is an immutable SHA +(reproducible); mutable refs are excluded. + +1. **Eval declares provenance ONLY, in a declarative `workspace.repos` field** (per-test + overridable / suite-level; NOT a `vars` blob and NOT an extension): `workspace.repos: + [{ path, repo, commit (base_commit alias), sparse?, ancestor? }]`, plus `workspace.isolation` + (shared/pooled/fresh). Remove the tangled acquisition fields (`type`/`local`, `resolve`, + `clone.depth`, `clone.filter`, per-repo `resolver`). The harness materializes this + **before hooks** (ADR 0016 pt10). +2. **Acquisition = harness resolver in machine config (`$AGENTV_HOME/config.yaml`), + keyed on `repo`**, ordered backends: (1) local checkout auto-adopt via origin-match + → `git clone --reference`; (2) bare mirror clone-cache (`--reference`, shared objects); + (3) snapshot artifact (WTG `download-release-deps` reframed); (4) remote clone; + (5) *future* Docker image (SWE-bench/margin/Inspect — same identity key, new backend; + adopt Inspect's `image`/`build`/`x-local` distinction + per-config init caching). +3. **`--reference` (mirror cache) is the workhorse**: shallow-speed WITH full history, so + deep `base_commit` pins never break — retires the `--depth`/`--filter` debate. Keep + `sparse` for content selection. +4. **Materialization runs before hooks and reads the declared provenance**; ordinary user + `beforeAll`/`beforeEach` hooks run *after* it. Resolver config is machine-local, + orthogonal to eval and target YAML. Targets carry no repos. +5. **The resolver is PLUGGABLE — custom acquisition is first-class** (per the "plugins over + built-ins" product guardrail). 
Two extension points beyond the built-in backends: + (a) **register a custom acquisition backend** (a resolver plugin, config-level, keyed on + `repo`) for a bespoke store/format — the recommended path; (b) a **`beforeAll` extension + escape hatch** that materializes a fully author-owned workspace and reports its path + (what the promptfoo parity example did). The built-in acquisition itself may be + implemented as an auto-registered, ordered-first, **swappable** plugin over the same + public interface — so the default is zero-config yet replaceable. + +The invariants (not the mechanism) are what matter: provenance is declared as data; +acquisition runs before hooks and is keyed on the pin; built-ins ship. New backends — +built-in or user — plug in without touching the eval schema because all resolve the same pin. + +### Note: SWE-bench `FAIL_TO_PASS` / `PASS_TO_PASS` + +`FAIL_TO_PASS`/`PASS_TO_PASS` are two lists of test IDs shipped with each SWE-bench +dataset row. The distinction (fix-tests vs regression-tests) matters only at +*dataset-construction* time; at *run* time it collapses to "**run these named tests; pass +iff all pass**". So it is **too domain-specific for a core primitive, and needs no +dedicated SDK recipe** — it is plainly a workspace-`cwd` **`code-grader`**: the grader runs +the repo's tests in the workspace and its exit code is the verdict (exactly margin's +`tests/test.sh` 0/1/2 model). The two lists are just data the grader's command consumes +(inline, or from `vars`/`metadata`). Combined with the Docker-image acquisition backend +(#5), this is how AgentV runs SWE-bench natively — same `repo`+`commit` provenance, no +schema change, no new grader type. 
+ +### Cross-check: exploitbench (confirms + two borrowables) +exploitbench (security-exploit benchmark; AgentV research `entities/exploitbench.md`) **confirms** this contract: split filesystem run-tree is the source of truth (`job.json`/`score.json`/`cost.json`/`transcript.jsonl`/`tool_calls.jsonl`/`config_snapshot.yaml`); its SQLite is a **derived, rebuildable view** (`import`/`export` bijection), not required — validating our no-DB core (jq + `index.jsonl` is the query surface; a SQLite view stays an optional post-run adapter, Phoenix boundary intact). Docker images are pinned by `sha256:` digest at run start (reinforces resolver backend #5); `config_snapshot` = our `bundle.json`. **Borrow:** (1) a **`provenance`** field on result rows (`native`/`mock`/`replay`/`imported_from_*`) — durable, fits AgentV's replay/transcript/mock providers; adopt now. (2) **Eval-integrity / anti-reward-hacking — future scope**: run high-stakes graders in a fresh container with the workspace mounted **read-only**; an `audit` pass that re-grades from the stored transcript, scans for reward-hacking red flags, and verifies model identity (the provider served the requested model). "Post-hoc audit as part of benchmark validity." + +## Consequences + +- Refines ADR-0011/0012 (bundle layout, `index_path`, timing→metrics merge, `.internal/`); + 0011/0012 marked accordingly. +- Opik export (`av-bv4.6`) and the Dashboard (`av-2s7`) consume the new bundle → re-gate on + this contract. +- Codemod handles bundle-field renames and drops the tangled repo-acquisition fields. diff --git a/docs/plans/promptfoo-aligned-eval-restructure.md b/docs/plans/promptfoo-aligned-eval-restructure.md new file mode 100644 index 000000000..f9ad8d475 --- /dev/null +++ b/docs/plans/promptfoo-aligned-eval-restructure.md @@ -0,0 +1,521 @@ +# Plan (DRAFT): Restructure AgentV eval authoring to clone promptfoo + +Status: draft for review. Not started. No code changed. 
+ +Sources analyzed (all cloned locally, read-only): +- promptfoo v0.121.17 — `/home/christso/projects/promptfoo-clone` (authoring format — the thing we clone) +- Margin-Lab/evals — `/home/christso/projects/margin-lab-evals` (runner, I/O contracts, workspace, analytics) +- vercel-labs/agent-eval — `/home/christso/projects/vercel-agent-eval` (transcripts, agentic LLM graders, output JSON) +- AgentV today — `packages/core/src/evaluation/**`, `packages/sdk/**`, `docs/adr/**` + +## 0. Goal, scope, non-goals + +**Goal — AgentV's eval contract is a strict SUPERSET of promptfoo.** Adopt promptfoo's `promptfooconfig.yaml` authoring surface verbatim (field names + semantics), then layer AgentV's repo/agent value-adds on top. The success property is: + +> **Any promptfoo config, mechanically snake_cased, is a valid AgentV eval that runs with equivalent semantics.** AgentV additionally accepts more (bare-string asserts, repo/fixture materialization via a built-in extension, `gate`, agentic judges, multi-turn, …) — all through promptfoo-native surfaces (`vars`, `extensions`), not new top-level concepts. + +Two consequences of "superset": +- **Compatibility is one-way, and that is the design** — promptfoo ⊆ AgentV. AgentV extensions that promptfoo lacks (bare-string asserts, the repo-native `workspace.repos` field, agentic judges, etc.) are the superset, not a defect. +- **Superset is a design property, not a shipped importer.** AgentV's snake_case authoring contract *is* promptfoo's contract (snake_cased) + extensions. New evals are authored in snake_case directly — nothing to import. The superset is over *snake_cased* promptfoo; a literal camelCase promptfoo file (`providers:`, `defaultTest:`) would need a mechanical transform (camelCase→snake_case + `providers`→`targets`), but that's a documented one-off (`yq`/script), optionally a tiny helper if promptfoo-suite migration is ever actually needed — **not** a maintained core feature (YAGNI, same as the consolidated export). 
Distinct from the **hard-deprecation codemod**, which migrates AgentV's *own* existing eval files across §deprecation and *is* built. + +Borrow runner/analytics from margin-lab and transcripts/agentic-graders from vercel-agent-eval. + +**In scope.** The eval-file schema (`eval-file.schema.ts`), the parser/config layer, the assertion/grader vocabulary, the templating engine, the run/execution model, transcript normalization, and the output/analytics contract. + +**Non-goals (this plan).** promptfoo `redteam`, promptfoo cloud `sharing`, and promptfoo's SQLite results DB. AgentV keeps the local `.agentv/results/<run_id>/` bundle + Dashboard per the Phoenix product boundary. + +**Hard constraint.** Every conflict below is a reversal of an existing shipped AgentV decision (several are ADR-0013 decisions made within the last week). Each needs an explicit keep/replace call — that is section 2, and it's the part that needs your sign-off before any code moves. + +--- + +## 1. Target authoring format — promptfoo, snake_cased + +This is the format we are cloning. Field names are promptfoo's, mechanically snake_cased.
+ +### 1.1 Top-level keys + +| promptfoo (camelCase) | AgentV target (snake_case) | Notes | +|---|---|---| +| `description` | `description` | already exists | +| `tags` (`Record`) | `tags` (map) | AgentV already moved here (`tags.experiment`) — **aligned** | +| `prompts` | `prompts` | **NEW top-level concept** (see 1.2) | +| `providers` / `targets` | `targets` (canonical); `provider`/`apiId` = backend field | plural matrix axis; top-level promptfoo `providers` is remapped to `targets` **at conversion time** (codemod/one-off), NOT a live alias (avoids overloading AgentV's backend `provider`; see 2.a) | +| `tests` | `tests` | keep; row shape changes (see 1.4) | +| `default_test` (`defaultTest`) | `default_test` | widen from threshold-only (see 1.5) | +| `scenarios` | `scenarios` | **NEW** (see 1.7) | +| `derived_metrics` (`derivedMetrics`) | `derived_metrics` | **NEW** (see 1.7) | +| `output_path` (`outputPath`) | *(map to fixed bundle)* | AgentV writes `.agentv/results/` — keep bundle, accept `output_path` as an extra export sink | +| `env` | `env` | provider env overrides | +| `nunjucks_filters` (`nunjucksFilters`) | `nunjucks_filters` | depends on templating decision (2.f) | +| `extensions` | `extensions` | **canonical lifecycle surface** (`beforeAll`/`afterAll`/`beforeEach`/`afterEach`). `on_run_complete`, `preprocessors`, `workspace.hooks` are REMOVED and fold into this (see 2.l) | +| `metadata` | `metadata` | exists | +| `evaluate_options` (`evaluateOptions`) | `evaluate_options` | widen (see 1.6) | +| `sharing`, `redteam`, `tracing` | — | out of scope / Phoenix boundary | + +### 1.2 `prompts` (NEW — the biggest addition) + +promptfoo separates **prompt templates** (top-level `prompts`) from **data rows** (`tests[].vars`). One test row is rendered through every prompt × every provider. AgentV today has no top-level prompt list — it puts a full message array in each `tests[].input`. 
+ +Forms to support (snake_cased where object keys appear): +- inline string with nunjucks: `"Convert to {{language}}: {{input}}"` +- file ref: `file://prompts.txt` (`---` separates multiple), `file://p.json`, `file://p.yaml` +- file + label: `{ id: file://prompts.txt, label: content_generation }` +- function prompt: `file://prompt.js:func` / `file://prompt.py:func` +- chat array: `[{ role, content }]` +- map form: `{ id: template }` + +### 1.3 `providers` / `targets` (plural matrix) + +promptfoo: `providers` is an **array**; the eval is the matrix `prompts × providers × tests`. `targets` is an accepted alias (promptfoo enforces exactly one of the two and normalizes `targets`→`providers`). + +Forms: string id (`openai:chat:gpt-4o`), object `{ id, label, config, prompts, transform, delay, env }`, map form, function, and protocol providers (`http`, `exec:`, `file://…`, `python:`, `websocket:`, `echo`). + +AgentV already has a rich target provider set (CLI/SDK/codex/copilot/claude/replay/transcript) and per-execution `targets: []`. The work is promoting `providers`/`targets` to a **top-level plural matrix axis** and reconciling with AgentV's `.agentv/targets.yaml` named-target registry (see 2.a). + +### 1.4 `tests` / test case + +promptfoo test row fields → snake_case: +`description`, `vars`, `provider`, `providers`, `prompts`, `provider_output` (`providerOutput`), `assert`, `assert_scoring_function` (`assertScoringFunction`), `options`, `threshold`, `metadata`. 
+ +Key shape changes vs AgentV today: +- add `vars` as the row's data (AgentV has `vars` already, but it's secondary to `input`) +- add `assert` as the canonical grader key (AgentV renamed this to `assertions` — conflict 2.c) +- `provider_output` short-circuits the provider call and grades a fixed output — AgentV has no equivalent +- promptfoo test `id`/`description` is optional; AgentV requires `id` (conflict 2.d) + +### 1.5 `default_test` (widen) + +promptfoo `default_test` is a full test-case-minus-description whose `vars`/`assert`/`options`/`threshold`/`metadata` merge into every row (and `file://` loadable). AgentV's `default_test` today is `{ threshold }` only. Widen it to the full promptfoo merge semantics, plus `options.disable_default_asserts` opt-out. + +### 1.6 `evaluate_options` (widen) + +promptfoo: `cache`, `delay`, `generate_suggestions`, `max_concurrency`, `repeat`, `timeout_ms` (per test), `max_eval_time_ms` (whole run), `filter_range`. AgentV has `{ budget_usd, max_concurrency }`. Superset them; reconcile `repeat` with AgentV's `repeat: { count, strategy, early_exit }` block and margin-lab's `samples_per_case` (conflict 2.g). + +### 1.7 `scenarios`, `derived_metrics`, named metrics + +- `scenarios`: `[{ description, config: [partialTest…], tests: [test…] }]` — cartesian of config groups × tests. **NEW** to AgentV. +- `derived_metrics`: `[{ name, value }]` where `value` is a math expression over named scores or a function. **NEW**. +- **named metrics**: promptfoo's `assert[].metric` (nunjucks-templated) feeds `named_scores`; `assert-set` groups sub-asserts under one metric. AgentV graders use `name` — conflict 2.e. + +### 1.8 Assertions (`assert`) + +Per-assertion fields → snake_case: `type`, `value`, `config`, `threshold`, `weight`, `provider`, `rubric_prompt` (`rubricPrompt`), `metric`, `transform`, `context_transform`. 
+ +promptfoo assertion **type** catalogue is large and flat, each with a `not-` variant, plus `assert-set`, `select-best`, `human`, `max-score`. AgentV has a smaller typed set (`contains`/`equals`/`regex`/`is-json`/`rubrics`/`llm-grader`/`code-grader`/`composite`/`tool-trajectory`/`field-accuracy`/`latency`/`cost`/`token-usage`/`execution-metrics`/`include`). + +Proposed type mapping (this is the crux of conflict 2.c): + +| promptfoo type | AgentV today | Plan | +|---|---|---| +| `contains`/`equals`/`regex`/`is-json`/`icontains`/`starts-with`/`contains-all`/`contains-any` | `contains`/`equals`/`regex`/`is-json` | adopt promptfoo names; add the missing string ops | +| `javascript` / `python` | `code-grader` | accept promptfoo `javascript`/`python`; keep `code-grader` as AgentV superset | +| `g-eval` / `llm-rubric` / `model-graded-*` / `factuality` | `llm-grader` / `rubrics` | adopt `g-eval` for criteria/rubric scoring; keep `llm-rubric` for free-form rubric text; keep agentic judge behavior as an AgentV extension | +| `assert-set` | `composite` | adopt `assert-set`; keep `composite` alias or deprecate | +| `similar` / `similar:*` | *(none)* | **NEW** — needs embeddings provider | +| `latency` / `cost` / `perplexity` / `word-count` | `latency` / `cost` / `token-usage` | align names | +| `trajectory:tool-used` / `:tool-sequence` / `:step-count` / `:goal-success` | `tool-trajectory` | map AgentV's single typed grader onto promptfoo's `trajectory:*` family | +| `webhook` / `classifier` / `moderation` / `guardrails` / `answer-relevance` / `context-*` | *(none)* | evaluate per-need; most are optional | +| — | `field-accuracy`, `execution-metrics`, `include` | AgentV-only extensions to preserve | + +### 1.9 Datasets / CSV + +promptfoo `tests: file://tests.csv` with magic columns (`__expected`, `__expectedN`, `__prefix`, `__suffix`, `__description`, `__provider_output`, `__metric`, `__threshold`, `__metadata:key`, `__config:…`) and the `assertionFromString` mini-DSL. 
AgentV has `imports`/`include`/`select` instead. Plan: support `file://…csv|json|jsonl|yaml|py:func` row loading + the `__expected` column DSL as the promptfoo-compatible dataset path; keep AgentV `imports`/`select` as the suite-composition path. + +### 1.10 Templating + +promptfoo renders `{{var}}` via **nunjucks** into prompt `raw`, `assert.value`, and `assert.metric`; array vars auto-expand into multiple rows; `_conversation` var auto-injected; custom filters via `nunjucks_filters`. AgentV uses `${{ ENV }}` substitution only. This is conflict 2.f — adopting nunjucks (for vars **and** env via `{{ env.VAR }}`) is required for true format parity. + +--- + +## 2. Conflicts — RESOLVED by the naming principle + +**Governing principle (owner decision).** Where a feature is functionally equivalent and semantically the same, **use promptfoo's name/shape** (e.g. `assert`, not `assertions`; `metric`, not `name`). **Keep AgentV's form only where its semantics are genuinely better** — e.g. AgentV's executable `gate` over promptfoo's scalar `threshold`, and AgentV's `repeat: { count, strategy, early_exit }` block over promptfoo's `repeat: <int>`. + +**Deprecation policy: HARD (owner decision).** This ships as a **major version**. Renamed/replaced keys are *removed*, not aliased — no back-compat shims, no soft-deprecation window. `assertions` → removed (use `assert`); `composite` → removed (use `assert-set`); grader `name`-as-metric → removed (use `metric`); `eval_cases` → removed; `tests[].input` / suite `input` → removed (use `prompts`+`vars`, §2.b); `workspace.hooks`/`on_run_complete`/`preprocessors` → removed (use `extensions`, §2.l); `${{ ENV }}` → removed (use nunjucks `{{ env.VAR }}`, §2.f). A one-shot codemod migrates existing eval files; the parser hard-errors on removed keys with a message pointing at the new name.
**Owner note: the churn is acceptable because all of this was introduced recently and is not yet in production** — no external users to migrate, so a clean break beats carrying aliases. + +Applying that principle, the decisions are below (D = decided, ▸ = still a judgment call, tracked in §8). + +### 2.a `targets` first-class SUT; `provider` = harness/backend — SEMANTIC DEPARTURE +- **promptfoo (verified against source + history):** `providers` is the **original, canonical** field. `targets` is a **strict alias added 2024-05-09** (commit `102804f0`, the red-team feature) — `UnifiedConfigSchema` enforces "exactly one of `targets`/`providers`," then `transform`/`readConfig` do `providers = targets; delete targets`. So in promptfoo `targets` ≡ `providers` exactly; the "system-under-test" connotation is *conventional* (redteam domain), not schema-enforced. +- **AgentV's move is a deliberate re-canonicalization, not a mirror.** promptfoo's canonical is `providers` (LLM-endpoint framing). AgentV's domain (evaluating agents/apps) matches the red-team *"target = thing under test"* framing far better, so AgentV elevates `targets` to first-class canonical and demotes `provider`/`apiId` to the "backend kind" vocabulary. Superset holds because the **importer** rewrites promptfoo's top-level `providers:` → `targets:` — NOT via a live alias (a top-level `providers` key would collide with AgentV's existing backend `provider`/sub-provider meaning). Net: we adopt promptfoo's target/`ProviderOptions` object *shape*, keep exactly one live SUT key (`targets`), and re-canonicalize on the name that fits agent-eval. +- **AgentV today:** top-level `target` (singular) is the SUT; `.agentv/targets.yaml` is a named registry; a target has a `provider` field naming the backend kind. +- **AgentV's better semantics (keep, and this is a deliberate departure):** do **not** treat `provider`==`target`. 
Layer them: + - **`target` / `targets`** = first-class **system under evaluation** (agent/model being tested). Canonical name, and the matrix axis. + - **`provider`** = the **harness or LLM backend** inside a target (`openai`/`anthropic`/`claude-code`/`cli`/`replay`/…) — never itself a SUT. +- **D — registry target = promptfoo target schema + AgentV extensions.** `.agentv/targets.yaml` entries adopt promptfoo's target/`ProviderOptions` object shape — `id`, `label`, `config`, `prompts`, `transform`, `delay`, `env` — extended with AgentV fields: `provider` (backend kind), `model`, `use_target`, `fallback_targets`, `grader_target`, `max_budget_usd`, `hooks`. A promptfoo `id: openai:gpt-4o` string decomposes to `{ provider: openai, model: gpt-4o }`. +- **D — canonical `targets`; do NOT accept a top-level `providers` alias (owner).** AgentV already uses `provider` + `apiId` (a provider + **sub-provider** pair, e.g. `anthropic:claude-sonnet` → `{providerName: 'anthropic', apiId: 'anthropic-messages'}`) as the **backend** vocabulary. Introducing a top-level `providers`-as-SUT alias would overload the word with two meanings — reject it. If a raw promptfoo file is ever converted to AgentV, the one-off transform rewrites top-level `providers:` → `targets:` (alongside camelCase→snake_case) — this lives in the conversion/codemod, **not** a runtime alias. AgentV's live schema has exactly one SUT key (`targets`), and `provider`/`apiId`/sub-provider stay unambiguously "backend." A `targets` entry may be a registry name (string) or an inline promptfoo-shaped target object; `use_target`/`fallback_targets`/`max_budget_usd` stay as extensions. +- **Note vs 1.3:** §1.3 said "promote `providers`/`targets`"; the refinement is that **`targets` is canonical and `provider` is demoted to the backend field**, not a top-level synonym. 
+ +### 2.b Top-level `prompts` vs per-test `input` — SIMPLIFY by collapsing `input` +- **promptfoo:** prompt templates are top-level and shared; test rows carry only `vars`. A prompt can be a string OR a chat array `[{role, content}]`, both with `{{var}}`. +- **AgentV today:** each `tests[].input` is a full message array; no shared prompt list. This is a *second* way to express "the messages sent to the target," parallel to promptfoo's `prompts`. +- **D — collapse `input` into `prompts` + `vars` (owner: "agentv can be simplified"):** there should be ONE way to express what's sent to the target. A promptfoo chat-array prompt already covers everything `input` did (multi-turn, roles, `{{var}}`). So: + - Shared/benchmark case → top-level `prompts` (string or chat array) × `tests[].vars`. + - "Each test is unique" (agent tasks) → a passthrough prompt (e.g. `"{{task}}"`) + `vars: { task: … }`, or a per-test prompt override. No bespoke `input` field. + - `input_files`/attachments (file/image content) survive as a **prompt-content** convenience (part of the prompt/vars), not a separate top-level concept. + - **Removes a concept** (hard, major version): `tests[].input` and suite-level `input` go away; codemod rewrites `input:` message arrays into a per-test prompt. +- **Concurrency/workspace model (owner):** workers either **share one workspace** or draw from a **workspace pool**, where a checked-out workspace is **reset to original** between uses (git clean / snapshot restore). This is the reset-based pool, not container-per-instance — see §4. + +### 2.c `assert` vs `assertions`, and grader type names — NAMING (reverses ADR-0013) +- **promptfoo:** key is `assert`; types are `llm-rubric`, `javascript`, `python`, `assert-set`, `model-graded-*`. +- **AgentV today:** commit `d5514b9a` **removed** the `assert` alias and requires `assertions`; types are `llm-grader`, `code-grader`, `composite`, `rubrics`. ADR-0013 explicitly says it does NOT rename `grader`. 
+- **Conflict:** direct naming collision; functionally equivalent → principle says promptfoo wins. +- **D — `assert` is canonical; `assertions` REMOVED** (hard). Reverses ADR-0013; needs a superseding ADR. +- **D — adopt promptfoo type names where equivalent:** `javascript`/`python`, `assert-set` (over `composite`, removed), the string ops (`contains`/`equals`/`regex`/`is-json`/`icontains`/`contains-all`/`contains-any`/`starts-with`), `similar`, `latency`, `cost`, `webhook`. +- **D — keep AgentV grader types that have better/extra semantics** as first-class extension types (no promptfoo equivalent, or strictly richer): `code-grader` (workspace/`cwd`-aware, arbitrary-language subprocess power tool), `tool-trajectory`, `execution-metrics`, `field-accuracy`, `include`. +- **D — execution model (verified against promptfoo; corrects the old §8.3):** promptfoo runs `javascript` **in-process** (`new Function` for inline, dynamic `import()` for `file://`) and shells out **only** for `python` (`PythonShell` subprocess). AgentV matches this: + - **`javascript` → in-process** — live `output`/`context` objects by reference, ~zero overhead. On Bun this is *easier* than Node (Bun `import()`s `.ts` directly, no transpile step). + - **`python` → subprocess** — unavoidable cross-runtime; JSON args in/out. + - **`code-grader` → stays the subprocess power tool** for workspace-`cwd` graders (build/test commands), arbitrary languages, and isolation-sensitive cases. + - **Do NOT desugar `javascript`→`code-grader`** — that would throw away in-process speed. Boundary: *light/pure → in-process `javascript`; heavy/workspace/multi-language → subprocess `code-grader`.* Same trust model both ways (author code is trusted, as in promptfoo). +- **D — split LLM-judge names by semantics, not implementation.** `g-eval` is the criteria/rubric scoring surface. 
`rubrics` (multi-criteria, one judge flow, operators/`score_ranges`) is removed as a type name and folded into `g-eval` as optional structured fields: + ```yaml + type: g-eval + value: string | string[] # promptfoo-compatible criteria + rubrics: rubric_item[] # AgentV structured extension + ``` + `llm-rubric` remains the promptfoo-compatible free-form rubric-text judge. AgentV's agentic/evidence-gathering judge behavior (judge target, workspace/transcript evidence, max steps, preprocessors) remains an AgentV extension instead of being forced into promptfoo's non-agentic `llm-rubric` shape. + Artifact rows are generic AgentV grader output, not a `g-eval`-only feature: every grader returns `EvaluationScore.assertions[]`, the orchestrator flattens those rows into the result and `grading.json.assertions[]`, and nested `graders[].assertions[]` preserves the per-grader breakdown. Deterministic graders usually emit one row; multi-aspect graders emit one row per authored check or result unit. `g-eval`'s structured criteria should use one row per criterion because those criteria are distinct scoring aspects, the same pattern used by field-accuracy fields, execution metrics, and tool-trajectory requirements. + - **Name — RESOLVED (owner Q): short-form criteria and structured rubrics use `g-eval`, not `llm-rubric`.** Promptfoo's `g-eval` and DeepEval's `GEval` both model criteria-driven evaluation; DeepEval also has rubric score bands, making it the better home for AgentV `score_ranges`. 
+ - **Default grading behavior change — approved by owner; here's what changes** when the default judge adopts vercel's judge (§5.2): (1) **skeptical stance by default** ("strict judge; when in doubt, fail") → borderline outputs fail more often, so existing suites may show *lower* pass rates; (2) **evidence-by-path** → for agent/workspace tasks the judge reads the transcript/environment from files instead of a stuffed prompt, enabling tool-using investigation and better judgments on large outputs; (3) fixed `{pass, score, reason}` verdict contract. **Opt-out:** authoring an explicit `prompt` overrides the default skeptical prompt, preserving prior behavior. Implementation must diff the exact current `llm-grader-prompt.ts` wording to quantify the shift before flipping the default. + +### 2.k `assert` short-form (bare strings → rubric) — how to handle it +- **AgentV today** (`grader-parser.ts:394`): bare strings in the `assertions` array are collected and unwrapped into **one** `rubrics` grader (`criteria: [strings]`, `weight = N`), evaluated in a single LLM call at equal weight. promptfoo has no bare-string form — every `assert` entry is an object, and multiple criteria are multiple `llm-rubric` asserts (one call each). +- **Better semantics = AgentV's:** grouping N criteria into one judge call is cheaper and more holistic than N separate calls. Keep the shorthand. +- **D — keep the bare-string shorthand, retarget it to `g-eval`:** bare strings in `assert` desugar to a single grouped `{ type: g-eval, value: [strings], weight: N }` (was `rubrics`; same grouping/weight/single-flow behavior, now under the criteria-eval type). Result: + - `assert: ["is polite", "cites a source"]` → one `g-eval` with `value: [both]`, `weight: 2`. + - `assert: [{type: contains, value: hi}, "is polite"]` → `contains(w=1)` + `g-eval(value:["is polite"], w=1)`. + - promptfoo-style `assert: [{type: llm-rubric, value: "is polite"}]` works unchanged (import parity). 
+ This keeps AgentV's terse authoring + single-flow economy while making the desugared type a promptfoo/DeepEval-aligned name. The explicit form `{ type: g-eval, value: [strings] }` is also accepted, so authors can pick terse or explicit. +- **This is the superset, not a divergence:** bare-string asserts are an AgentV extension on top of promptfoo. promptfoo ⊆ AgentV holds (every promptfoo `assert` is valid AgentV); AgentV simply accepts more. The only obligation: an "export to promptfoo" path must desugar bare strings to explicit `g-eval` objects first. + +### 2.d Test `id` required vs optional +- **promptfoo:** `id`/`description` optional. +- **AgentV today:** `tests[].id` required; it's the flattened `test_id` result identity and `--test-id` CLI filter (ADR-0013). +- **Fact:** promptfoo has **no durable per-test id** — internally it keys on `testIdx` (ordinal) and shows `description`. Run-to-run comparison is fragile (breaks on reorder/edit). +- **D — layer identity into three distinct roles (owner's governance point):** don't overload one field. + - **Content identity — `test_id`** = short content hash of `(prompt + vars + assert)`. Answers "did the content change?"; drives artifact dirs, dedup, drift detection. Changes when you edit the case — correct for *content*. + - **Governance/trend identity — an author-set `tag`/`metadata` key** (e.g. `metadata.case_id` or `tags.case`). Durable across BOTH reorder AND edits — the human declares "this is the same logical test." **This is what the Dashboard keys trend-lines/comparison on** (best practice: stable identity lives in governance metadata, not derived content). Falls back to `test_id`/ordinal when absent. + - **Display label** — precedence `description` → `vars` (e.g. `language=French`) → `Test #`. + - So: content-hash = "is it the same bytes," metadata/tag = "is it the same logical test," description/vars = "what a human reads." 
A promptfoo file with none still renders + filters; governance stability is opt-in via a tag/metadata key. + +### 2.e Named metrics: `metric` vs grader `name` +- **promptfoo:** `assert[].metric` (nunjucks-templated) names a score; `derived_metrics` computes over them; `assert-set` groups. +- **AgentV today:** graders carry `name`; no `derived_metrics`; aggregation via `weight`/`required`/`min_score`. +- **D — prefer promptfoo `metric`:** `metric` is the named-score field (nunjucks-templated); `name` becomes display-only/alias. Add `named_scores` + `derived_metrics` to the result contract. Keep `weight`/`required`/`min_score` as AgentV extensions (richer aggregation). + +### 2.f Templating engine: nunjucks for BOTH vars and env (promptfoo-native), replacing `${{ ENV }}` +- **promptfoo:** nunjucks `{{ }}` everywhere + array-var row expansion + custom filters. **Env vars too** — `{{ env.VAR }}`, rendered at **config-load time before validation** (`load.ts:336`, `providers/index.ts:94`, docs `modular-configs.md`), so env works in paths/configs. NOT a `${ }` sigil. +- **AgentV today:** `${{ ENV_VAR }}` env substitution in target configs only. +- **Conflict:** different delimiters; nunjucks is the shared standard → principle says promptfoo wins for env *and* vars. +- **D — one engine (nunjucks), phase-separated by render pass + namespace (owner: adopt promptfoo-native env):** + - **`{{ var }}`** — **eval-time** template vars, for `prompts`/`vars`/`assert.value`/`assert.metric`, + array-var row expansion + `nunjucks_filters`. + - **`{{ env.VAR }}`** — **config-time** env, rendered at **load-time before validation** (promptfoo-native), usable in target configs/paths. Defaults via `{{ env.VAR | default('x') }}` (replaces `${ENV:-default}`). 
+ - **No `${ENV}` sigil** (reversed earlier decision): promptfoo uses `{{ env.X }}`, so adopting it keeps **one templating engine**, superset-compat (promptfoo env configs run unchanged), and the config-time/eval-time split is handled by *when* each is rendered + the `env` namespace — not a second sigil. + - **Key correctness win (owner insight): `{{ env.VAR }}` does not collide with runtime shell `${VAR}`.** CLI targets run shell commands whose `command` may contain `${VAR}`/`$VAR` that must reach the **shell at runtime in the subprocess**, untouched. A `${ENV}` config sigil would clobber those (ambiguous: resolve-now vs expand-later). With `{{ env.VAR }}`, AgentV resolves its own env at load-time and passes `${VAR}` through verbatim for the shell to expand — two non-overlapping namespaces. + - Codemod rewrites `${{ X }}` → `{{ env.X }}`. + - Inline mixing is nunjucks' primary mode: `--flag {{ myvar }}` (eval var), `--key {{ env.MY_KEY }}` (AgentV env), while `$SHELL_VAR` in a command stays for the runtime shell. + +### 2.g `repeat` block vs `samples_per_case` vs promptfoo `repeat:int` +- **promptfoo:** `evaluate_options.repeat` = integer (naive re-run). +- **AgentV today:** `repeat: { count, strategy: pass_any|pass_all|mean|confidence_interval, early_exit, cost_limit_usd }`. +- **margin-lab:** `samples_per_case` (int) → expands to N independent instances; pass@k computed in analytics. +- **D — keep AgentV `repeat` block (better semantics):** it is strictly more expressive than `repeat: <int>`. Map promptfoo's `repeat: <int>` → `repeat.count` with `strategy: pass_all`. Implement via margin-lab-style **instance expansion** (§4). This is an explicit "AgentV wins" case under the principle. + +### 2.h Output store: bundle vs promptfoo SQLite +- **promptfoo:** SQLite `~/.promptfoo/promptfoo.db` + optional `output_path` file. +- **AgentV today:** `.agentv/results/<run_id>/` bundle (ADR-0011/0012) + Dashboard. 
+- **D — keep AgentV bundle as the single source of truth (better semantics + product boundary):** aligns with margin-lab's on-disk `results.json` + per-instance dirs and the Phoenix boundary. Do not adopt the SQLite DB, and do **not** maintain a consolidated single-file export (§6.0, YAGNI). promptfoo `output_path` (json/jsonl/csv/yaml) can be produced *on demand* from the bundle if a user asks, but is not a first-class artifact. + +### 2.j `threshold` (per-test) + `gate` (release policy) +- **promptfoo:** scalar per-test/`default_test` `threshold` only; no release gate. +- **AgentV today:** per-test/`default_test` `threshold` **and** an executable `gate` (`min_test_pass_rate`, `max_execution_errors`, command over run JSON). +- **D — keep both:** adopt promptfoo `threshold` (same concept, per-test score cutoff) **and** keep AgentV `gate` (better semantics for release gating; no promptfoo equivalent). Different levels, both stay. + +### 2.i AgentV-only fields promptfoo lacks (preserve, don't lose) +`gate` (executable release policy), `imports`/`include`/`select`, multi-turn **evaluation** layer (`turns` per-turn assertions, cross-turn `aggregation`, `on_turn_failure`; execution via promptfoo `_conversation`; `window_size` dropped — §3), `depends_on`/`on_dependency_failure`, `conversation_id`, `requires`, replay/transcript providers, code-grader SDK. **All preserved as documented AgentV extensions** (section 3). (Workspace, `on_run_complete`, `preprocessors` are handled by 2.l, not kept as-is.) + +### 2.m `expected_output` vs `vars.expected_output` — KEEP first-class golden answer +- **promptfoo:** there is no first-class test-case `expected_output` field. Expected values usually live in `assert[].value`, CSV/XLSX `__expected*` columns that generate assertions, or ordinary vars such as `reference_answer` used by `default_test.assert.value`. +- **DeepEval:** `LLMTestCase` and `Golden` both carry `expected_output` as the ground-truth/golden answer field. 
+- **AgentV today:** `expected_output` is a passive reference answer passed to LLM graders, code graders, field-accuracy, custom assertions, and prompt templates. It also makes a case gradeable without `criteria`; when no `assertions` are declared, the implicit default LLM grader uses the case context including `criteria` and `expected_output`. When `assertions` are present, declared graders are the complete grader list, so `expected_output` remains data and does not add an implicit grader. +- **D — keep `expected_output` first-class; do not move it wholesale into `vars`.** Golden answers should not be target prompt variables by default because that risks leaking benchmark labels into the system under test. `vars.expected_output` remains an ordinary promptfoo-style variable and never triggers the implicit default grader by itself. The first-class `expected_output` field is the evaluation-context signal: it is available to graders, can satisfy "this case has something to grade," and can drive the default judge only when no explicit `assert`/`assertions` list is present. +- **Import/conversion rule:** promptfoo `__expected*` columns convert to explicit assertions; promptfoo regular columns such as `reference_answer` or `expected_output` stay in `vars` unless a user or importer intentionally lifts them to AgentV `expected_output`. DeepEval and AgentV-native imports map `expected_output` to the first-class field. + +### 2.l Workspace repos = declarative FIELD (harness-materialized); extensions only for agent-rules/setup + +> **REVISED (finalized in ADR-0016 pt10 / ADR-0017).** The text below originally proposed repo materialization via `vars.workspace` + an `agentv:workspace` *extension*. That is **superseded**: **repo provisioning is a declarative `workspace.repos` field the harness materializes BEFORE hooks** (all 4 benchmark frameworks treat provisioning as harness-core; promptfoo has no workspace to align with). 
Extensions (`beforeAll`/…) are only for **non-provisioning** pluggable setup — `agentv:agent-rules` (skills/hooks/agents staging) + custom `file://` hooks — running after materialization. `isolation` is a `workspace` field, not a hook choice. `on_run_complete`/`preprocessors` still removed → `extensions`. +- **Principle (owner decision):** don't invent a top-level `workspace:` block, and don't keep AgentV-specific lifecycle keys. Align maximally with promptfoo. Both reference frameworks agree workspace **is part of the dataset** — vercel: a case *is* a fixture dir; margin-lab: a case *is* a Docker image + tests. +- **D — one lifecycle surface: promptfoo `extensions`.** `beforeAll`/`afterAll`/`beforeEach`/`afterEach`. **Remove** `on_run_complete` (= `afterAll`), `preprocessors`, and `workspace.hooks` — they collapse into `extensions` (hard, major version). +- **FINAL (ADR-0016 pt10 / ADR-0017):** repo provisioning is a declarative **`workspace` field** (durable name — CI-standard `GITHUB_WORKSPACE`, margin, git/Cargo/Bazel; not `sandbox`/`environment`/`testbed`), suite-level default + per-test override. Provenance only; acquisition is a pluggable harness resolver (machine config, keyed on `repo`); hooks → `extensions`. + ```yaml + targets: file://targets/reviewer.yaml + workspace: # suite default; tests[].workspace overrides per case + repos: + - path: ./CargoWise + repo: https://github.com/WiseTechGlobal/CargoWise.git + commit: 953adb9 # immutable SHA (base_commit input alias); sparse/ancestor optional + isolation: fresh # fresh (default) | pooled | shared + extensions: # non-provisioning setup only + - agentv:agent-rules:beforeAll + tests: file://cases.yaml + ``` +- **Never in the schema:** acquisition (resolver/backends → machine config) and hooks (→ extensions) — keeping them out is what makes the schema durable. Custom acquisition = a registered resolver backend or a `beforeAll` escape hatch (ADR-0017 pt5). + +--- + +## 3. 
AgentV-only features to preserve as extensions + +These have no promptfoo equivalent and are AgentV's differentiation. Keep them, document them as extensions layered above the promptfoo-compatible core: + +- **Repo provisioning — a declarative `workspace.repos` field the harness materializes before hooks** (ADR-0016 pt10 / ADR-0017; git repos at pinned commits + resolver backends + mirror cache; `isolation` field). NOT an extension. `agentv:agent-rules` + custom `file://` hooks remain extensions for non-provisioning setup. +- **Executable `gate`** release policy (`min_test_pass_rate`, `max_execution_errors`, command receiving run JSON). +- **Agent target providers**: CLI/SDK/codex/copilot/claude/replay/transcript, `use_target` indirection, `fallback_targets`, `grader_target`. +- **First-class `expected_output`** golden/reference answers (DeepEval-aligned). Keep them distinct from `vars` so gold labels are grader context, not target prompt variables, unless the author explicitly duplicates them into `vars`. +- **Code-grader SDK** (`@agentv/sdk`): `define_assertion`/`define_code_grader`/`define_workspace_grader`/`define_vitest_workspace_grader`. +- **Multi-turn — KEEP the evaluation layer, split execution out (has real provenance).** Researched under **agentv#1053** in `agentevals/agentevals-research/research/findings/multiturn-conversation-eval/` against inspect-ai, google-adk, ragas: + - `on_turn_failure: stop/continue` ← inspect-ai solver `state.completed`; per-turn assertions gating continuation ← inspect-ai `await score(state)`; scripted-vs-LLM-driven turns ← google-adk `conversation` vs `conversation_scenario`. + - **per-conversation `aggregation` (mean/min/max) is a deliberate AgentV gap-fill** — the research found inspect-ai/ragas/**promptfoo** aggregate only across *epochs/samples*, never within one conversation. Dropping it and relying on promptfoo's `_conversation` would reintroduce that documented gap. 
+ - **Decision (per the research's own recommendation): split execution from evaluation.** Conversation *driving* → promptfoo `_conversation` + chat-array prompts + session providers (execution). **Keep** AgentV's *evaluation* layer: per-turn assertions, cross-turn `aggregation`, `on_turn_failure`. **Drop `window_size`**: it kept system + the last N turns of history per turn (context/cost control) — but in the `_conversation` model the author controls history windowing directly in the prompt template (slice `_conversation`), so a dedicated field is redundant (also no framework pedigree). This is distinct from the trials-axis `repeat`/pass@k reducer (§4/§6) — turn-aggregation and trial-aggregation are different axes. **Documented in ADR-0015.** +- **`depends_on` DAG, `imports`/`select` suite composition.** +- **Trajectory / execution-metrics / field-accuracy** graders. + +Removed (folded into promptfoo `extensions`, see 2.l): top-level `workspace:` block, `workspace.hooks`, `on_run_complete`, `preprocessors`. + +--- + +## 4. Runner & execution model — borrow margin-lab's *ideas*, stay laptop-first + +**Owner decision: laptop-first, no persistent store.** Borrow margin-lab's *concepts* (instance expansion, pass@k, infra-only retry) but NOT its DB-backed lease/heartbeat/reaper scheduler — that machinery exists for margin's Postgres/distributed target and would violate AgentV's "zero-infra local to CI" guardrail. + +- **Instance = unit of work.** Expand `(prompt × target × test × sample)` into flat **instances** at compile time. `samples_per_case`/`repeat.count` → `instance_key = "<case_id>#<sample_index>"` (margin-lab's `BuildInstanceKey`). Subsumes AgentV's current repeat handling and gives pass@k for free. +- **Simple in-process worker pool**, `max_concurrency` = worker count. No lease/heartbeat/store. +- **Resumability via the run-index JSON, not a store (owner):** each instance's status is tracked in `index.jsonl` as it completes. 
"Rerun failed" = read the run index, filter failed/errored `test_id`s, re-run just those into the same/new bundle (`--rerun-failed <run_id>`). Cheap, laptop-native, no DB. +- **Workspace: separate *materialization* from *reuse/pooling* (owner clarification, ties to §2.b/§2.l).** + - **Materialization = always-on core (local + CI):** acquire repo(s) via a resolver + **git-mirror cache** (parity example's `git_cache.mirrors`), then check out into the workspace. Mirror caching makes even a *fresh* workspace fast — so **clone speed is decoupled from reuse**. + - **Pooling = reuse a workspace + quick-reset between cases** (git clean / snapshot restore). Needed *only* when reusing. It's a **performance optimization** that amortizes expensive *setup* (checkout + setup scripts + installed state), **mainly for local evals** (fast iteration, limited parallelism, large repos). CI usually prefers fresh-per-case for correctness and relies on the mirror cache for speed. Trade-off: pooling risks state leakage if reset is imperfect (isolation-safety vs speed). + - **Three isolation levels:** `shared` (one workspace for all cases) · **pooled** (N reused, reset between) · **fresh per-case** (new each time, margin-lab style). + - The `agentv:workspace` extension owns materialization + reset; `beforeAll` = shared, `beforeEach` = per-case (fresh, or pooled+reset). **Note: this bullet is superseded by the REVISED §2.l (ADR-0016 pt10 / ADR-0017)** — materialization is harness-core, driven by the declarative `workspace.repos` field before any hooks run, and the isolation level is the `workspace.isolation` field (`fresh` | `pooled` | `shared`), not a hook choice. +- **Retry = infra-only.** Test failures are valid graded outcomes; only infra failures requeue (`retry_count`). Adopt margin's `domain.NextRunState` state machine (run `completed` unless `infra_failed > 0`). +- **Per-instance hard timeout** covering setup+agent+grade (`instance_timeout_seconds`). +- **Caveat:** margin-lab's `fail_fast` is declared-but-inert — if we want fail-fast, implement it (don't copy the dead field). + +Borrow the *instance model + state machine + analytics*, not the store or the in-container agent-server HTTP host (AgentV already has CLI/SDK providers). + +--- + +## 5. 
Transcripts & agentic graders — borrow vercel-agent-eval + +### 5.1 Two-layer transcript (aligns with AgentV ADR-0008) +- Keep **raw** agent-native transcript (`transcript-raw.jsonl`) AND a **normalized** `transcript.json` with a canonical cross-agent **`tool_name` enum** (`file_read`/`file_write`/`file_edit`/`shell`/`web_fetch`/`web_search`/`glob`/`grep`/`list_dir`/`agent_task`/`unknown`) and a precomputed **`transcript_summary`** (`total_turns`, `tool_calls` map, `files_read`/`files_modified`, `shell_commands`, `web_fetches`, `errors`, `thinking_blocks`). +- **Inline the summary** into per-instance `result.json` for cheap trajectory assertions (feeds AgentV's `tool-trajectory`/`execution-metrics` graders directly). +- Per-agent parsers routed by agent id (vercel's `AGENT_PARSERS`). + +### 5.2 Agentic LLM judge (evidence-by-path) +Borrow vercel's judge model, which is stronger than a prompt-stuffed rubric: +- The judge is a **re-invoked agent in the same workspace** that reads evidence **by path** (transcript file / final environment) rather than having the transcript stuffed into the prompt. Maps onto AgentV's existing `llm-grader`/`code-grader` with `target` + `max_steps`. +- Two **subjects**: `environment` (inspect final workspace state) and `transcript` (read the transcript) — matches AgentV workspace vs trace grading. +- **Framework-owned skeptical prompt**, tiny verdict contract `{ pass, score?, reason }` (author supplies only a criterion string). Adopt this as the default `llm-grader` prompt. +- **Judge pinning** knob: `grader_target` = `{ agent?, model }` with self-grade default. AgentV already has `grader_target`; formalize the `{model}`-required pinning for apples-to-apples comparison. +- **Gap to fix vs vercel:** they capture no token/cost — AgentV already does; keep it. 
+ +### 5.3 `grading.json` contract — reconciled with agentskills (owner-flagged main risk) +The output contract originates from agentskills' [evaluating-skills](https://github.com/agentskills/agentskills/blob/main/docs/skill-creation/evaluating-skills.mdx). Its `grading.json` is: +```json +{ "assertion_results": [ { "text": "…", "passed": true, "evidence": "…" } ], + "summary": { "passed": 3, "failed": 1, "total": 4, "pass_rate": 0.75 } } +``` +i.e. **per-assertion `passed` (boolean) + `evidence`, and a top-level `summary` of counts. There is NO overall `verdict` in agentskills.** + +AgentV's current `grading.json` = `EvaluationScore` (`graders/types.ts`) is a **superset**: +``` +score: number (0-1) # fractional (weighted / rubric) — AgentV addition over agentskills' binary +verdict: 'pass' | 'fail' | 'skip' # AgentV addition (NOT in agentskills) +assertions: [{ text, passed, evidence? }] # ≡ agentskills assertion_results — exact match ++ scores? (child graders), details?, graderRawRequest?, tokenUsage?, graderTarget? +``` + +- **Per-assertion: no change.** `{ text, passed, evidence }` already matches agentskills exactly. (Nit: align array key `assertions` → **`assertion_results`** and add the agentskills **`summary`** counts, since agentskills is the source and we're pre-production.) +- **Overall result: keep a STRING, not a boolean (owner Q resolved).** A boolean `passed` can't express **`skip`** (not-run / dependency-skipped) and doesn't pair with a fractional **`score`**. So keep `verdict: 'pass'|'fail'|'skip'` + `score: number`. `verdict` was an AgentV addition, kept deliberately for the skip state + fractional scoring — this is a "keep AgentV, better semantics" call, not an agentskills field. +- **Assertion rows are generic (the actual risk):** every AgentV grader returns `assertions[]`; the run artifact flattens them into `grading.json.assertions[]` and also keeps `grading.json.graders[].assertions[]` under each grader. 
A deterministic string assertion usually emits one row, while multi-aspect graders can emit several rows (field-accuracy per field, execution-metrics per configured metric, tool-trajectory per tool/sequence requirement, code-grader per script-emitted assertion). For structured `g-eval`, the judge's `{pass, score, reason}` maps as `pass`→per-criterion `passed` (+ rolls up to `verdict`), `score`→`score`, `reason`→ per-criterion **`evidence`**. Multi-criteria `g-eval` / bare-string batch §2.k must therefore emit **one assertion row per criterion** as an instance of the generic grader assertion-row contract, not one lumped reason. +- **Structured rubric inputs stay.** Promptfoo's `g-eval` supports string or string-array criteria and averages aggregate results; DeepEval's `GEval` supports rubric score bands but still stores one metric score/reason. AgentV's `rubrics` shorthand is a structured authoring convenience that unwraps to `g-eval` while preserving per-criterion `weight`, `operator`, `score_ranges`, and `min_score`; do not collapse it to text-only parity. `code-grader` likewise keeps its structured stdin payload and `config`. +- **Keep `evidence` in `grading.json`.** An SDK comment (`packages/sdk/src/schemas.ts`) nudges evidence toward the trace; override — `grading.json` stays the self-contained verdict record (transcript still in the trace). +- **Consistency:** define `verdict`/`score`/`passed` derivation once (e.g. `verdict = score >= threshold`; `skip` orthogonal; weighted roll-up via `weight`/`min_score`) + a golden `grading.json` test so they never disagree when the default judge prompt flips. +- **Net:** the public criteria/rubric rename points to `g-eval`; the real work is the verdict→`assertion_results[]` mapping + the naming alignment (`assertion_results`/`summary`), on the existing `EvaluationScore` — no new contract. + +--- + +## 6. 
Output artifacts & analytics + +### 6.0 Canonical output format — reviewed across all four frameworks (owner Q) +The subagents reviewed every reference's output format. Verdict: **split artifacts + a top-level aggregate is the canonical, cleanest, most expressive shape — NOT one giant `results.json`.** + +| Framework | Shape | Aggregate | +|---|---|---| +| **promptfoo** | consolidated: SQLite DB + one export file (`EvaluateSummaryV3`) | in the single file | +| **margin-lab** | **split**: per-instance `instances/<instance_id>/result.json` + trajectory + logs | top-level `results.json` (`Summary`) | +| **vercel-agent-eval** | **split**: per-run `result.json` + `transcript.json` + `transcript-raw.jsonl` + `outputs/` (transcript by path, summary inlined) | per-eval `summary.json` | +| **agentskills** | **split**: per-case `grading.json` + `timing.json` | `benchmark.json` (counts) | +| **AgentV today** | **split**: `run-N/{grading.json, timing.json}`, transcript by path, `outputs/answer.md`, `file_changes` | run-root `summary.json` + `index.jsonl` | + +**Decision (owner): best-of-each hybrid — split bundle, queryable-on-the-filesystem, no DB.** Each part comes from the framework that does it best: + +- **Aggregate + queryability ← margin-lab (owner's pick).** The top-level **`summary.json` is a rich, self-contained, `jq`-queryable `Summary`** in margin-lab's shape — `run_id`, `status` breakdown, per-case **pass@k** (`pass_count`/`pass_rate`), per-instance summaries, `usage`, infra-failure taxonomy. You can query the whole run from one file with no database (margin-lab's key strength). Plus **`index.jsonl`** (one row per case) for streaming/line-wise queries (`jq`/`grep` per line) that scale better than a single fat file. AgentV's current top-level `summary.json` must be *widened* to margin's `Summary` richness so it's genuinely queryable, not just a manifest.
+- **Transcript + tool calls + metrics ← vercel (owner's pick).** Two-layer transcript (raw + normalized), canonical **`tool_name` enum**, and a precomputed **`transcript_summary`** (tool_calls, files_read/modified, shell_commands, web_fetches, thinking) — **inlined into each result row** so `tool-trajectory`/`execution-metrics`/pass@k read metrics cheaply without parsing the transcript. Transcript itself referenced **by path** (§5.1). This is where AgentV's trajectory/metrics graders get their signal. +- **Per-assertion grading ← agentskills.** `grading.json` = `assertion_results[{text, passed, evidence}]` + `summary` counts, plus AgentV's `verdict`/`score` superset (§5.3). +- **No maintained consolidated single-file export (owner: YAGNI).** Since the split bundle is the source of truth, we do **not** ship or maintain a promptfoo `EvaluateSummaryV3` file. If some external tool ever needs it, it can be **generated on demand** from the bundle — but it's not a first-class artifact. (We still adopt promptfoo-shaped `named_scores`/`derived_metrics` *inside* the split rows, because those feed the Dashboard — that's not a consolidated file.) + +So: **split detail** (per-case/per-attempt dirs, transcript by path) + a **margin-style queryable aggregate** (`summary.json`) + **row-per-case `index.jsonl`** + **vercel transcript/metrics** + **agentskills grading**. No DB, no maintained consolidated file; the filesystem is the query surface. + +- **Don't over-split.** Keep the agentskills-aligned split (`grading.json`, `timing.json`), transcript-by-path, and the inlined `transcript_summary`; don't add more sidecars. Remove deprecated/duplicate artifact paths (see §10, e.g. `isDeprecatedTraceArtifactPath`). 
+ +### 6.0.1 Bundle layout & naming — resolve the index/manifest drift (ADR-0012 refinement) +Current drift (verified): the per-case index file is `index.jsonl` (`RESULT_INDEX_FILENAME`) but `summary.json` references it as **`manifest_path`** and the resolver is `resolveExistingResultManifestPath()` — "index" and "manifest" name the *same* file. Fix by giving three distinct roles distinct names, and adopt margin-lab's `internal/` folder (dot-prefixed to match AgentV's "dot = skip discovery" convention): + +``` +.agentv/results/<run_id>/ + summary.json # run-level queryable aggregate (margin `results.json` role) — root, human-facing + <case_id>/… # per-case detail dirs (the discoverable children) + .internal/ # machine-coordination (margin's internal/ + AgentV dot-prefix skip) + index.jsonl # row-per-case pointer table (moved off the root) + progress.json # live progress + events.jsonl # state/event stream + bundle.json # frozen resolved config (optional; the "manifest") +``` +Decisions: +- **`index.jsonl` stays JSONL** (append-as-you-go for live progress + `--rerun-failed`; streamable; line-queryable). NOT `index.json` (would force whole-file rewrites, not streamable). +- **Filename considered:** `index.jsonl` (kept) vs `rows.jsonl` vs `manifest.jsonl`. `manifest.jsonl` **rejected** — "manifest" is reserved for the frozen `bundle.json`, and margin uses `manifest.json` for run metadata (reusing it re-creates the drift we're fixing). `index` is precise (each line is a *pointer* into per-case detail — the queryable entry point). `rows.jsonl` is the only acceptable fallback if the singular-file `index.jsonl` vs plural-folder cross-run `.indexes/` overlap is deemed confusing. +- **Rename the reference `manifest_path` → `index_path`** (+ `resolveExistingResultIndexPath`) so file and field agree. Reserve **"manifest"/`bundle.json`** for the frozen-config file only. +- **`summary.json`** = the queryable aggregate (do not call it a manifest).
+- Move machine files (`index.jsonl`, `progress.json`, `events.jsonl`, `bundle.json`) into per-run **`.internal/`**; keep the run root clean (`summary.json` + per-case dirs). Cross-run `.indexes/`/`.cache/` at the results root are a separate scope and stay. +- **Merge `timing.json` into `metrics.json` (owner Q; fixes a real duplication).** Prior discussion: ADR-0011 & ADR-0012 both defined `metrics.json` *and* `timing.json` as separate per-attempt sidecars (`metrics_path` + `timing_path`), and both are written today. But the split is muddy — `timing.json` (`TimingArtifact`) already carries `total_tokens`, `cost_usd`, `token_usage`, `usage_sources` (perf metrics, not just timing), while `metrics.json` carries trace-derived execution/trajectory metrics. **No reference splits them** (agentskills folds tokens+duration into one `timing.json`; vercel/margin keep one per-attempt metrics/result blob). **Decision: one `metrics.json`** per attempt with sections — `duration` (timing), `tokens`, `cost` (always present), and `execution`/trajectory tool-call metrics (present when a trace exists). Drop `timing.json` and the `timing_path` field; keep a single `metrics_path`. The output-contract ADR supersedes the 0011/0012 timing+metrics split. + +- **Keep** `.agentv/results/<run_id>/` bundle (ADR-0011/0012). Reconcile field names with margin-lab's `results.json` `Summary` where useful. +- **Analytics = one pure function** (margin-lab's `runresults.Build`): given instances+results, produce a deterministic `Summary` with per-case **pass@k** (`pass_count`/`pass_rate` over samples), `status` breakdown, `usage` aggregation, and infra-failure taxonomy. AgentV currently lacks pass@k/variance — this fills it. +- Add promptfoo-shaped **`named_scores`** + **`derived_metrics`** to per-result rows (feeds Dashboard Tags/metrics tabs). +- Reference transcripts **by relative path** in the result row (vercel), never inline the full transcript.
+- **No maintained consolidated export** — the split bundle is the single source of truth; a promptfoo-shaped single file can be generated on demand if ever needed, but is not shipped (§6.0). + +--- + +## 7. Phasing + +1. **Schema spike** — write the snake_cased promptfoo Zod schema alongside the existing one; land conflict decisions from §2 as an ADR (supersede/annotate ADR-0013). *No behavior change.* +2. **Templating** — introduce nunjucks + array-var expansion behind the new schema. +3. **Prompts × providers matrix + instance expansion** — compile step producing flat instances; adopt lease-based scheduler (§4). +4. **Assertion vocabulary** — promptfoo types + `assert`/`assert-set`/`metric`; keep AgentV graders as extensions; add `similar` (embeddings) if wanted. +5. **Transcript normalization + agentic judge** — canonical `tool_name` enum, `transcript_summary`, evidence-by-path judge, judge pinning (§5). +6. **Analytics** — pure `Build` summary with pass@k, `named_scores`, `derived_metrics` (§6). +7. **Datasets/CSV** — `file://…csv` + `__expected` DSL. +8. **Docs + migration** — dual-format support window; codemod old `assertions`→`assert` etc.; dogfood with live provider + real LLM grader (per `.agents/verification.md`). + +Each phase is a reviewable PR; §2 decisions gate phase 1. + +--- + +## 8. Remaining judgment calls + +The naming principle + hard-deprecation + the **superset goal** resolve 2.a–2.m. Note the owner scoped the superset pragmatically: it holds over the **implemented** promptfoo surface; unimplemented exotic assertion types and `redteam` are an honest documented gap, not silently accepted (items 1–2). The remaining calls, now all resolved: + +1. **Assertion parity scope — RESOLVED (owner).** Launch-real set: string ops, `javascript`, `python`, `g-eval`, `llm-rubric`, `latency`, `cost`, token/execution metrics, `tool-trajectory`, `field-accuracy`, **and `similar`** (requires a configured embeddings provider — new provider config surface). 
**Do NOT stub the rest** — `context-*`, `moderation`, `guardrails`, `answer-relevance`, `classifier`, `perplexity` are **future scope** (unimplemented, not silently accepted). Consequence: the schema does NOT accept every promptfoo type; superset holds over the *implemented* surface, and unimplemented types are an honest documented gap (handled by the general unknown-type policy, next item). +2. **Redteam — RESOLVED (owner): treat like any other unrecognized field.** No special-casing. `redteam:` follows AgentV's general unknown-field policy. NB: the eval-file schema is `.strict()` today (unknown top-level keys are rejected), so a promptfoo `redteam` config errors like a typo unless/until that policy is loosened. Documented superset caveat: **superset excludes redteam** (and other unimplemented exotic types). +3. **`code-grader` vs `javascript`/`python` — RESOLVED (§2.c):** distinct execution paths — `javascript` in-process, `python` subprocess, `code-grader` the subprocess power tool. Not desugared. + +**All §8 items are now resolved.** Phase 1 (schema + superseding ADR + codemod + camel→snake importer) can start. The extensions/workspace slice already has an implementation-ready plan in **PR #1592** (see §9). + +--- + +## 9. Reconciliation with PR #1592 (extensions/workspace slice) + +PR #1592 (`docs/plans/2026-07-01-001-feat-promptfoo-compatible-extensions-plan.md`) is an implementation-ready plan for the **extensions + workspace-removal** slice. It is well-structured and **largely consistent** with this plan — treat it as the **first implementation slice** of §2.l/§4, not a competing design. 
Confirmed-aligned: promptfoo `file://path:function` extension refs + four hook names (U2); hook-name selects phase; remove core `workspace`, hard (KTD5, AE4); typed extension outputs instead of env-var side channels (KTD3); JS/TS in-process first, Python via code-grader subprocess discipline (KTD4); canonical AgentV run bundle preserved (R8); snake_case docs (U7); adds a skills extension (U5) — a good addition this plan didn't cover, **renamed to `agent-rules`** (see amendment 5 below). + +**Amendments needed to align #1592 with the decisions above:** + +1. **Workspace spec location — the main divergence.** #1592 routes workspace config through dedicated `extensions/workspace.config.yaml` files (extension-owned config). This plan's §2.b/§2.l decision (owner: "workspace is part of the dataset") puts the per-case spec in **`vars.workspace`** (dataset data), consumed by the extension. Reconcile: **global/shared config in a config file is fine; the per-case spec should be expressible as `vars.workspace`** so it rides the dataset (and `beforeEach` reads it from test context). Amend #1592 U2/U4 to consume `vars.workspace`, not only a config file. +2. **Built-in + auto-registered vs bring-your-own `file://`.** #1592 has users reference local `file://extensions/workspace.ts`. This plan wants a **shipped, auto-registered `agentv:workspace` / `agentv:agent-rules`** built-in (zero-config, overridable) so the common case needs no copied script. Amend #1592 to add a built-in `agentv:` reference scheme alongside `file://` (keep `file://` for custom). +5. **Rename `skills` → `agent-rules` (owner).** The staging extension isn't skills-only — it stages **skills, hooks, subagents/agents, and other agent rules** into the workspace. 
Rename the built-in to **`agentv:agent-rules`** (kebab — identifier token, like grader types `llm-rubric`/`tool-trajectory`; NOT snake_case), the package to `packages/extensions/agent-rules`, and the provider context field from `skill_paths` to `agent_rules_paths` (snake_case — this IS a data field). `skills` survives only as one *kind* of agent rule, not the extension name. + - **Naming convention (general):** identifier/reference tokens are **kebab-case** (`agentv:agent-rules`, `llm-rubric`, `is-json`, `assert-set`); data fields/keys are **snake_case** (`agent_rules_paths`, `vars.workspace`, `max_budget_usd`); npm packages are kebab. +3. **Isolation model.** #1592 treats `isolation: per_case` as extension config returned by the workspace extension. This plan derives shared-vs-per-case from **which hook** (`beforeAll` vs `beforeEach`) and uses a **reset-based workspace pool** (§4). Reconcile the two: hook selects shared/per-case; pool+reset is the mechanism; `isolation` config, if kept, must not contradict the hook. +4. **Sequencing vs the wider restructure.** #1592 cites ADR-0013 as authority and proposes ADR-0014. This plan *reverses* parts of ADR-0013 (`assert`, grader names, `input` removal — §2.c/§2.b). #1592 doesn't touch those, so no direct conflict, but ADR-0014 should note the broader superseding ADR is coming so it doesn't re-entrench `input`/`assertions`. + +**Verdict:** reasonable and mergeable as the extensions/workspace slice. **Amended (2026-07-02)** — an "Amendments (agreed)" section was added to the #1592 doc capturing A1–A6: hook-derived isolation + reset-based workspace pool (drop the `isolation` config knob), per-case spec in `vars.workspace`, built-in auto-registered `agentv:workspace`/`agentv:agent-rules` scheme alongside `file://`, grading contract unchanged (`EvaluationScore`), ADR-0014 sequencing note, and **rename `skills`→`agent-rules`** (stages skills + hooks + agents + rules). No rewrite required. + +--- + +## 10. 
Simplification / dead-code cleanup (hard deprecation — no back-compat) + +Since this is a major version with nothing in production, **remove** the accumulated `@deprecated` aliases and legacy shims rather than carry them. Verified inventory (grep across `packages/core/src`, `packages/sdk/src`, `apps/cli/src`), grouped by how to handle it. + +**A. Pure deprecated aliases / shims — delete now (safe, self-contained, not entangled with the restructure):** +- `script` alias for `command` in workspace hooks/scripts — `types.ts:218/258/375/401`, `workspace/script-executor.ts:66-69` (runtime warning), `validation/workspace-path-validator.ts:112`. Keep only `command`. +- `judge_target` → `grader_target` — `providers/types.ts:373`. Remove alias. +- Legacy grader-provider fields — `graders/llm-grader.ts:95/102` (`resolveGraderProvider`, `graderTargetProvider`), `registry/grader-registry.ts:22/34` (`graderProvider`, `llmGrader`). +- Legacy rubric gating — `types.ts:480/489-490`, `graders/llm-grader.ts:1316-1325` (0-10 `required_min_score`, numeric `required` thresholds). Keep boolean `required` for hard gates and `min_score` (0-1) for custom thresholds. +- `EvalCase` → `EvalTest` alias — `types.ts:1053`. (Ties to `eval_cases`→`tests` removal.) +- Programmatic snake_case `expected_output` aliases in `evaluate.ts:104/127` — keep the YAML/wire field `expected_output` and SDK field `expectedOutput`; remove or clearly isolate only the TypeScript convenience alias if this Group A cleanup reaches the programmatic API surface. +- `@deprecated Use Message` — `providers/types.ts:255`. +- `DEFAULT_SEMANTIC_*`/threshold alias — `graders/scoring.ts:26`. +- `results_by_project` — `validation/config-validator.ts:108` (already a deprecation error; drop the field entirely). +- Deprecated trace artifact paths — `results-repo.ts:3013/3214` (`isDeprecatedTraceArtifactPath`), `trace.ts:951` legacy persistence path. +- Backward-compat home/config alias — `paths.ts:20`. 
+- Unsupported-field rejection messages that no longer need to exist post-major — `targets-validator.ts:299/308` (`api_format`, `log_format`). +- **Provider-kind alias sprawl** — `providers/types.ts:110-121`: `azure-openai`, `google`, `google-gemini`, `codex-cli`, `copilot`, `copilot_sdk`, `pi`, `claude-code`, `cc-mirror`, plus `bedrock`/`vertex` ("legacy/future support", currently dead). Keep one canonical name per provider; drop the rest (or keep a *small* documented set). `bedrock`/`vertex` are dead scaffolding → remove until actually implemented. + +**B. Removed by the restructure itself (don't double-handle — fold into the relevant phase):** +- `assertions`/`composite`/`eval_cases`/`experiment`(top-level)/`tests[].input`/`workspace`/`on_run_complete`/`preprocessors`/`${{ ENV }}`/`budget_usd`(top-level, `config-loader.ts:468`)/scalar `threshold`(`config-loader.ts:502`) — all removed as part of §2/§2.l; the `z.never()` rejection stubs (`runs`/`early_exit`/`policy`/`execution`/`model`/`trials`/`workers`) can drop once the new schema lands. +- The **two conflicting ADR-0013 files** (`0013-stabilize-eval-authoring-contract.md` vs `0013-experiment-is-metadata-expressed-as-tags-experiment.md`) — resolve into the single superseding ADR; don't leave both. +- Schema/runtime drift: `eval-file.schema.ts` `experiment`/`tags` lag the runtime `metadata.ts` tag-map union — the new schema removes the drift. + +**C. Duplicate types to consolidate (judgment, not blind delete):** +- `EvaluationScore` vs `ChildGraderResult` (`graders/types.ts`) are near-identical (`score`/`verdict`/`assertions`/`graderRawRequest`/`scores`/`details`/`tokenUsage`). Consider one recursive type. +- **`timing.json` + `metrics.json` → one `metrics.json`** (see §6.0.1). `timing.json` already holds tokens/cost, overlapping `metrics.json`; merge into sections, drop `timing_path`. Supersedes the ADR-0011/0012 split. 
+- `orchestrator.ts:495` "legacy workspace pooling toggle" (`workspaceMode`) — reconcile with §4's reset-based pool (one pooling model, not two). + +**Sequencing:** land **group A** as its own small, tested, dogfooded cleanup PR *now* (independent of the schema work — pure removal of dead aliases, shrinks the surface the restructure must touch). Handle **group B** inside the restructure phases (they need the new schema first). Do **group C** as targeted refactors with tests. Every deletion needs a green test run + a live dogfood per `.agents/verification.md` before merge. + +--- + +## 11. Quality gate & live dogfood (CargoWise PR-679) + +Layered gate (matches `.agents/verification.md`): + +1. **Deterministic CI = authoritative merge gate (every bead PR):** `agentv validate`, `bun test` (unit + schema-sync), `typecheck`, `lint`, `build`, Validate Evals. No live provider. +2. **Live eval dogfood = blocking before ready-for-review:** the **PR-679 CargoWise scenario** with a **live provider + real LLM grader**, producing canonical `.agentv/results/<run_id>/`, evidence to `agentv-private`. Chosen because it exercises the whole restructure at once: repo materialization, the `agentv:workspace` extension, `llm-rubric` agentic judge, transcript/`tool_name`, `grading.json`, output bundle, targets. This is `av-kfik.16`. +3. **Superset-parity check (recommended):** run PR-679 in **both** promptfoo (the parity example) and AgentV (new format) and diff — proves "snake_cased promptfoo ⊆ AgentV" end-to-end. + +### Workspace acquisition — two resolver adapters (= `av-w5cw` → `av-kfik.14`) +Both materialize a workspace at a pinned `workspace.repos[].commit`; they differ only in source: + +- **Local git-mirror path** (default for dev): clone/checkout from the local mirror (`~/projects/WiseTechGlobal/CargoWise`, ~7.3 GB). Offline, fast. The parity example's `git_cache.mirrors`. Not usable in CI (size + private + no LFS creds).
+- **Snapshot-download adapter** (CI): port `WTG.AI.Prompts/scripts/eval-config/download-release-deps.ts` — `agentv workspace deps <eval-file>` → manifest → download pinned **per-year `.git`-only tarballs** from a release (`snapshot/v1.x.0`), one shared `git clone`+`checkout <commit>` per (repo,commit), symlink each workspace to the shared **read-only** checkout, `GIT_LFS_SKIP_SMUDGE=1`. Reproducible, no full mirror. + +Unify under the **resolver** abstraction (already fronted by `agentv workspace deps`): the eval file is identical; the resolver picks mirror (local) vs snapshot (CI) by config/env. `av-kfik.14` carries this two-adapter design; `av-kfik.16` depends on it. + +### Acquisition performance — pick the cheapest per environment (owner) +Not simply "download vs shallow clone". Ordered fastest-first: + +1. **Local mirror → alternates/reference clone** (dev): `git clone --reference ~/projects/WiseTechGlobal/CargoWise --shared` (or `--local` hardlinks). Near-instant, zero transfer. This is what `git_cache.mirrors` should do — beats both network-clone and download. +2. **Direct partial + sparse + shallow clone** (CI default, no producer): `git clone --filter=blob:none --sparse --depth 1 <url>` → `sparse-checkout set <paths>` → `checkout <commit>`. Blobless (no upfront blobs), sparse (only the tree subset the eval reads — **exploit `workspace.repos[].sparse`**), shallow (no history). For one pinned commit + a sparse subset this transfers *less* than the WTG per-year `.git` tarball and needs no infra. Caveat: arbitrary-sha fetch needs server support — pin to a tag or `--revision <rev>` (git ≥2.49) to be safe. +3. **Snapshot-download adapter** (CI fallback): wins when **amortizing across many commits/evals** (download the year's `.git` once via CDN, local-clone many workspaces from it — `download-release-deps.ts`) or when **sha-fetch/LFS blocks a direct clone**. Costs a producer that publishes snapshots. + +The resolver selects 1→2→3 by environment/config.
Plain full clone of the 7.3 GB mirror is never the answer. + + +### 11.1 Canonical workspace resolver — provenance vs acquisition (SWE-bench + margin + Harbor + lm-eval) +Grounded in AgentV research `repo-provisioning-schema-design.md` (compares SWE-bench, Terminal-bench, Margin, lm-eval-harness). Convergent rule: **the case declares WHAT (identity+pin); the harness resolves WHERE-FROM via a selectable backend. Nobody puts acquisition in the task.** SWE-bench's image registry and Margin's suite registry are *acquisition backends*, not identity variants. + +**Defect to fix (this supersedes the §11 "acquisition performance" framing):** AgentV's repo entry today tangles identity with acquisition. Split them: + +- **Eval declares provenance ONLY:** `repos: [{ path, repo, commit (base_commit alias), sparse?, ancestor? }]`. **Remove** `type` (`local` is dead schema), `resolve`, `clone.depth`, `clone.filter`, and the per-repo `resolver` field (→ §10 cleanup). +- **Acquisition = harness resolver in `$AGENTV_HOME/config.yaml` (NOT the eval), keyed on `repo`**, ordered backends: + 1. **local checkout auto-adopt** — project whose `git remote origin` matches `repo` → `git clone --reference` (origin-match, no override); + 2. **bare mirror clone-cache** (`$AGENTV_DATA_DIR/git-cache/`) via `--reference` (shared objects); + 3. **snapshot artifact** (WTG `download-release-deps` reframed as a backend); + 4. **remote clone** (fallback); + 5. *(future)* **Docker image** — SWE-bench/margin per-instance image; **same identity key, new backend** → this is how AgentV runs SWE-bench natively. +- **`--reference` (mirror cache) is the workhorse:** shallow-speed WITH full history, so deep `base_commit` pins never break — **retires the `--depth`/`--filter`/sparse-shallow debate**. Keep `sparse` (content selection) only. +- **Materialization is declarative harness logic, not a hook.** Hooks (`before_all` etc.) run *after* materialization. 
Resolver config is machine-local, orthogonal to eval and target YAML. Targets carry no repos. + +Net: new acquisition backends (mirror, snapshot, Docker image) plug in without touching the eval schema because all resolve the same `repo`+`commit` pin. This aligns SWE-bench (`base_commit`→`commit`, image registry = backend #5), margin (suite registry = a backend), and Harbor (dataset registry = a backend) under one provenance-only eval contract. + +**Inspect AI (4th confirmation, via deepwiki):** Task declares the sandbox (`Dockerfile` to build, or `image:` to pull); the harness owns acquisition in `task_init` (build/pull **once per unique config**, cached). SWE-bench = pull prebuilt image from a registry unless `build`/`x-local: true`. Two ideas to borrow for AgentV's Docker-image backend: the **`image` / `build` / `x-local`** distinction (pull vs build-locally vs local-only) and **per-unique-config init caching** (the same prebuild-once-reuse-many as SWE-bench layered images and our mirror cache). Confirms: provenance in the task, acquisition harness-owned + amortized. diff --git a/examples/features/document-extraction/README.md b/examples/features/document-extraction/README.md index 42a2adee4..6f69515b7 100644 --- a/examples/features/document-extraction/README.md +++ b/examples/features/document-extraction/README.md @@ -3,14 +3,14 @@ This folder demonstrates two evaluation patterns for document extraction: 1. **`field_accuracy`** (built-in) - Per-test-case scoring with pass/fail per field -2. **`code_grader`** (custom) - TP/TN/FP/FN metrics for cross-document aggregation +2. 
**`code-grader`** (custom) - TP/TN/FP/FN metrics for cross-document aggregation ## When to Use Each Pattern | Pattern | Use Case | Output | |---------|----------|--------| | `field_accuracy` | Simple pass/fail scoring per test case | Score (0-1) per test case | -| `code_grader` with `details.metrics` | Aggregate precision/recall across documents | TP/TN/FP/FN per field | +| `code-grader` with `details.metrics` | Aggregate precision/recall across documents | TP/TN/FP/FN per field | ## Quick Start @@ -53,7 +53,7 @@ graders: ## Pattern 2: Confusion Metrics (`confusion-metrics.eval.yaml`) -Uses a custom `code_grader` that emits `details.metrics` with TP/TN/FP/FN per field: +Uses a custom `code-grader` that emits `details.metrics` with TP/TN/FP/FN per field: ```yaml graders: diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.yaml b/examples/features/document-extraction/evals/field-accuracy.eval.yaml index efb66d77e..b1fb12477 100644 --- a/examples/features/document-extraction/evals/field-accuracy.eval.yaml +++ b/examples/features/document-extraction/evals/field-accuracy.eval.yaml @@ -9,7 +9,7 @@ # - Exact matching for invoice numbers and currency codes # - Date matching with format normalization # - Numeric tolerance for currency amounts -# - Fuzzy matching via code_grader plugins +# - Fuzzy matching via code-grader plugins # - Line item array validation # # EXPECTED SCORES: @@ -47,7 +47,7 @@ assertions: weight: 1.0 # Party information - # Note: For fuzzy matching (OCR variations), use code_grader with fuzzy_match.ts + # Note: For fuzzy matching (OCR variations), use code-grader with fuzzy_match.ts - path: supplier.name match: exact required: true @@ -199,7 +199,7 @@ tests: # ============================================ # Test Case 2: Supplier Name Spacing Variation - # Demonstrates fuzzy matching via code_grader with config pass-through + # Demonstrates fuzzy matching via code-grader with config pass-through # 
============================================ - id: invoice-002 conversation_id: document-extraction @@ -208,7 +208,7 @@ tests: EXPECTED GROUND TRUTH: supplier.name = "Acme Shipping" (normalized) This simulates OCR output that preserves document formatting. - Uses code_grader with config pass-through for multi-field fuzzy matching. + Uses code-grader with config pass-through for multi-field fuzzy matching. assertions: # Multi-field fuzzy match with config pass-through - name: party_names_fuzzy diff --git a/examples/features/document-extraction/graders/fuzzy_match.ts b/examples/features/document-extraction/graders/fuzzy_match.ts index c2ea71474..18a172bd6 100644 --- a/examples/features/document-extraction/graders/fuzzy_match.ts +++ b/examples/features/document-extraction/graders/fuzzy_match.ts @@ -1,8 +1,8 @@ #!/usr/bin/env bun /** - * Fuzzy String Matching code_grader Example + * Fuzzy String Matching code-grader Example * - * This script demonstrates how to implement fuzzy string matching as a code_grader + * This script demonstrates how to implement fuzzy string matching as a code-grader * grader. Use this approach for comparing extracted text that may have OCR errors, * formatting variations, or minor typos. * @@ -10,8 +10,8 @@ * ```yaml * graders: * - name: vendor_name_fuzzy - * type: code_grader - * script: ["bun", "run", "../graders/fuzzy_match.ts"] + * type: code-grader + * command: ["bun", "run", "../graders/fuzzy_match.ts"] * ``` * * The script reads evaluation context from stdin and outputs a JSON result. 
diff --git a/examples/features/document-extraction/graders/header_confusion_metrics.ts b/examples/features/document-extraction/graders/header_confusion_metrics.ts index 70dfa5b79..976f109ae 100644 --- a/examples/features/document-extraction/graders/header_confusion_metrics.ts +++ b/examples/features/document-extraction/graders/header_confusion_metrics.ts @@ -2,7 +2,7 @@ /** * Header Field Confusion Metrics Grader * - * A code_grader that compares header fields and classifies them as TP/TN/FP/FN + * A code-grader that compares header fields and classifies them as TP/TN/FP/FN * based on empty vs non-empty expected/parsed values. * * Classification rules (per attribute): @@ -16,8 +16,8 @@ * ```yaml * graders: * - name: header_confusion - * type: code_grader - * script: ["bun", "run", "../graders/header_confusion_metrics.ts"] + * type: code-grader + * command: ["bun", "run", "../graders/header_confusion_metrics.ts"] * fields: * - path: invoice_number * - path: supplier.name diff --git a/examples/features/document-extraction/graders/line_item_matching.ts b/examples/features/document-extraction/graders/line_item_matching.ts index d25084bb8..a7a066a21 100644 --- a/examples/features/document-extraction/graders/line_item_matching.ts +++ b/examples/features/document-extraction/graders/line_item_matching.ts @@ -2,7 +2,7 @@ /** * Line Item Matching Grader * - * A code_grader that matches expected line items to parsed line items using + * A code-grader that matches expected line items to parsed line items using * greedy matching before scoring. This handles reordered and duplicate items. 
* * Matching strategy (greedy): @@ -15,8 +15,8 @@ * ```yaml * graders: * - name: line_items_matched - * type: code_grader - * script: ["bun", "run", "../graders/line_item_matching.ts"] + * type: code-grader + * command: ["bun", "run", "../graders/line_item_matching.ts"] * match_fields: ["description"] # Fields used for matching * score_fields: ["description", "quantity", "line_total"] # Fields to score * threshold: 0.8 # Similarity threshold for matching diff --git a/examples/features/document-extraction/graders/multi_field_fuzzy.ts b/examples/features/document-extraction/graders/multi_field_fuzzy.ts index 8630508a7..ac40a9d34 100644 --- a/examples/features/document-extraction/graders/multi_field_fuzzy.ts +++ b/examples/features/document-extraction/graders/multi_field_fuzzy.ts @@ -2,15 +2,15 @@ /** * Multi-Field Fuzzy Matcher * - * A configurable code_grader that compares multiple fields using Levenshtein similarity. + * A configurable code-grader that compares multiple fields using Levenshtein similarity. * Configuration is passed via YAML properties that become stdin config. 
* * Usage in dataset.eval.yaml: * ```yaml * graders: * - name: party_names_fuzzy - * type: code_grader - * script: ["bun", "run", "../graders/multi_field_fuzzy.ts"] + * type: code-grader + * command: ["bun", "run", "../graders/multi_field_fuzzy.ts"] * fields: * - path: supplier.name * threshold: 0.85 diff --git a/examples/features/execution-metrics/scripts/check-metrics-present.ts b/examples/features/execution-metrics/scripts/check-metrics-present.ts index 0c10ebbca..39a0920ab 100644 --- a/examples/features/execution-metrics/scripts/check-metrics-present.ts +++ b/examples/features/execution-metrics/scripts/check-metrics-present.ts @@ -9,7 +9,7 @@ * graders: * - name: metrics-present * type: code_grader - * script: ["bun", "run", "../scripts/check-metrics-present.ts"] + * command: ["bun", "run", "../scripts/check-metrics-present.ts"] */ import { defineCodeGrader } from '@agentv/sdk'; diff --git a/examples/features/tool-evaluation-plugins/graders/tool-args-f1.ts b/examples/features/tool-evaluation-plugins/graders/tool-args-f1.ts index fbd0a5971..7de26fd98 100644 --- a/examples/features/tool-evaluation-plugins/graders/tool-args-f1.ts +++ b/examples/features/tool-evaluation-plugins/graders/tool-args-f1.ts @@ -16,7 +16,7 @@ * graders: * - name: tool-args-f1 * type: code_grader - * script: ["bun", "run", "../graders/tool-args-f1.ts"] + * command: ["bun", "run", "../graders/tool-args-f1.ts"] * expected_tools: * - tool: search * args: { query: "weather" } diff --git a/examples/features/tool-evaluation-plugins/graders/tool-call-f1.ts b/examples/features/tool-evaluation-plugins/graders/tool-call-f1.ts index ca57d4b89..8af44853c 100644 --- a/examples/features/tool-evaluation-plugins/graders/tool-call-f1.ts +++ b/examples/features/tool-evaluation-plugins/graders/tool-call-f1.ts @@ -17,7 +17,7 @@ * graders: * - name: tool-f1 * type: code_grader - * script: ["bun", "run", "../graders/tool-call-f1.ts"] + * command: ["bun", "run", "../graders/tool-call-f1.ts"] * expected_tools: 
["search", "fetch"] */ import { type CodeGraderInput, defineCodeGrader } from '@agentv/sdk'; diff --git a/examples/showcase/cross-repo-sync/evals/ground-truth/eval-spec-v2.diff b/examples/showcase/cross-repo-sync/evals/ground-truth/eval-spec-v2.diff index a8888676d..c05ef063a 100644 --- a/examples/showcase/cross-repo-sync/evals/ground-truth/eval-spec-v2.diff +++ b/examples/showcase/cross-repo-sync/evals/ground-truth/eval-spec-v2.diff @@ -203,7 +203,7 @@ index e1a4bc4..22bf3e9 100644 +| Value | Behavior | +|-------|----------| +| `required: true` | Must score >= 0.8 (default threshold) | -+| `required: 0.6` | Must score >= custom threshold (0-1) | ++| `required: true` + `min_score: 0.6` | Must score >= custom threshold (0-1) | ```yaml -rubrics: @@ -215,7 +215,8 @@ index e1a4bc4..22bf3e9 100644 + value: "DENIED" + required: true # Must pass (>= 0.8) + - type: rubrics -+ required: 0.6 # Must score at least 0.6 ++ required: true ++ min_score: 0.6 # Must score at least 0.6 + criteria: + - id: quality + outcome: Response is well-structured diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts index 043b0fc74..d17af700e 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts @@ -16,8 +16,8 @@ * Usage in eval YAML: * graders: * - name: efficiency - * type: code_grader - * script: ["bun", "run", "scripts/efficiency-scorer.ts"] + * type: code-grader + * command: ["bun", "run", "scripts/efficiency-scorer.ts"] */ import { type TraceSummary, defineCodeGrader } from '@agentv/sdk'; diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts index ffd3b20a4..c5f330676 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts +++ 
b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts @@ -14,8 +14,8 @@ * Usage in eval YAML: * graders: * - name: pairwise-compare - * type: code_grader - * script: ["bun", "run", "scripts/pairwise-tool-compare.ts"] + * type: code-grader + * command: ["bun", "run", "scripts/pairwise-tool-compare.ts"] */ import { type Message, defineCodeGrader } from '@agentv/sdk'; diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts index c1a7af96b..930279139 100644 --- a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-grader.ts @@ -14,8 +14,8 @@ * Usage in eval YAML: * graders: * - name: tool-selection - * type: code_grader - * script: ["bun", "run", "scripts/tool-selection-grader.ts"] + * type: code-grader + * command: ["bun", "run", "scripts/tool-selection-grader.ts"] */ import { type Message, defineCodeGrader } from '@agentv/sdk'; diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index ea28b1157..31746872f 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -99,10 +99,8 @@ export interface EvalTestInput { readonly criteria?: string; /** Input to the agent (string or message array). Omit when using turns[]. 
*/ readonly input?: string | readonly { role: string; content: string }[]; - /** Expected reference output (camelCase preferred) */ + /** Expected reference output */ readonly expectedOutput?: string; - /** @deprecated Use `expectedOutput` instead */ - readonly expected_output?: string; /** Assertion graders — accepts factory functions, config objects, or inline functions */ readonly assertions?: readonly AssertEntry[]; /** Arbitrary metadata */ @@ -124,8 +122,6 @@ export interface ConversationTurnInput { readonly input: string | readonly { role: string; content: string }[]; /** Expected reference output for this turn */ readonly expectedOutput?: string; - /** @deprecated Use `expectedOutput` instead */ - readonly expected_output?: string; /** Per-turn assertions (string criteria or grader config) */ readonly assertions?: readonly AssertEntry[]; } @@ -144,13 +140,13 @@ export interface EvalAssertionInput { /** Weight for scoring */ readonly weight?: number; /** Whether this assertion is required to pass */ - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** Prompt file for llm_grader */ readonly prompt?: string; - /** Script for code_grader */ - readonly script?: string | readonly string[]; + /** Command for code_grader */ + readonly command?: string | readonly string[]; /** Additional config passed to the assertion */ readonly config?: Record; /** Nested assertions for composite type */ @@ -487,6 +483,7 @@ function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookCo } const REMOVED_ASSERT_KEY = 'assert'; +const REMOVED_EXPECTED_OUTPUT_KEY = 'expected_output'; function rejectRemovedAssertKey(value: unknown, location: string): void { if ( @@ -498,6 +495,18 @@ function rejectRemovedAssertKey(value: unknown, location: string): void { } } +function rejectRemovedProgrammaticExpectedOutputKey(value: unknown, location: string): void { + if ( + value && + typeof value === 'object' && + Object.prototype.hasOwnProperty.call(value, REMOVED_EXPECTED_OUTPUT_KEY) + ) { + throw new Error( + `${location}: 'expected_output' has been removed. Use 'expectedOutput' instead.`, + ); + } +} + function validateAssertionEntries( entries: readonly AssertEntry[] | undefined, location: string, @@ -559,6 +568,7 @@ function buildInlineEvalTests( .filter((test) => !options.filter || matchesFilter(test.id, options.filter)) .map((test): EvalTest => { rejectRemovedAssertKey(test, `Test '${test.id}'`); + rejectRemovedProgrammaticExpectedOutputKey(test, `Test '${test.id}'`); const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0); if (!isConversation && !test.input) { @@ -573,7 +583,7 @@ function buildInlineEvalTests( ? extractQuestion(test.turns?.[0]?.input ?? '') : extractQuestion(test.input ?? ''); - const expectedOutputValue = test.expectedOutput ?? test.expected_output; + const expectedOutputValue = test.expectedOutput; const expectedOutput = expectedOutputValue ? 
([ { role: 'assistant' as const, content: expectedOutputValue }, @@ -584,7 +594,8 @@ function buildInlineEvalTests( const assertConfigs = convertAssertions(allAssertions, `Test '${test.id}'.assertions`); const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => { rejectRemovedAssertKey(turn, `Test '${test.id}'.turns[]`); - const turnExpected = turn.expectedOutput ?? turn.expected_output; + rejectRemovedProgrammaticExpectedOutputKey(turn, `Test '${test.id}'.turns[]`); + const turnExpected = turn.expectedOutput; return { input: turn.input as ConversationTurn['input'], ...(turnExpected !== undefined && { diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index b5a6c6e06..f40171f03 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -97,8 +97,6 @@ export async function materializeContentForGrader( export interface CodeGraderOptions { readonly command: readonly string[]; - /** @deprecated Use `command` instead */ - readonly script?: readonly string[]; readonly cwd?: string; readonly agentTimeoutMs?: number; /** Pass-through configuration from YAML (any unrecognized properties) */ @@ -117,7 +115,7 @@ export class CodeGrader implements Grader { private readonly target?: TargetAccessConfig; constructor(options: CodeGraderOptions) { - this.command = options.command ?? options.script ?? []; + this.command = options.command; this.cwd = options.cwd; this.agentTimeoutMs = options.agentTimeoutMs; this.config = options.config; @@ -206,10 +204,10 @@ export class CodeGrader implements Grader { let proxyShutdown: (() => Promise) | undefined; let getProxyUsage: (() => TargetProxyUsageMetadata) | undefined; - if (this.target !== undefined && context.judgeProvider) { + if (this.target !== undefined && context.graderProvider) { const maxCalls = this.target.max_calls ?? 
DEFAULT_MAX_CALLS; const proxy = await createTargetProxy({ - defaultProvider: context.judgeProvider, + defaultProvider: context.graderProvider, targetResolver: context.targetResolver, availableTargets: context.availableTargets, maxCalls, diff --git a/packages/core/src/evaluation/graders/index.ts b/packages/core/src/evaluation/graders/index.ts index 8a1a28800..d7a118338 100644 --- a/packages/core/src/evaluation/graders/index.ts +++ b/packages/core/src/evaluation/graders/index.ts @@ -11,7 +11,6 @@ export type { // Scoring utilities export { DEFAULT_THRESHOLD, - PASS_THRESHOLD, clampScore, deepEqual, extractJsonBlob, diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index 234d99613..185960d93 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -285,9 +285,7 @@ function assembleScoreRange( for (const rubric of rubrics) { const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; const minScoreLabel = - rubric.required_min_score !== undefined - ? ` [REQUIRED: min score ${rubric.required_min_score}]` - : ''; + rubric.min_score !== undefined ? ` [REQUIRED: min score ${rubric.min_score}]` : ''; parts.push('', `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`); diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index f5593801a..27633b8ef 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -92,15 +92,11 @@ type GraderProviderResolver = (context: EvaluationContext) => Promise; + this.resolveGraderProvider = options.resolveGraderProvider; this.maxOutputTokens = options.maxOutputTokens; this.temperature = options.temperature; this.graderTemplate = options.graderTemplate; this.maxSteps = Math.min(options.maxSteps ?? 
DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); - this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider; + this.graderTargetProvider = options.graderTargetProvider; } async evaluate(context: EvaluationContext): Promise { @@ -937,11 +932,7 @@ export class LlmGrader implements Grader { for (const rubric of rubrics) { const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; const minScoreLabel = - rubric.min_score !== undefined - ? ` [REQUIRED: min score ${rubric.min_score}]` - : rubric.required_min_score !== undefined - ? ` [REQUIRED: min score ${rubric.required_min_score}]` - : ''; + rubric.min_score !== undefined ? ` [REQUIRED: min score ${rubric.min_score}]` : ''; parts.push('', `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`); @@ -1280,7 +1271,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of * Calculate score from score-range rubric evaluation results. * - Normalizes each criterion score (0-10) to 0-1 by dividing by 10 * - Computes weighted average across criteria - * - Applies required_min_score gating (force fail if below threshold) + * - Applies min_score gating (force fail if below threshold) */ function calculateScoreRangeResult( result: z.infer, @@ -1311,19 +1302,7 @@ function calculateScoreRangeResult( totalWeight += rubric.weight; weightedScoreSum += normalizedScore * rubric.weight; - // Determine required minimum score (as normalized 0-1): - // - If min_score is set (0-1), use directly - // - If required_min_score is set (legacy 0-10), normalize to 0-1 - // - If required is true (legacy), treat as min_score: 1.0 - // - Otherwise, no gating - let minScoreThreshold: number | undefined; - if (rubric.min_score !== undefined) { - minScoreThreshold = rubric.min_score; - } else if (rubric.required_min_score !== undefined) { - minScoreThreshold = rubric.required_min_score / 10; - } else if (rubric.required === true) { - minScoreThreshold = 1.0; // Legacy: required: true 
means must score 10/10 - } + const minScoreThreshold = rubric.min_score; // Find the matching score range description for reporting const matchingRange = rubric.score_ranges?.find( diff --git a/packages/core/src/evaluation/graders/scoring.ts b/packages/core/src/evaluation/graders/scoring.ts index 49f1611cf..9d6c071b4 100644 --- a/packages/core/src/evaluation/graders/scoring.ts +++ b/packages/core/src/evaluation/graders/scoring.ts @@ -23,9 +23,6 @@ import type { EvaluationScore } from './types.js'; /** Default score threshold for pass verdict (0-1). Scores below this are fail. */ export const DEFAULT_THRESHOLD = 0.8; -/** @deprecated Use DEFAULT_THRESHOLD instead. */ -export const PASS_THRESHOLD = DEFAULT_THRESHOLD; - export function scoreToVerdict(score: number, threshold = DEFAULT_THRESHOLD): EvaluationVerdict { return score >= threshold ? 'pass' : 'fail'; } diff --git a/packages/core/src/evaluation/graders/types.ts b/packages/core/src/evaluation/graders/types.ts index e519c667e..d0b9b03fe 100644 --- a/packages/core/src/evaluation/graders/types.ts +++ b/packages/core/src/evaluation/graders/types.ts @@ -31,8 +31,6 @@ export interface EvaluationContext { }; readonly now: Date; readonly graderProvider?: Provider; - /** @deprecated Use `graderProvider` instead */ - readonly judgeProvider?: Provider; readonly graderTemplateOverride?: string; readonly evaluator?: GraderConfig; /** Output messages from agent execution (primary source for tool trajectory) */ diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index 511e6b9f2..c3629288f 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -376,17 +376,15 @@ export function extractTargetsFromSuite(suite: JsonObject): readonly string[] | function parseHookConfig(raw: unknown): WorkspaceHookConfig | undefined { if (!raw || typeof raw !== 'object') return undefined; const obj = raw 
as Record; + if (obj.script !== undefined) { + throw new Error("Workspace hook field 'script' has been removed. Use 'command' instead."); + } - // Accept command as string (shell command) or array let command: readonly string[] | undefined; if (typeof obj.command === 'string') { command = ['sh', '-c', obj.command]; } else if (Array.isArray(obj.command)) { command = obj.command.filter((s): s is string => typeof s === 'string'); - } else if (typeof obj.script === 'string') { - command = ['sh', '-c', obj.script]; - } else if (Array.isArray(obj.script)) { - command = obj.script.filter((s): s is string => typeof s === 'string'); } if (!command || command.length === 0) return undefined; diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 7df185914..a2d74a4ea 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -516,13 +516,12 @@ async function parseGraderList( if (typeValue === 'code-grader') { let command: string[] | undefined; - // Precedence: command > script (deprecated alias) - if (rawEvaluator.script !== undefined && rawEvaluator.command === undefined) { - console.warn( - `${ANSI_YELLOW}Warning: 'script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'command' instead.${ANSI_RESET}`, + if (rawEvaluator.script !== undefined) { + throw new Error( + `Grader '${name}' in '${evalId}': 'script' has been removed. Use 'command' instead.`, ); } - const rawCommand = rawEvaluator.command ?? 
rawEvaluator.script; + const rawCommand = rawEvaluator.command; if (typeof rawCommand === 'string') { const trimmed = rawCommand.trim(); @@ -603,7 +602,6 @@ async function parseGraderList( 'name', 'type', 'command', - 'script', 'cwd', 'weight', 'target', @@ -1578,14 +1576,13 @@ async function parseGraderList( if (isJsonObject(rawPrompt)) { // Executable prompt template: { command: [...], config: {...} } - // Precedence: command > script (deprecated alias) - if (rawPrompt.script !== undefined && rawPrompt.command === undefined) { - console.warn( - `${ANSI_YELLOW}Warning: 'prompt.script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'prompt.command' instead.${ANSI_RESET}`, + if (rawPrompt.script !== undefined) { + throw new Error( + `Grader '${name}' in '${evalId}': 'prompt.script' has been removed. Use 'prompt.command' instead.`, ); } const commandArray = asStringArray( - rawPrompt.command ?? rawPrompt.script, + rawPrompt.command, `prompt.command for evaluator '${name}' in '${evalId}'`, ); @@ -2004,31 +2001,19 @@ function logWarning(message: string, details?: readonly string[]): void { } /** - * Parse a `required` value from raw evaluator config. - * Accepts `true` (uses default 0.8 threshold) or a number in (0, 1] range. - * Returns undefined for falsy/invalid values. - */ -function parseRequired(value: JsonValue | undefined): boolean | number | undefined { - if (value === true) return true; - if (typeof value === 'number' && value > 0 && value <= 1) return value; - return undefined; -} - -/** - * Parse `required` and `min_score` from raw evaluator config, handling deprecated `required: number`. + * Parse `required` and `min_score` from raw evaluator config. 
* * - `required: true` → `{ required: true }` - * - `required: 0.7` (deprecated) → `{ required: true, min_score: 0.7 }` + deprecation warning * - `min_score: 0.7` → `{ min_score: 0.7 }` - * - Explicit `min_score` takes priority over `required: number` + * - Numeric `required` has been removed; use `required: true` + `min_score`. */ function parseRequiredAndMinScore( rawRequired: JsonValue | undefined, rawMinScore: JsonValue | undefined, evaluatorName: string, evalId: string, -): { required?: boolean | number; min_score?: number } { - const result: { required?: boolean | number; min_score?: number } = {}; +): { required?: boolean; min_score?: number } { + const result: { required?: boolean; min_score?: number } = {}; // Parse min_score (explicit field, takes priority) if (typeof rawMinScore === 'number' && rawMinScore > 0 && rawMinScore <= 1) { @@ -2038,15 +2023,9 @@ function parseRequiredAndMinScore( // Parse required if (rawRequired === true) { result.required = true; - } else if (typeof rawRequired === 'number' && rawRequired > 0 && rawRequired <= 1) { - // Deprecated: required: number → required: true + min_score - if (result.min_score === undefined) { - result.min_score = rawRequired; - } - // Keep numeric required for backward compat (orchestrator reads min_score preferentially) - result.required = rawRequired; - logWarning( - `Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. ` + + } else if (typeof rawRequired === 'number') { + throw new Error( + `Grader '${evaluatorName}' in '${evalId}': numeric 'required: ${rawRequired}' has been removed. ` + `Use 'required: true' + 'min_score: ${rawRequired}' instead.`, ); } @@ -2149,13 +2128,17 @@ function parseRubricItems( const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId); const weight = typeof rawRubric.weight === 'number' ? 
rawRubric.weight : 1.0; - // Parse min_score (0-1 scale), required_min_score (deprecated 0-10 scale), and required + if (rawRubric.required_min_score !== undefined) { + throw new Error( + `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score' has been removed. Use 'min_score' (0-1 scale) instead.`, + ); + } + + // Parse min_score (0-1 scale) and checklist required let minScore: number | undefined; - let requiredMinScore: number | undefined; let required: boolean | undefined; if (typeof rawRubric.min_score === 'number') { - // New field: 0-1 scale const ms = rawRubric.min_score as number; if (ms <= 0 || ms > 1) { throw new Error( @@ -2163,22 +2146,6 @@ function parseRubricItems( ); } minScore = ms; - // Compute legacy required_min_score for backward compat with llm-grader internals - requiredMinScore = Math.round(ms * 10); - } else if (typeof rawRubric.required_min_score === 'number') { - // Deprecated: 0-10 integer scale - const rms = rawRubric.required_min_score as number; - if (!Number.isInteger(rms) || rms < 0 || rms > 10) { - throw new Error( - `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`, - ); - } - requiredMinScore = rms; - minScore = rms / 10; - logWarning( - `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. ` + - `Use 'min_score: ${rms / 10}' (0-1 scale) instead.`, - ); } if (typeof rawRubric.required === 'boolean') { @@ -2205,9 +2172,7 @@ function parseRubricItems( weight, ...(expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {}), ...(operator !== undefined ? { operator } : {}), - ...(required !== undefined ? { required } : {}), ...(minScore !== undefined ? { min_score: minScore } : {}), - ...(requiredMinScore !== undefined ? 
{ required_min_score: requiredMinScore } : {}), score_ranges: scoreRanges, }); } else { @@ -2227,7 +2192,6 @@ function parseRubricItems( // Default to required: true if not specified (backward compatibility) required: required ?? true, ...(minScore !== undefined ? { min_score: minScore } : {}), - ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}), }); } } @@ -2405,7 +2369,7 @@ function parseScoreRanges( * Parse inline rubrics field (syntactic sugar at eval case level). * Supports: * - String shorthand: "Must be polite" -> { id: "rubric-1", outcome: "Must be polite", weight: 1.0, required: true } - * - Object form with outcome, weight, required, score_ranges, required_min_score + * - Object form with outcome, weight, required, score_ranges, min_score * * Returns an LlmGraderConfig to prepend to evaluators, or undefined if no valid rubrics. */ @@ -2451,15 +2415,16 @@ export function parseInlineRubrics( weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, }; - // Parse min_score (0-1) or required_min_score (deprecated 0-10) + if (rubric.required_min_score !== undefined) { + throw new Error( + `Inline rubric '${id}': 'required_min_score' has been removed. Use 'min_score' (0-1 scale) instead.`, + ); + } + + // Parse min_score (0-1) let inlineMinScore: number | undefined; - let inlineRequiredMinScore: number | undefined; if (typeof rubric.min_score === 'number') { inlineMinScore = rubric.min_score as number; - inlineRequiredMinScore = Math.round(inlineMinScore * 10); - } else if (typeof rubric.required_min_score === 'number') { - inlineRequiredMinScore = rubric.required_min_score as number; - inlineMinScore = inlineRequiredMinScore / 10; } // For score_ranges rubrics, outcome at rubric level is optional @@ -2467,11 +2432,7 @@ export function parseInlineRubrics( return { ...baseRubric, ...(expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {}), - ...(typeof rubric.required === 'boolean' ? 
{ required: rubric.required } : {}), ...(inlineMinScore !== undefined ? { min_score: inlineMinScore } : {}), - ...(inlineRequiredMinScore !== undefined - ? { required_min_score: inlineRequiredMinScore } - : {}), score_ranges: scoreRanges, }; } @@ -2482,9 +2443,6 @@ export function parseInlineRubrics( outcome: expectedOutcome, required: typeof rubric.required === 'boolean' ? rubric.required : true, ...(inlineMinScore !== undefined ? { min_score: inlineMinScore } : {}), - ...(inlineRequiredMinScore !== undefined - ? { required_min_score: inlineRequiredMinScore } - : {}), }; }) // Filter: must have outcome OR score_ranges diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index dace6158c..93e15cab5 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2799,7 +2799,7 @@ async function runEvaluatorList(options: { readonly name: string; readonly type: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; readonly min_score?: number; }> = []; const scores: GraderResult[] = []; @@ -2939,8 +2939,7 @@ async function runEvaluatorList(options: { const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD; const hasRequiredFailure = scored.some((entry) => { if (!entry.required) return false; - const minScore = - entry.min_score ?? (typeof entry.required === 'number' ? entry.required : effectiveThreshold); + const minScore = entry.min_score ?? 
effectiveThreshold; return entry.score.score < minScore; }); diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 1977a56ac..b4c18e69e 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -209,6 +209,9 @@ export class ClaudeCliProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } const disabled = isClaudeCliLogStreamingDisabled(); if (disabled) { return undefined; diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts index 49959c8c0..2a40a91d7 100644 --- a/packages/core/src/evaluation/providers/claude-sdk.ts +++ b/packages/core/src/evaluation/providers/claude-sdk.ts @@ -234,6 +234,9 @@ export class ClaudeSdkProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } const disabled = isClaudeLogStreamingDisabled(); if (disabled) { return undefined; diff --git a/packages/core/src/evaluation/providers/claude.ts b/packages/core/src/evaluation/providers/claude.ts index d0951fbb4..64e15ed88 100644 --- a/packages/core/src/evaluation/providers/claude.ts +++ b/packages/core/src/evaluation/providers/claude.ts @@ -231,6 +231,9 @@ export class ClaudeProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } const disabled = isClaudeLogStreamingDisabled(); if (disabled) { return undefined; diff --git a/packages/core/src/evaluation/providers/copilot-cli.ts b/packages/core/src/evaluation/providers/copilot-cli.ts index ce47bd600..244e202c1 100644 --- a/packages/core/src/evaluation/providers/copilot-cli.ts +++ 
b/packages/core/src/evaluation/providers/copilot-cli.ts @@ -543,6 +543,9 @@ export class CopilotCliProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } if (isLogStreamingDisabled('AGENTV_COPILOT_CLI_STREAM_LOGS')) { return undefined; } diff --git a/packages/core/src/evaluation/providers/copilot-sdk.ts b/packages/core/src/evaluation/providers/copilot-sdk.ts index 9a8011a02..90353f16a 100644 --- a/packages/core/src/evaluation/providers/copilot-sdk.ts +++ b/packages/core/src/evaluation/providers/copilot-sdk.ts @@ -444,6 +444,9 @@ export class CopilotSdkProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } if (isLogStreamingDisabled('AGENTV_COPILOT_SDK_STREAM_LOGS')) { return undefined; } diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 89de1de05..98ae28744 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -31,7 +31,6 @@ import { VSCodeProvider } from './vscode-provider.js'; export type { EnvLookup, Message, - OutputMessage, Provider, ProviderKind, ProviderRequest, diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts index f1dcf6dba..258afc77c 100644 --- a/packages/core/src/evaluation/providers/pi-cli.ts +++ b/packages/core/src/evaluation/providers/pi-cli.ts @@ -327,6 +327,9 @@ export class PiCliProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } if (this.config.logDir) { return path.resolve(this.config.logDir); } diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts 
b/packages/core/src/evaluation/providers/pi-coding-agent.ts index b45446535..993e22c32 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -567,6 +567,9 @@ export class PiCodingAgentProvider implements Provider { } private resolveLogDirectory(request: ProviderRequest): string | undefined { + if (this.config.streamLog === false) { + return undefined; + } if (this.config.logDir) { return path.resolve(this.config.logDir); } diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 10ee4939e..5705ed8dd 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -1,5 +1,3 @@ -import { existsSync, readFileSync } from 'node:fs'; -import { homedir } from 'node:os'; import path from 'node:path'; import { z } from 'zod'; @@ -112,7 +110,6 @@ export const CliTargetInputSchema = z // Common target fields grader_target: z.string().optional(), - judge_target: z.string().optional(), // backward compat workers: z.number().int().min(1).optional(), provider_batching: z.boolean().optional(), }) @@ -597,8 +594,8 @@ const DEPRECATED_TARGET_CAMEL_CASE_FIELDS = new Map([ ['timeoutSeconds', 'timeout_seconds'], ['logDir', 'log_dir'], ['logDirectory', 'log_directory'], - ['logFormat', 'log_format'], - ['logOutputFormat', 'log_output_format'], + ['logFormat', 'stream_log'], + ['logOutputFormat', 'stream_log'], ['systemPrompt', 'system_prompt'], ['maxTurns', 'max_turns'], ['maxBudgetUsd', 'max_budget_usd'], @@ -696,6 +693,25 @@ function assertNoDeprecatedCamelCaseTargetFields(definition: TargetDefinition): ); } +function assertNoRemovedTargetFields(definition: TargetDefinition): void { + const rawDefinition = definition as unknown as Record; + if (Object.prototype.hasOwnProperty.call(rawDefinition, 'judge_target')) { + throw new Error( + `target "${definition.name}".judge_target: field 
'judge_target' has been removed. Use 'grader_target' instead.`, + ); + } + if (Object.prototype.hasOwnProperty.call(rawDefinition, 'log_format')) { + throw new Error( + `target "${definition.name}".log_format: field 'log_format' has been removed. Use 'stream_log' instead.`, + ); + } + if (Object.prototype.hasOwnProperty.call(rawDefinition, 'log_output_format')) { + throw new Error( + `target "${definition.name}".log_output_format: field 'log_output_format' has been removed. Use 'stream_log' instead.`, + ); + } +} + export function findDeprecatedCamelCaseTargetWarnings( target: unknown, location: string, @@ -810,7 +826,6 @@ const BASE_TARGET_SCHEMA = z provider: z.string().optional(), use_target: z.string().optional(), grader_target: z.string().optional(), - judge_target: z.string().optional(), // backward compat workers: z.number().int().min(1).optional(), subagent_mode_allowed: z.boolean().optional(), fallback_targets: z.array(z.string().min(1)).optional(), @@ -945,6 +960,7 @@ export function resolveTargetDefinition( options?: { readonly emitDeprecationWarnings?: boolean }, ): ResolvedTarget { void options; + assertNoRemovedTargetFields(definition); assertNoDeprecatedCamelCaseTargetFields(definition); const parsed = BASE_TARGET_SCHEMA.parse(definition); @@ -966,7 +982,7 @@ export function resolveTargetDefinition( const fallbackTargets = parsed.fallback_targets; const base = { name: parsed.name, - graderTarget: parsed.grader_target ?? 
parsed.judge_target, + graderTarget: parsed.grader_target, workers: parsed.workers, providerBatching, subagentModeAllowed, @@ -987,7 +1003,6 @@ export function resolveTargetDefinition( config: resolveOpenRouterConfig(parsed, env), }; case 'azure': - case 'azure-openai': return { kind: 'azure', ...base, @@ -1000,28 +1015,23 @@ export function resolveTargetDefinition( config: resolveAnthropicConfig(parsed, env), }; case 'gemini': - case 'google': - case 'google-gemini': return { kind: 'gemini', ...base, config: resolveGeminiConfig(parsed, env), }; case 'codex': - case 'codex-cli': return { kind: 'codex', ...base, config: resolveCodexConfig(parsed, env, evalFilePath), }; case 'copilot-sdk': - case 'copilot_sdk': return { kind: 'copilot-sdk', ...base, config: resolveCopilotSdkConfig(parsed, env, evalFilePath), }; - case 'copilot': case 'copilot-cli': return { kind: 'copilot-cli', @@ -1034,7 +1044,6 @@ export function resolveTargetDefinition( ...base, config: resolveCopilotLogConfig(parsed, env), }; - case 'pi': case 'pi-coding-agent': return { kind: 'pi-coding-agent', @@ -1047,24 +1056,7 @@ export function resolveTargetDefinition( ...base, config: resolvePiCliConfig(parsed, env, evalFilePath), }; - case 'cc-mirror': { - const variantName = - resolveOptionalString(parsed.variant, env, `${parsed.name} cc-mirror variant`, { - allowLiteral: true, - optionalEnv: true, - }) ?? parsed.name; - // If executable is explicitly set, use it; otherwise auto-discover from variant.json - if (!parsed.executable) { - parsed.executable = resolveCcMirrorBinaryPath(variantName); - } - return { - kind: 'claude-cli', - ...base, - config: resolveClaudeConfig(parsed, env, evalFilePath), - }; - } case 'claude': - case 'claude-code': case 'claude-cli': return { kind: 'claude-cli', @@ -1159,7 +1151,7 @@ function resolveAzureConfig( // documented escape hatch instead of silently 400-ing on every call. 
if (target.api_format !== undefined) { throw new Error( - `The 'api_format' field is no longer supported on Azure targets ('${target.name}'). AgentV always uses Azure's Responses API. If your deployment only exposes /chat/completions, use 'provider: openai' with a deployment-scoped 'base_url' instead. See docs/targets/llm-providers for details.`, + `The 'api_format' field has been removed from Azure targets ('${target.name}'). AgentV always uses Azure's Responses API. If your deployment only exposes /chat/completions, use 'provider: openai' with a deployment-scoped 'base_url' instead. See docs/targets/llm-providers for details.`, ); } @@ -1333,11 +1325,6 @@ function resolveCodexConfig( const logDirSource = target.log_dir ?? target.log_directory; const systemPromptSource = target.system_prompt; - if (target.log_format !== undefined || target.log_output_format !== undefined) { - throw new Error( - `${target.name}: log_format is no longer supported for codex targets. Use stream_log instead.`, - ); - } const streamLogResult = resolveStreamLog({ name: target.name, stream_log: target.stream_log }); const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, { @@ -1493,26 +1480,11 @@ function normalizeCodexApprovalPolicy(value: string | undefined): CodexApprovalP ); } -/** - * Resolve the stream_log config field, falling back to log_format with a - * deprecation warning. - * - * Resolution order: - * 1. stream_log (new canonical field) - * 2. log_format / log_output_format (deprecated, mapped to stream_log equivalent) - * 3. environment variable fallback (optional) - * - * Mapping: log_format 'json' → 'raw', log_format 'summary' → 'summary'. - */ -function resolveStreamLog( - target: { stream_log?: unknown; log_format?: unknown; log_output_format?: unknown; name: string }, - envFallback?: unknown, -): { +/** Resolve canonical stream_log config and the legacy logger format it implies. 
*/ +function resolveStreamLog(target: { stream_log?: unknown; name: string }): { streamLog: false | 'raw' | 'summary' | undefined; logFormat: 'summary' | 'json' | undefined; - deprecationWarning?: string; } { - // 1. New stream_log field takes precedence if (target.stream_log !== undefined && target.stream_log !== null) { const val = target.stream_log; if (val === false || val === 'false') { @@ -1527,27 +1499,7 @@ function resolveStreamLog( throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`); } - // 2. Fall back to log_format (deprecated) - const logFormatRaw = target.log_format ?? target.log_output_format ?? envFallback; - if (logFormatRaw === undefined || logFormatRaw === null) { - return { streamLog: undefined, logFormat: undefined }; - } - - if (typeof logFormatRaw !== 'string') { - throw new Error(`${target.name}: log_format must be 'summary' or 'json'`); - } - - const normalized = logFormatRaw.trim().toLowerCase(); - if (normalized !== 'json' && normalized !== 'summary') { - throw new Error(`${target.name}: log_format must be 'summary' or 'json'`); - } - - const streamLogEquivalent = normalized === 'json' ? 'raw' : 'summary'; - return { - streamLog: streamLogEquivalent, - logFormat: normalized as 'json' | 'summary', - deprecationWarning: `${target.name}: 'log_format' is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (log_format: '${normalized}' → stream_log: '${streamLogEquivalent}').`, - }; + return { streamLog: undefined, logFormat: undefined }; } function resolveCopilotSdkConfig( @@ -1563,13 +1515,9 @@ function resolveCopilotSdkConfig( const cwdSource = target.cwd; const timeoutSource = target.timeout_seconds; const logDirSource = target.log_dir ?? 
target.log_directory; - const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; const streamLogResult = resolveStreamLog(target); - if (streamLogResult.deprecationWarning) { - process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); - } const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, { allowLiteral: true, @@ -1615,8 +1563,6 @@ function resolveCopilotSdkConfig( }, ); - const logFormat = normalizeCopilotLogFormat(logFormatSource); - const systemPrompt = typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() @@ -1633,7 +1579,7 @@ function resolveCopilotSdkConfig( cwd, timeoutMs, logDir, - logFormat, + logFormat: streamLogResult.logFormat, streamLog: streamLogResult.streamLog, systemPrompt, ...(customProvider ? { customProvider } : {}), @@ -1730,13 +1676,9 @@ function resolveCopilotCliConfig( const cwdSource = target.cwd; const timeoutSource = target.timeout_seconds; const logDirSource = target.log_dir ?? target.log_directory; - const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; const streamLogResult = resolveStreamLog(target); - if (streamLogResult.deprecationWarning) { - process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); - } const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, { @@ -1768,8 +1710,6 @@ function resolveCopilotCliConfig( }, ); - const logFormat = normalizeCopilotLogFormat(logFormatSource); - const systemPrompt = typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() @@ -1783,21 +1723,13 @@ function resolveCopilotCliConfig( cwd, timeoutMs, logDir, - logFormat, + logFormat: streamLogResult.logFormat, streamLog: streamLogResult.streamLog, systemPrompt, ...(customProvider ? 
{ customProvider } : {}), }; } -function normalizeCopilotLogFormat(value: unknown): 'summary' | 'json' | undefined { - if (value === undefined || value === null) return undefined; - if (typeof value !== 'string') throw new Error("copilot log format must be 'summary' or 'json'"); - const normalized = value.trim().toLowerCase(); - if (normalized === 'json' || normalized === 'summary') return normalized; - throw new Error("copilot log format must be 'summary' or 'json'"); -} - function resolvePiCodingAgentConfig( target: z.infer, env: EnvLookup, @@ -1811,13 +1743,9 @@ function resolvePiCodingAgentConfig( const cwdSource = target.cwd; const timeoutSource = target.timeout_seconds; const logDirSource = target.log_dir ?? target.log_directory; - const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; const streamLogResult = resolveStreamLog(target); - if (streamLogResult.deprecationWarning) { - process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); - } const subprovider = resolveOptionalString( subproviderSource, @@ -1867,9 +1795,6 @@ function resolvePiCodingAgentConfig( optionalEnv: true, }); - const logFormat = - logFormatSource === 'json' || logFormatSource === 'summary' ? logFormatSource : undefined; - const systemPrompt = typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() @@ -1885,7 +1810,7 @@ function resolvePiCodingAgentConfig( cwd, timeoutMs, logDir, - logFormat, + logFormat: streamLogResult.logFormat, streamLog: streamLogResult.streamLog, systemPrompt, }; @@ -1905,13 +1830,9 @@ function resolvePiCliConfig( const cwdSource = target.cwd; const timeoutSource = target.timeout_seconds; const logDirSource = target.log_dir ?? 
target.log_directory; - const logFormatSource = target.log_format; const systemPromptSource = target.system_prompt; const streamLogResult = resolveStreamLog(target); - if (streamLogResult.deprecationWarning) { - process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); - } const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, { @@ -1968,9 +1889,6 @@ function resolvePiCliConfig( optionalEnv: true, }); - const logFormat = - logFormatSource === 'json' || logFormatSource === 'summary' ? logFormatSource : undefined; - const systemPrompt = typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() @@ -1988,7 +1906,7 @@ function resolvePiCliConfig( cwd, timeoutMs, logDir, - logFormat, + logFormat: streamLogResult.logFormat, streamLog: streamLogResult.streamLog, systemPrompt, }; @@ -2018,14 +1936,9 @@ function resolveClaudeConfig( const cwdSource = target.cwd; const timeoutSource = target.timeout_seconds; const logDirSource = target.log_dir ?? target.log_directory; - const logFormatSource = - target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT; const systemPromptSource = target.system_prompt; const streamLogResult = resolveStreamLog(target); - if (streamLogResult.deprecationWarning) { - process.stderr.write(`[agentv] ⚠ ${streamLogResult.deprecationWarning}\n`); - } const executable = resolveOptionalString(executableSource, env, `${target.name} claude-cli executable`, { @@ -2050,8 +1963,6 @@ function resolveClaudeConfig( optionalEnv: true, }); - const logFormat = normalizeClaudeLogFormat(logFormatSource); - const systemPrompt = typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 ? 
systemPromptSource.trim() @@ -2076,53 +1987,12 @@ function resolveClaudeConfig( maxTurns, maxBudgetUsd, logDir, - logFormat, + logFormat: streamLogResult.logFormat, streamLog: streamLogResult.streamLog, bypassPermissions, }; } -/** - * Resolve the binary path for a cc-mirror variant. - * Reads ~/.cc-mirror//variant.json → binaryPath. - */ -function resolveCcMirrorBinaryPath(variant: string): string { - const variantJsonPath = path.join(homedir(), '.cc-mirror', variant, 'variant.json'); - if (!existsSync(variantJsonPath)) { - throw new Error( - `cc-mirror variant "${variant}": ${variantJsonPath} not found. Install the variant or set "executable" explicitly.`, - ); - } - let parsed: { binaryPath?: string }; - try { - parsed = JSON.parse(readFileSync(variantJsonPath, 'utf8')); - } catch (e) { - throw new Error( - `cc-mirror variant "${variant}": failed to parse ${variantJsonPath}: ${(e as Error).message}`, - ); - } - if (typeof parsed.binaryPath !== 'string' || parsed.binaryPath.trim().length === 0) { - throw new Error( - `cc-mirror variant "${variant}": ${variantJsonPath} missing "binaryPath" field`, - ); - } - return parsed.binaryPath; -} - -function normalizeClaudeLogFormat(value: unknown): 'summary' | 'json' | undefined { - if (value === undefined || value === null) { - return undefined; - } - if (typeof value !== 'string') { - throw new Error("claude log format must be 'summary' or 'json'"); - } - const normalized = value.trim().toLowerCase(); - if (normalized === 'json' || normalized === 'summary') { - return normalized; - } - throw new Error("claude log format must be 'summary' or 'json'"); -} - function resolveMockConfig(target: z.infer): MockResolvedConfig { const response = typeof target.response === 'string' ? 
target.response : undefined; return { response }; diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 2cf479d65..7a7947be5 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -102,25 +102,6 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'replay', ] as const; -/** - * Provider aliases that are accepted in target definitions. - * These map to the canonical ProviderKind values. - */ -export const PROVIDER_ALIASES: readonly string[] = [ - 'azure-openai', // alias for "azure" - 'google', // alias for "gemini" - 'google-gemini', // alias for "gemini" - 'codex-cli', // alias for "codex" - 'copilot', // alias for "copilot-cli" (default copilot experience) - 'copilot_sdk', // alias for "copilot-sdk" (underscore variant) - - 'pi', // alias for "pi-coding-agent" - 'claude-code', // alias for "claude" (legacy) - 'cc-mirror', // alias for "claude-cli" (auto-discovers binary from ~/.cc-mirror//) - 'bedrock', // legacy/future support - 'vertex', // legacy/future support -] as const; - /** * Schema identifier for targets.yaml files (version 2). */ @@ -252,9 +233,6 @@ export interface Message { readonly tokenUsage?: ProviderTokenUsage; } -/** @deprecated Use Message instead */ -export type OutputMessage = Message; - /** * Token usage metrics reported by provider. */ @@ -370,8 +348,6 @@ export interface TargetDefinition { // Supports ${{ ENV_VAR }} syntax (e.g., use_target: ${{ AGENT_TARGET }}). 
readonly use_target?: string | unknown | undefined; readonly grader_target?: string | undefined; - /** @deprecated Use `grader_target` instead */ - readonly judge_target?: string | undefined; readonly workers?: number | undefined; // Provider batching readonly provider_batching?: boolean | undefined; @@ -409,9 +385,7 @@ export interface TargetDefinition { readonly timeout_seconds?: number | unknown | undefined; readonly log_dir?: string | unknown | undefined; readonly log_directory?: string | unknown | undefined; - readonly log_format?: string | unknown | undefined; - readonly log_output_format?: string | unknown | undefined; - /** New stream_log field — replaces log_format. false=no stream log, 'raw'=per-event, 'summary'=consolidated. */ + /** false=no stream log, 'raw'=per-event, 'summary'=consolidated. */ readonly stream_log?: string | boolean | unknown | undefined; // System prompt (codex, copilot, claude, pi-coding-agent) readonly system_prompt?: string | unknown | undefined; diff --git a/packages/core/src/evaluation/registry/builtin-graders.ts b/packages/core/src/evaluation/registry/builtin-graders.ts index 4133f9d44..d3dbcf873 100644 --- a/packages/core/src/evaluation/registry/builtin-graders.ts +++ b/packages/core/src/evaluation/registry/builtin-graders.ts @@ -80,7 +80,7 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); */ export const llmGraderFactory: GraderFactoryFn = (config, context) => { const c = config as LlmGraderConfig; - const { llmGrader, graderProvider, judgeProvider, targetResolver, agentTimeoutMs } = context; + const { llmGrader, graderProvider, targetResolver, agentTimeoutMs } = context; let evaluator = llmGrader; if (c.target) { @@ -102,7 +102,7 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => { resolveGraderProvider: async (evalContext) => { if (graderTargetProvider) return graderTargetProvider; if (evalContext.graderProvider) return evalContext.graderProvider; - return graderProvider ?? 
judgeProvider; + return graderProvider; }, maxSteps: c.max_steps, temperature: c.temperature, @@ -169,7 +169,7 @@ export const llmGraderFactory: GraderFactoryFn = (config, context) => { export const codeFactory: GraderFactoryFn = (config, context) => { const c = config as CodeGraderConfig; return new CodeGrader({ - command: c.command ?? c.script ?? [], + command: c.command, cwd: c.resolvedCwd ?? c.cwd, agentTimeoutMs: context.agentTimeoutMs, config: c.config, diff --git a/packages/core/src/evaluation/registry/grader-registry.ts b/packages/core/src/evaluation/registry/grader-registry.ts index 93ff7794e..84eb739ed 100644 --- a/packages/core/src/evaluation/registry/grader-registry.ts +++ b/packages/core/src/evaluation/registry/grader-registry.ts @@ -19,8 +19,6 @@ import type { GraderConfig } from '../types.js'; export interface GraderDispatchContext { /** Shared LLM grader provider (resolved at suite level) */ readonly graderProvider?: Provider; - /** @deprecated Use `graderProvider` instead */ - readonly judgeProvider?: Provider; /** Function to resolve target names to providers */ readonly targetResolver?: TargetResolver; /** Available target names for code graders */ @@ -31,8 +29,6 @@ export interface GraderDispatchContext { readonly evalFileDir?: string; /** Shared LLM grader evaluator instance */ readonly llmGrader: Grader; - /** @deprecated Use `llmGrader` instead */ - readonly llmJudge?: Grader; /** Reference to the registry itself (for composite evaluators that need to create children) */ readonly registry: GraderRegistry; } diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 25053073c..48c3c1120 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -908,7 +908,7 @@ export interface ToolTrajectoryGraderConfig { readonly expected?: readonly ToolTrajectoryExpectedItem[]; /** Optional weight for top-level aggregation (defaults to 1.0) */ readonly weight?: number; - readonly 
required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index a23acd308..1964d1336 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -215,8 +215,6 @@ export type TargetAccessConfig = { export type WorkspaceScriptConfig = { /** Command array to execute (e.g., ["bun", "run", "setup.ts"]) */ readonly command: readonly string[]; - /** @deprecated Use `command` instead */ - readonly script?: readonly string[]; /** Optional timeout in milliseconds (default: 60000 for setup, 30000 for teardown) */ readonly timeout_ms?: number; readonly timeoutMs?: number; @@ -248,15 +246,11 @@ export type RepoConfig = { readonly ancestor?: number; /** Optional sparse-checkout paths. */ readonly sparse?: readonly string[]; - /** Optional project-configured repo resolver name. */ - readonly resolver?: string; }; export type WorkspaceHookConfig = { /** Optional command array to execute (e.g., ["bun", "run", "setup.ts"]) */ readonly command?: readonly string[]; - /** @deprecated Use `command` instead */ - readonly script?: readonly string[]; /** Optional timeout in milliseconds */ readonly timeout_ms?: number; readonly timeoutMs?: number; @@ -372,13 +366,11 @@ export type CodeGraderConfig = { readonly name: string; readonly type: 'code-grader'; readonly command: readonly string[]; - /** @deprecated Use `command` instead */ - readonly script?: readonly string[]; readonly resolvedScriptPath?: string; readonly cwd?: string; readonly resolvedCwd?: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -398,8 +390,6 @@ export type CodeGraderConfig = { export type PromptScriptConfig = { /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */ readonly command: readonly string[]; - /** @deprecated Use `command` instead */ - readonly script?: readonly string[]; /** Pass-through configuration for the prompt template */ readonly config?: Record; }; @@ -425,7 +415,7 @@ export type LlmGraderConfig = { readonly resolvedPromptScript?: readonly string[]; readonly rubrics?: readonly RubricItem[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -477,7 +467,7 @@ export type RubricItem = { readonly operator?: RubricOperator; readonly weight: number; /** - * Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics). + * Binary checklist gating. Score-range rubrics use min_score instead. */ readonly required?: boolean; /** @@ -485,11 +475,6 @@ export type RubricItem = { * Internally compared against normalized score (rawScore / 10). */ readonly min_score?: number; - /** - * @deprecated Use min_score (0-1 scale) instead. - * Legacy: minimum score on 0-10 integer scale. - */ - readonly required_min_score?: number; /** * Score range definitions for analytic rubric scoring. * When present, the grader outputs an integer 0-10 score per criterion. @@ -515,7 +500,7 @@ export type CompositeGraderConfig = { readonly assertions: readonly GraderConfig[]; readonly aggregator: CompositeAggregatorConfig; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -565,7 +550,7 @@ export type FieldAccuracyGraderConfig = { /** Strategy for combining field scores (default: weighted_average) */ readonly aggregation?: FieldAggregationType; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -582,7 +567,7 @@ export type LatencyGraderConfig = { /** Maximum allowed duration in milliseconds */ readonly threshold: number; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -599,7 +584,7 @@ export type CostGraderConfig = { /** Maximum allowed cost in USD */ readonly budget: number; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -620,7 +605,7 @@ export type TokenUsageGraderConfig = { /** Maximum allowed output tokens (completion) */ readonly max_output?: number; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -650,7 +635,7 @@ export type ExecutionMetricsGraderConfig = { /** Tolerance for exploration ratio check (default: 0.2) */ readonly exploration_tolerance?: number; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -666,7 +651,7 @@ export type ContainsGraderConfig = { readonly type: 'contains'; readonly value: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -682,7 +667,7 @@ export type ContainsAnyGraderConfig = { readonly type: 'contains-any'; readonly value: readonly string[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -698,7 +683,7 @@ export type ContainsAllGraderConfig = { readonly type: 'contains-all'; readonly value: readonly string[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -714,7 +699,7 @@ export type IcontainsGraderConfig = { readonly type: 'icontains'; readonly value: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -730,7 +715,7 @@ export type IcontainsAnyGraderConfig = { readonly type: 'icontains-any'; readonly value: readonly string[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -746,7 +731,7 @@ export type IcontainsAllGraderConfig = { readonly type: 'icontains-all'; readonly value: readonly string[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -762,7 +747,7 @@ export type StartsWithGraderConfig = { readonly type: 'starts-with'; readonly value: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -778,7 +763,7 @@ export type EndsWithGraderConfig = { readonly type: 'ends-with'; readonly value: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -796,7 +781,7 @@ export type RegexGraderConfig = { /** Optional regex flags (e.g., "i" for case-insensitive, "m" for multiline) */ readonly flags?: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -811,7 +796,7 @@ export type IsJsonGraderConfig = { readonly name: string; readonly type: 'is-json'; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -827,7 +812,7 @@ export type EqualsGraderConfig = { readonly type: 'equals'; readonly value: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -843,7 +828,7 @@ export type RubricsEvaluatorConfig = { readonly type: 'rubrics'; readonly criteria: readonly RubricItem[]; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ @@ -864,7 +849,7 @@ export type SkillTriggerGraderConfig = { /** Whether the skill is expected to trigger (default: true) */ readonly should_trigger?: boolean; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; readonly negate?: boolean; @@ -878,7 +863,7 @@ export type InlineAssertEvaluatorConfig = { readonly name: string; readonly type: 'inline-assert'; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ readonly min_score?: number; readonly negate?: boolean; @@ -935,7 +920,7 @@ export interface EvalGraderSource { readonly name: string; readonly type: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; readonly minScore?: number; readonly definition: JsonObject; } @@ -1050,9 +1035,6 @@ export interface DependencyResult { readonly status: 'passed' | 'failed' | 'error'; } -/** @deprecated Use `EvalTest` instead */ -export type EvalCase = EvalTest; - /** * Supported repeat aggregation strategies. 
*/ diff --git a/packages/core/src/evaluation/validation/config-validator.ts b/packages/core/src/evaluation/validation/config-validator.ts index ba00ccb33..7e7d23050 100644 --- a/packages/core/src/evaluation/validation/config-validator.ts +++ b/packages/core/src/evaluation/validation/config-validator.ts @@ -99,16 +99,6 @@ export async function validateConfigFile( } } - if (config.results_by_project !== undefined) { - errors.push({ - severity: 'warning', - filePath, - location: 'results_by_project', - message: - "Field 'results_by_project' is deprecated. Put per-project result repo settings under projects[].results in $AGENTV_HOME/config.yaml.", - }); - } - const allowedFields = new Set([ '$schema', 'eval_patterns', @@ -117,7 +107,6 @@ export async function validateConfigFile( 'results', 'repo_resolvers', 'projects', - 'results_by_project', 'dashboard', 'studio', ]); diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 2528f1f50..a289a3f2a 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -13,6 +13,7 @@ import { z } from 'zod'; // --------------------------------------------------------------------------- const JsonObjectSchema = z.object({}).catchall(z.unknown()); +const JsonRecordSchema = z.record(z.unknown()); /** Message content: string, structured object, or structured array */ const ContentItemSchema = z.object({ @@ -47,23 +48,36 @@ const ExpectedOutputSchema = z.union([z.string(), JsonObjectSchema, z.array(Mess /** Common fields shared by all evaluators */ const EvaluatorCommonSchema = z.object({ name: z.string().optional(), + metric: z.string().optional(), weight: z.number().min(0).optional(), - required: z.union([z.boolean(), z.number().gt(0).lte(1)]).optional(), + required: z.boolean().optional(), /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ min_score: z.number().gt(0).lte(1).optional(), negate: z.boolean().optional(), }); -/** Prompt: string (inline/file path) or executable script config */ +/** Prompt: string (inline/file path), promptfoo-shaped object, or executable script config */ const PromptSchema = z.union([ z.string(), z.object({ - command: z.union([z.string(), z.array(z.string())]).optional(), - script: z.union([z.string(), z.array(z.string())]).optional(), + command: z.union([z.string(), z.array(z.string())]), config: z.record(z.unknown()).optional(), }), + z + .object({ + id: z.string().optional(), + label: z.string().optional(), + raw: z.string().optional(), + path: z.string().optional(), + prefix: z.string().optional(), + suffix: z.string().optional(), + config: JsonRecordSchema.optional(), + }) + .passthrough(), ]); +const PromptsSchema = z.union([PromptSchema, z.array(PromptSchema).min(1)]); + const PreprocessorSchema = z.object({ type: z.string().min(1), command: z.union([z.string(), z.array(z.string())]), @@ -94,7 +108,6 @@ const RubricCriterionSchema = z.union([z.string().min(1), RubricItemSchema]); const CodeGraderSchema = EvaluatorCommonSchema.extend({ type: z.enum(['code-grader', 'code_grader']), command: z.union([z.string(), z.array(z.string())]), - script: z.union([z.string(), z.array(z.string())]).optional(), cwd: z.string().optional(), target: z.union([z.boolean(), z.object({ max_calls: z.number().optional() })]).optional(), config: z.record(z.unknown()).optional(), @@ -242,10 +255,46 @@ const RubricsSchema = EvaluatorCommonSchema.extend({ criteria: z.array(RubricCriterionSchema).min(1), }); +const PromptfooAssertionSchema = EvaluatorCommonSchema.extend({ + type: z.enum([ + 'assert-set', + 'g-eval', + 'llm-rubric', + 'javascript', + 'python', + 'webhook', + 'similar', + 'select-best', + 'human', + 'contains', + 'contains-any', + 'contains-all', + 'icontains', + 'icontains-any', + 'icontains-all', + 'starts-with', + 'ends-with', + 'regex', + 'is-json', + 'equals', + ]), + 
value: z.unknown().optional(), + threshold: z.number().min(0).max(1).optional(), + criteria: z.union([z.string(), z.array(RubricCriterionSchema)]).optional(), + rubrics: z.array(RubricItemSchema).optional(), + score_ranges: z.array(ScoreRangeSchema).optional(), + provider: z.union([z.string(), JsonObjectSchema]).optional(), + config: JsonRecordSchema.optional(), + assert: z.array(z.union([z.string(), JsonObjectSchema])).optional(), + assertions: z.array(z.union([z.string(), JsonObjectSchema])).optional(), + transform: z.union([z.string(), JsonObjectSchema]).optional(), +}).passthrough(); + /** Union of all grader types */ const EvaluatorSchema = z.union([ CodeGraderSchema, LlmGraderSchema, + PromptfooAssertionSchema, IncludeSchema, CompositeSchema, ToolTrajectorySchema, @@ -261,16 +310,20 @@ const EvaluatorSchema = z.union([ RubricsSchema, ]); +/** Assertion item: string shorthand (becomes a criteria/rubric grader) or full evaluator config. */ +const AssertionItemSchema = z.union([z.string(), JsonObjectSchema]); + // --------------------------------------------------------------------------- // Workspace // --------------------------------------------------------------------------- -const WorkspaceScriptSchema = z.object({ - command: z.union([z.string(), z.array(z.string())]).optional(), - script: z.union([z.string(), z.array(z.string())]).optional(), - timeout_ms: z.number().min(0).optional(), - cwd: z.string().optional(), -}); +const WorkspaceScriptSchema = z + .object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + timeout_ms: z.number().min(0).optional(), + cwd: z.string().optional(), + }) + .strict(); // --------------------------------------------------------------------------- // Repo lifecycle @@ -284,29 +337,31 @@ const RepoSchema = z base_commit: z.string().min(1).optional(), ancestor: z.number().int().min(0).optional(), sparse: z.array(z.string()).optional(), - resolver: z.string().min(1).optional(), }) .strict() .refine((repo) => 
!repo.commit || !repo.base_commit || repo.commit === repo.base_commit, { message: 'commit and base_commit must match when both are set', }); -const WorkspaceHookSchema = z.object({ - command: z.union([z.string(), z.array(z.string())]).optional(), - script: z.union([z.string(), z.array(z.string())]).optional(), - timeout_ms: z.number().optional(), - timeoutMs: z.number().optional(), - cwd: z.string().optional(), - reset: z.enum(['none', 'fast', 'strict']).optional(), -}); +const WorkspaceHookSchema = z + .object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + timeout_ms: z.number().optional(), + timeoutMs: z.number().optional(), + cwd: z.string().optional(), + reset: z.enum(['none', 'fast', 'strict']).optional(), + }) + .strict(); -const WorkspaceHooksSchema = z.object({ - enabled: z.boolean().optional(), - before_all: WorkspaceHookSchema.optional(), - before_each: WorkspaceHookSchema.optional(), - after_each: WorkspaceHookSchema.optional(), - after_all: WorkspaceHookSchema.optional(), -}); +const WorkspaceHooksSchema = z + .object({ + enabled: z.boolean().optional(), + before_all: WorkspaceHookSchema.optional(), + before_each: WorkspaceHookSchema.optional(), + after_each: WorkspaceHookSchema.optional(), + after_all: WorkspaceHookSchema.optional(), + }) + .strict(); const DockerWorkspaceSchema = z.object({ image: z.string(), @@ -337,31 +392,45 @@ const WorkspaceSchema = z // Target hooks (eval-level per-target customization) // --------------------------------------------------------------------------- -const TargetHooksSchema = z.object({ - before_all: WorkspaceHookSchema.optional(), - before_each: WorkspaceHookSchema.optional(), - after_each: WorkspaceHookSchema.optional(), - after_all: WorkspaceHookSchema.optional(), -}); +const TargetHooksSchema = z + .object({ + before_all: WorkspaceHookSchema.optional(), + before_each: WorkspaceHookSchema.optional(), + after_each: WorkspaceHookSchema.optional(), + after_all: 
WorkspaceHookSchema.optional(), + }) + .strict(); /** Eval target reference: string shorthand or object with hooks */ -const EvalTargetRefSchema = z.object({ - name: z.string().min(1), - use_target: z.string().optional(), - hooks: TargetHooksSchema.optional(), -}); +const EvalTargetRefSchema = z + .object({ + name: z.string().min(1), + use_target: z.string().optional(), + hooks: TargetHooksSchema.optional(), + }) + .strict(); const EvalLocalTargetSchema = z .object({ + id: z.string().min(1).optional(), + label: z.string().min(1).optional(), extends: z.string().min(1).optional(), name: z.string().min(1).optional(), provider: z.string().min(1).optional(), model: z.string().min(1).optional(), + config: JsonRecordSchema.optional(), + prompts: PromptsSchema.optional(), + transform: z.union([z.string(), JsonObjectSchema]).optional(), + delay: z.number().min(0).optional(), + env: z.record(z.string()).optional(), reasoning_effort: z.string().min(1).optional(), hooks: TargetHooksSchema.optional(), }) .passthrough(); +const EvalTargetSchema = z.union([z.string().min(1), EvalLocalTargetSchema]); +const EvalTargetsSchema = z.union([EvalTargetSchema, z.array(EvalTargetSchema).min(1)]); + // --------------------------------------------------------------------------- // Execution block // --------------------------------------------------------------------------- @@ -373,7 +442,7 @@ const ExecutionSchema = z.object({ target: z.string().optional(), targets: z.array(z.union([z.string(), EvalTargetRefSchema])).optional(), workers: z.never().optional(), - assertions: z.array(EvaluatorSchema).optional(), + assertions: z.array(AssertionItemSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), skip_defaults: z.boolean().optional(), cache: z.boolean().optional(), @@ -407,7 +476,18 @@ const RunOverrideSchema = z const DefaultTestSchema = z .object({ + vars: JsonObjectSchema.optional(), + provider: EvalTargetSchema.optional(), + providers: EvalTargetsSchema.optional(), + 
prompts: PromptsSchema.optional(), + provider_output: ExpectedOutputSchema.optional(), + expected_output: ExpectedOutputSchema.optional(), + assert: z.array(AssertionItemSchema).optional(), + assertions: z.array(AssertionItemSchema).optional(), + assert_scoring_function: z.union([z.string().min(1), JsonObjectSchema]).optional(), + options: JsonObjectSchema.optional(), threshold: z.number().min(0).max(1).optional(), + metadata: z.record(z.unknown()).optional(), }) .strict(); @@ -415,17 +495,22 @@ const EvaluateOptionsSchema = z .object({ budget_usd: z.number().gt(0).optional(), max_concurrency: z.number().int().min(1).max(50).optional(), + cache: z.union([z.boolean(), JsonObjectSchema]).optional(), + delay: z.number().min(0).optional(), + generate_suggestions: z.boolean().optional(), + repeat: z.union([z.number().int().min(1), ExperimentRepeatSchema]).optional(), + timeout_ms: z.number().gt(0).optional(), + max_eval_time_ms: z.number().gt(0).optional(), + filter_range: z.union([z.tuple([z.number(), z.number()]), z.string()]).optional(), }) .strict(); -/** Per-turn assertion: string shorthand (becomes rubric) or full evaluator config */ -const TurnAssertionSchema = z.union([z.string(), EvaluatorSchema]); - /** A single turn in a multi-turn conversation */ const ConversationTurnSchema = z.object({ input: z.union([z.string(), MessageContentSchema]), expected_output: z.union([z.string(), MessageContentSchema]).optional(), - assertions: z.array(TurnAssertionSchema).optional(), + assert: z.array(AssertionItemSchema).optional(), + assertions: z.array(AssertionItemSchema).optional(), }); // --------------------------------------------------------------------------- @@ -435,13 +520,22 @@ const ConversationTurnSchema = z.object({ const TestExecutionSchema = ExecutionSchema.omit({ target: true, targets: true }).strict(); const EvalTestSchema = z.object({ - id: z.string().min(1), + id: z.string().min(1).optional(), + description: z.string().optional(), vars: 
JsonObjectSchema.optional(), criteria: z.string().optional(), + provider: EvalTargetSchema.optional(), + providers: EvalTargetsSchema.optional(), + prompts: PromptsSchema.optional(), + provider_output: ExpectedOutputSchema.optional(), input: InputSchema.optional(), input_files: z.array(z.string()).optional(), expected_output: ExpectedOutputSchema.optional(), - assertions: z.array(EvaluatorSchema).optional(), + assert: z.array(AssertionItemSchema).optional(), + assertions: z.array(AssertionItemSchema).optional(), + assert_scoring_function: z.union([z.string().min(1), JsonObjectSchema]).optional(), + options: JsonObjectSchema.optional(), + threshold: z.number().min(0).max(1).optional(), evaluators: z.array(EvaluatorSchema).optional(), execution: TestExecutionSchema.optional(), run: RunOverrideSchema.optional(), @@ -508,11 +602,46 @@ const TestsSchema = z.union([ z.string().min(1), ]); +const ScenarioConfigSchema = z + .object({ + vars: JsonObjectSchema.optional(), + provider: EvalTargetSchema.optional(), + providers: EvalTargetsSchema.optional(), + prompts: PromptsSchema.optional(), + provider_output: ExpectedOutputSchema.optional(), + assert: z.array(AssertionItemSchema).optional(), + assertions: z.array(AssertionItemSchema).optional(), + options: JsonObjectSchema.optional(), + threshold: z.number().min(0).max(1).optional(), + metadata: z.record(z.unknown()).optional(), + }) + .passthrough(); + +const ScenarioSchema = z + .object({ + description: z.string().optional(), + config: z.array(ScenarioConfigSchema).optional(), + tests: z.array(EvalTestSchema).optional(), + }) + .strict(); + +const DerivedMetricSchema = z + .object({ + name: z.string().min(1), + value: z.union([z.string().min(1), JsonObjectSchema]), + }) + .strict(); + +const TagsSchema = z.union([ + z.array(z.string()), + z.record(z.union([z.string(), z.number(), z.boolean()])), +]); + // --------------------------------------------------------------------------- // Top-level eval file // 
--------------------------------------------------------------------------- -export const EvalFileSchema = z +export const EvalFileSchema: z.ZodType = z .object({ $schema: z.string().optional(), // Metadata @@ -524,11 +653,12 @@ export const EvalFileSchema = z category: z.string().optional(), version: z.string().optional(), author: z.string().optional(), - tags: z.array(z.string()).optional(), + tags: TagsSchema.optional(), license: z.string().optional(), requires: z.object({ agentv: z.string().optional() }).optional(), // Suite-level input input: InputSchema.optional(), + prompts: PromptsSchema.optional(), // Suite-level input_files shorthand input_files: z.array(z.string()).optional(), // Imports: suites preserve child context; tests import raw rows into parent context @@ -539,6 +669,7 @@ export const EvalFileSchema = z eval_cases: TestsSchema.optional(), // Target target: z.union([z.string().min(1), EvalLocalTargetSchema]).optional(), + targets: EvalTargetsSchema.optional(), model: z.never().optional(), // Run/result grouping label and flat run controls experiment: z.string().min(1).optional(), @@ -550,11 +681,18 @@ export const EvalFileSchema = z budget_usd: z.number().gt(0).optional(), threshold: z.number().min(0).max(1).optional(), default_test: DefaultTestSchema.optional(), + scenarios: z.array(ScenarioSchema).optional(), + derived_metrics: z.array(DerivedMetricSchema).optional(), + output_path: z.union([z.string().min(1), z.array(z.string().min(1))]).optional(), + env: z.record(z.string()).optional(), + nunjucks_filters: z.union([JsonObjectSchema, z.array(z.string().min(1))]).optional(), + extensions: z.array(z.union([z.string().min(1), JsonObjectSchema])).optional(), on_run_complete: z.union([z.string().min(1), z.array(z.string().min(1))]).optional(), policy: z.never().optional(), execution: z.never().optional(), // Suite-level assertions - assertions: z.array(EvaluatorSchema).optional(), + assert: z.array(AssertionItemSchema).optional(), + assertions: 
z.array(AssertionItemSchema).optional(), // Suite-level content preprocessors shared by evaluators preprocessors: z.array(PreprocessorSchema).optional(), // Workspace (inline object or path to external workspace YAML file) @@ -562,6 +700,9 @@ export const EvalFileSchema = z }) .refine( (value) => - value.tests !== undefined || value.eval_cases !== undefined || value.imports !== undefined, - { message: "Eval files must define 'tests' or 'imports'." }, + value.tests !== undefined || + value.eval_cases !== undefined || + value.imports !== undefined || + value.scenarios !== undefined, + { message: "Eval files must define 'tests', 'imports', or 'scenarios'." }, ); diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 785dbfc46..561231864 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -44,6 +44,17 @@ const ASSERTION_TYPES_WITH_ARRAY_VALUE = new Set([ 'icontains-any', 'icontains-all', ]); +const PROMPTFOO_ASSERTION_TYPES = new Set([ + 'assert-set', + 'g-eval', + 'llm-rubric', + 'javascript', + 'python', + 'webhook', + 'similar', + 'select-best', + 'human', +]); /** Valid file extensions for external test files. 
*/ const VALID_TEST_FILE_EXTENSIONS = new Set(['.yaml', '.yml', '.jsonl']); @@ -61,9 +72,11 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([ 'requires', 'input', 'input_files', + 'prompts', 'imports', 'tests', 'target', + 'targets', 'model', 'policy', 'experiment', @@ -76,6 +89,13 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([ 'budget_usd', 'threshold', 'default_test', + 'assert', + 'scenarios', + 'derived_metrics', + 'output_path', + 'env', + 'nunjucks_filters', + 'extensions', 'on_run_complete', 'assertions', 'evaluators', @@ -89,9 +109,35 @@ const KNOWN_TOP_LEVEL_FIELDS = new Set([ const KNOWN_INCLUDE_FIELDS = new Set(['include', 'type', 'select', 'run']); const KNOWN_IMPORT_FIELDS = new Set(['path', 'select', 'run']); const KNOWN_RUN_OVERRIDE_FIELDS = new Set(['threshold', 'repeat', 'timeout_seconds', 'budget_usd']); +const KNOWN_DEFAULT_TEST_FIELDS = new Set([ + 'vars', + 'provider', + 'providers', + 'prompts', + 'provider_output', + 'expected_output', + 'assert', + 'assertions', + 'assert_scoring_function', + 'options', + 'threshold', + 'metadata', +]); +const KNOWN_EVALUATE_OPTION_FIELDS = new Set([ + 'budget_usd', + 'max_concurrency', + 'cache', + 'delay', + 'generate_suggestions', + 'repeat', + 'timeout_ms', + 'max_eval_time_ms', + 'filter_range', +]); const KNOWN_REPEAT_FIELDS = new Set(['count', 'strategy', 'early_exit', 'cost_limit_usd']); const KNOWN_REPEAT_STRATEGIES = new Set(['pass_any', 'pass_all', 'mean', 'confidence_interval']); const KNOWN_TEST_EXECUTION_FIELDS = new Set([ + 'assert', 'assertions', 'evaluators', 'skip_defaults', @@ -107,7 +153,6 @@ const KNOWN_TEST_EXECUTION_FIELDS = new Set([ /** Removed top-level fields with migration hints. */ const REMOVED_TOP_LEVEL_FIELDS = new Map([ - ['assert', "'assert' has been removed. Use 'assertions' instead."], [ 'workers', "'workers' has been removed from eval YAML. 
Set authored eval concurrency with evaluate_options.max_concurrency, or operational defaults with --workers, agentv.config.*, .agentv/config.yaml execution.workers, or target-level runtime config.", @@ -135,12 +180,21 @@ const DEPRECATED_TOP_LEVEL_FIELDS = new Map([ /** Known fields at the test level. */ const KNOWN_TEST_FIELDS = new Set([ 'id', + 'description', 'vars', 'criteria', + 'provider', + 'providers', + 'prompts', + 'provider_output', 'input', 'input_files', 'expected_output', + 'assert', 'assertions', + 'assert_scoring_function', + 'options', + 'threshold', 'evaluators', 'rubrics', 'execution', @@ -159,9 +213,7 @@ const KNOWN_TEST_FIELDS = new Set([ ]); /** Removed test-level fields with migration hints. */ -const REMOVED_TEST_FIELDS = new Map([ - ['assert', "'assert' has been removed. Use 'assertions' instead."], -]); +const REMOVED_TEST_FIELDS = new Map([]); /** Deprecated test-level fields with migration hints. */ const DEPRECATED_TEST_FIELDS = new Map([ @@ -336,11 +388,14 @@ export async function validateEvalFile(filePath: string): Promise 0; + const hasScenarios = Array.isArray(parsed.scenarios); // tests can be a string path (external file/directory reference) or an array if (typeof cases === 'string') { @@ -356,7 +411,7 @@ export async function validateEvalFile(filePath: string): Promise, ): void { if (defaultTest === undefined) { return; @@ -1074,16 +1143,32 @@ function validateDefaultTest( } for (const key of Object.keys(defaultTest)) { - if (key !== 'threshold') { + if (!KNOWN_DEFAULT_TEST_FIELDS.has(key)) { errors.push({ severity: 'error', filePath, location: `default_test.${key}`, - message: 'Invalid default_test field. Supported fields: threshold.', + message: + 'Invalid default_test field. 
Supported fields: vars, provider, providers, prompts, provider_output, expected_output, assert, assertions, assert_scoring_function, options, threshold, metadata.', }); } } + validateAssertArray( + defaultTest.assert, + 'default_test.assert', + filePath, + errors, + customAssertionTypes, + ); + validateAssertArray( + defaultTest.assertions, + 'default_test.assertions', + filePath, + errors, + customAssertionTypes, + ); + const threshold = defaultTest.threshold; if ( threshold !== undefined && @@ -1118,7 +1203,7 @@ function validateEvaluateOptions( } for (const key of Object.keys(evaluateOptions)) { - if (key !== 'budget_usd' && key !== 'max_concurrency') { + if (!KNOWN_EVALUATE_OPTION_FIELDS.has(key)) { errors.push({ severity: 'warning', filePath, @@ -1153,6 +1238,95 @@ function validateEvaluateOptions( message: "Invalid 'max_concurrency' field (must be an integer between 1 and 50)", }); } + + validateNonNegativeNumber(evaluateOptions.delay, `${location}.delay`, filePath, errors); + validatePositiveNumber(evaluateOptions.timeout_ms, `${location}.timeout_ms`, filePath, errors); + validatePositiveNumber( + evaluateOptions.max_eval_time_ms, + `${location}.max_eval_time_ms`, + filePath, + errors, + ); + validateEvaluateOptionsRepeat(evaluateOptions.repeat, `${location}.repeat`, filePath, errors); + validateFilterRange(evaluateOptions.filter_range, `${location}.filter_range`, filePath, errors); +} + +function validatePositiveNumber( + value: JsonValue | undefined, + location: string, + filePath: string, + errors: ValidationError[], +): void { + if (value !== undefined && (typeof value !== 'number' || value <= 0)) { + errors.push({ + severity: 'error', + filePath, + location, + message: `Invalid '${location}' field (must be a positive number)`, + }); + } +} + +function validateNonNegativeNumber( + value: JsonValue | undefined, + location: string, + filePath: string, + errors: ValidationError[], +): void { + if (value !== undefined && (typeof value !== 'number' || value < 
0)) { + errors.push({ + severity: 'error', + filePath, + location, + message: `Invalid '${location}' field (must be a non-negative number)`, + }); + } +} + +function validateEvaluateOptionsRepeat( + repeat: JsonValue | undefined, + location: string, + filePath: string, + errors: ValidationError[], +): void { + if (repeat === undefined) { + return; + } + if (typeof repeat === 'number') { + if (!Number.isInteger(repeat) || repeat < 1) { + errors.push({ + severity: 'error', + filePath, + location, + message: "Invalid 'evaluate_options.repeat' field (must be a positive integer)", + }); + } + return; + } + validateRepeatOverride(repeat, location, filePath, errors); +} + +function validateFilterRange( + filterRange: JsonValue | undefined, + location: string, + filePath: string, + errors: ValidationError[], +): void { + if (filterRange === undefined || typeof filterRange === 'string') { + return; + } + if ( + !Array.isArray(filterRange) || + filterRange.length !== 2 || + filterRange.some((value) => typeof value !== 'number') + ) { + errors.push({ + severity: 'error', + filePath, + location, + message: "Invalid 'evaluate_options.filter_range' field (must be a two-number array)", + }); + } } function validateRepeatOverride( @@ -1416,6 +1590,34 @@ function validateWorkspaceRepoConfig( }); } + if ('type' in repo) { + errors.push({ + severity: 'error', + filePath, + location: `${location}.repos[path=${repo.path ?? '(none)'}]`, + message: 'workspace.repos[].type has been removed. Use workspace.repos[].repo.', + }); + } + + if ('resolve' in repo) { + errors.push({ + severity: 'error', + filePath, + location: `${location}.repos[path=${repo.path ?? '(none)'}]`, + message: 'workspace.repos[].resolve has been removed. Configure repo_resolvers instead.', + }); + } + + if ('resolver' in repo) { + errors.push({ + severity: 'error', + filePath, + location: `${location}.repos[path=${repo.path ?? '(none)'}]`, + message: + 'workspace.repos[].resolver has been removed. 
Configure repo_resolvers.repos patterns instead.', + }); + } + if (!repo.repo && !isObject(docker)) { errors.push({ severity: 'error', @@ -1796,18 +1998,21 @@ async function validateSuiteImportCyclesFromParsed( } function validateAssertArray( - assertField: JsonValue, - parentLocation: string, + assertField: JsonValue | undefined, + location: string, filePath: string, errors: ValidationError[], customAssertionTypes: ReadonlySet = new Set(), ): void { + if (assertField === undefined) { + return; + } if (!Array.isArray(assertField)) { errors.push({ severity: 'warning', filePath, - location: `${parentLocation}.assertions`, - message: "'assertions' must be an array of assertion objects.", + location, + message: `'${location}' must be an array of assertion objects.`, }); return; } @@ -1822,7 +2027,7 @@ function validateAssertArray( errors.push({ severity: 'warning', filePath, - location: `${parentLocation}.assertions[${i}]`, + location: `${location}[${i}]`, message: 'Empty string assertion item will be ignored.', }); } @@ -1832,7 +2037,7 @@ function validateAssertArray( errors.push({ severity: 'warning', filePath, - location: `${parentLocation}.assertions[${i}]`, + location: `${location}[${i}]`, message: 'Assertion item must be a string or an object with a type field.', }); continue; @@ -1841,7 +2046,7 @@ function validateAssertArray( } for (const { item, index } of objectItems) { - const location = `${parentLocation}.assertions[${index}]`; + const itemLocation = `${location}[${index}]`; // Validate type field const rawTypeValue = item.type; @@ -1849,7 +2054,7 @@ function validateAssertArray( errors.push({ severity: 'warning', filePath, - location: `${location}.type`, + location: `${itemLocation}.type`, message: "Assertion item is missing a 'type' field.", }); continue; @@ -1858,11 +2063,15 @@ function validateAssertArray( // Normalize snake_case to kebab-case for backward compatibility const typeValue = rawTypeValue.replace(/_/g, '-'); - if (!isGraderKind(typeValue) && 
!customAssertionTypes.has(typeValue)) { + if ( + !isGraderKind(typeValue) && + !PROMPTFOO_ASSERTION_TYPES.has(typeValue) && + !customAssertionTypes.has(typeValue) + ) { errors.push({ severity: 'warning', filePath, - location: `${location}.type`, + location: `${itemLocation}.type`, message: `Unknown assertion type '${rawTypeValue}'.`, }); continue; @@ -1875,7 +2084,7 @@ function validateAssertArray( errors.push({ severity: 'warning', filePath, - location: `${location}.value`, + location: `${itemLocation}.value`, message: `Assertion type '${typeValue}' requires a 'value' field (string).`, }); continue; @@ -1889,7 +2098,7 @@ function validateAssertArray( errors.push({ severity: 'warning', filePath, - location: `${location}.value`, + location: `${itemLocation}.value`, message: `Invalid regex pattern '${value}': not a valid regular expression.`, }); } @@ -1899,11 +2108,15 @@ function validateAssertArray( // Validate value field for types that require a string array value if (ASSERTION_TYPES_WITH_ARRAY_VALUE.has(typeValue)) { const value = item.value; - if (!Array.isArray(value) || value.length === 0) { + if ( + !Array.isArray(value) || + value.length === 0 || + value.some((entry) => typeof entry !== 'string') + ) { errors.push({ severity: 'warning', filePath, - location: `${location}.value`, + location: `${itemLocation}.value`, message: `Assertion type '${typeValue}' requires a 'value' field (non-empty string array).`, }); continue; @@ -1913,7 +2126,7 @@ function validateAssertArray( // Validate required field if present const required = item.required; if (required !== undefined) { - validateRequiredField(required, location, filePath, errors); + validateRequiredField(required, itemLocation, filePath, errors); } } } @@ -1928,21 +2141,19 @@ function validateRequiredField( return; // Valid } if (typeof required === 'number') { - if (required <= 0 || required > 1) { - errors.push({ - severity: 'warning', - filePath, - location: `${parentLocation}.required`, - message: 
`Invalid 'required' value ${required}. When a number, it must be between 0 (exclusive) and 1 (inclusive).`, - }); - } + errors.push({ + severity: 'warning', + filePath, + location: `${parentLocation}.required`, + message: `Numeric 'required: ${required}' has been removed. Use 'required: true' and 'min_score: ${required}' instead.`, + }); return; } errors.push({ severity: 'warning', filePath, location: `${parentLocation}.required`, - message: `Invalid 'required' value. Must be a boolean or a number between 0 (exclusive) and 1 (inclusive).`, + message: `Invalid 'required' value. Must be a boolean. Use 'min_score' for custom score thresholds.`, }); } diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index 990059a10..bd847fde3 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -7,7 +7,7 @@ import { COMMON_TARGET_SETTINGS, findDeprecatedCamelCaseTargetWarnings, } from '../providers/targets.js'; -import { KNOWN_PROVIDERS, PROVIDER_ALIASES } from '../providers/types.js'; +import { KNOWN_PROVIDERS } from '../providers/types.js'; import { parseYamlValue } from '../yaml-loader.js'; import type { ValidationError, ValidationResult } from './types.js'; @@ -133,7 +133,6 @@ const COPILOT_SDK_SETTINGS = new Set([ 'cwd', 'timeout_seconds', 'log_dir', - 'log_format', 'stream_log', 'system_prompt', 'subprovider', @@ -157,7 +156,6 @@ const COPILOT_CLI_SETTINGS = new Set([ 'cwd', 'timeout_seconds', 'log_dir', - 'log_format', 'stream_log', 'system_prompt', 'subprovider', @@ -208,16 +206,12 @@ const CLAUDE_SETTINGS = new Set([ 'timeout_seconds', 'log_dir', 'log_directory', - 'log_format', - 'log_output_format', 'stream_log', 'system_prompt', 'max_turns', 'max_budget_usd', ]); -const CC_MIRROR_SETTINGS = new Set([...CLAUDE_SETTINGS, 'variant']); - function getKnownSettings(provider: string): Set | null { const 
normalizedProvider = provider.toLowerCase(); switch (normalizedProvider) { @@ -226,27 +220,18 @@ function getKnownSettings(provider: string): Set | null { case 'openrouter': return OPENROUTER_SETTINGS; case 'azure': - case 'azure-openai': return AZURE_SETTINGS; case 'anthropic': return ANTHROPIC_SETTINGS; case 'gemini': - case 'google': - case 'google-gemini': return GEMINI_SETTINGS; case 'codex': - case 'codex-cli': return CODEX_SETTINGS; case 'copilot-sdk': - case 'copilot_sdk': return COPILOT_SDK_SETTINGS; - case 'copilot': case 'copilot-cli': return COPILOT_CLI_SETTINGS; - case 'cc-mirror': - return CC_MIRROR_SETTINGS; case 'claude': - case 'claude-code': case 'claude-cli': case 'claude-sdk': return CLAUDE_SETTINGS; @@ -296,32 +281,12 @@ function validateUnknownSettings( azure: new Map([ [ 'api_format', - "The 'api_format' field is no longer supported on Azure targets. " + + "The 'api_format' field has been removed from Azure targets. " + "AgentV always uses Azure's Responses API (`/openai/v1/responses`). " + "If your deployment only exposes /chat/completions, use 'provider: openai' " + "with a deployment-scoped 'base_url' instead.", ], ]), - codex: new Map([ - [ - 'log_format', - "The 'log_format' field is no longer supported on Codex targets. Use 'stream_log: raw' for per-event logs or 'stream_log: summary' for consolidated logs.", - ], - [ - 'log_output_format', - "The 'log_output_format' field is no longer supported on Codex targets. Use 'stream_log: raw' for per-event logs or 'stream_log: summary' for consolidated logs.", - ], - ]), - 'codex-cli': new Map([ - [ - 'log_format', - "The 'log_format' field is no longer supported on Codex targets. Use 'stream_log: raw' for per-event logs or 'stream_log: summary' for consolidated logs.", - ], - [ - 'log_output_format', - "The 'log_output_format' field is no longer supported on Codex targets. 
Use 'stream_log: raw' for per-event logs or 'stream_log: summary' for consolidated logs.", - ], - ]), }; const removedForProvider = removedPerProvider[provider]; @@ -336,6 +301,15 @@ function validateUnknownSettings( }); continue; } + if (key === 'log_format' || key === 'log_output_format') { + errors.push({ + severity: 'error', + filePath: absolutePath, + location: `${location}.${key}`, + message: `The '${key}' field has been removed. Use 'stream_log: raw' for per-event logs or 'stream_log: summary' for consolidated logs.`, + }); + continue; + } if (removedForProvider?.has(key)) { errors.push({ severity: 'error', @@ -548,7 +522,7 @@ export async function validateTargetsFile(filePath: string): Promise { const resolvers = this.loadRepoResolvers(); - if (resolvers.length === 0) { - if (repo.resolver) { - throw new Error(`workspace.repos[].resolver '${repo.resolver}' is not configured.`); - } - return undefined; - } + if (resolvers.length === 0) return undefined; const originUrl = resolveRepoCloneUrl(repo.repo ?? 
''); const selection = selectRepoResolver(repo, resolvers); @@ -663,12 +658,6 @@ export class RepoManager { }; } - if (selection.kind === 'explicit') { - throw new Error( - `Repo resolver '${selection.resolver.name}' was selected by workspace.repos[].resolver but returned handled:false.`, - ); - } - if (selection.kind === 'pattern') { const defaultResolver = resolvers.find((resolver) => resolver.name === 'default'); if (defaultResolver) { diff --git a/packages/core/src/evaluation/workspace/repo-resolver.ts b/packages/core/src/evaluation/workspace/repo-resolver.ts index d2b5ae293..5aca876e1 100644 --- a/packages/core/src/evaluation/workspace/repo-resolver.ts +++ b/packages/core/src/evaluation/workspace/repo-resolver.ts @@ -52,7 +52,6 @@ export interface RepoResolverUnhandledResult { export type RepoResolverResult = RepoResolverHandledResult | RepoResolverUnhandledResult; export type RepoResolverSelection = - | { readonly kind: 'explicit'; readonly resolver: RepoResolverConfig } | { readonly kind: 'pattern'; readonly resolver: RepoResolverConfig } | { readonly kind: 'default'; readonly resolver: RepoResolverConfig }; @@ -99,14 +98,6 @@ export function selectRepoResolver( repo: RepoConfig, resolvers: readonly RepoResolverConfig[], ): RepoResolverSelection | undefined { - if (repo.resolver) { - const resolver = resolvers.find((candidate) => candidate.name === repo.resolver); - if (!resolver) { - throw new Error(`workspace.repos[].resolver '${repo.resolver}' is not configured.`); - } - return { kind: 'explicit', resolver }; - } - const patternResolver = resolvers.find( (resolver) => resolver.name !== 'default' && matchesRepoPatterns(repo, resolver.repos), ); diff --git a/packages/core/src/evaluation/workspace/script-executor.ts b/packages/core/src/evaluation/workspace/script-executor.ts index 812666e4d..ffb98c461 100644 --- a/packages/core/src/evaluation/workspace/script-executor.ts +++ b/packages/core/src/evaluation/workspace/script-executor.ts @@ -63,14 +63,7 @@ 
export async function executeWorkspaceScript( const timeoutMs = config.timeout_ms ?? (failureMode === 'fatal' ? 60000 : 30000); const cwd = config.cwd ?? context.workspaceFileDir ?? context.evalDir; - // Support both command (canonical) and script (deprecated alias) - if (config.script !== undefined && config.command === undefined) { - console.warn( - "\u001b[33mWarning: 'script' is deprecated in workspace config. Use 'command' instead.\u001b[0m", - ); - } - const rawCommand = config.command ?? config.script ?? []; - const commandArray = interpolateArgs(rawCommand, context); + const commandArray = interpolateArgs(config.command, context); const result = await execFileWithStdin(commandArray, stdin, { timeoutMs, diff --git a/packages/core/src/evaluation/workspace/setup.ts b/packages/core/src/evaluation/workspace/setup.ts index cd7ccbbb4..df0004df2 100644 --- a/packages/core/src/evaluation/workspace/setup.ts +++ b/packages/core/src/evaluation/workspace/setup.ts @@ -151,21 +151,20 @@ export function toScriptConfig( hookName: string, context: string, ): WorkspaceScriptConfig { - const command = hook.command ?? 
hook.script; + const command = hook.command; if (!command || command.length === 0) { - throw new Error(`${hookName} hook in ${context} requires command or script`); + throw new Error(`${hookName} hook in ${context} requires command`); } return { command, ...(hook.timeout_ms !== undefined && { timeout_ms: hook.timeout_ms }), ...(hook.timeoutMs !== undefined && { timeoutMs: hook.timeoutMs }), ...(hook.cwd !== undefined && { cwd: hook.cwd }), - ...(hook.script !== undefined && { script: hook.script }), }; } export function hasHookCommand(hook: WorkspaceHookConfig | undefined): hook is WorkspaceHookConfig { - return !!((hook?.command && hook.command.length > 0) || (hook?.script && hook.script.length > 0)); + return !!(hook?.command && hook.command.length > 0); } /** @@ -338,7 +337,7 @@ export async function resetWorkspaceRoot( } function commandForHook(hook: WorkspaceHookConfig | undefined): readonly string[] | undefined { - return hook?.command ?? hook?.script; + return hook?.command; } function hookExecution(options: { @@ -606,7 +605,7 @@ export async function prepareSharedWorkspaceSetup( const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all; if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) { const beforeAllHook = suiteBeforeAllHook; - const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(' '); + const beforeAllCommand = (beforeAllHook.command ?? []).join(' '); setupLog( `running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`, ); @@ -1029,7 +1028,7 @@ export async function prepareEvalCaseWorkspace( const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all; if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) { const beforeAllHook = caseBeforeAllHook; - const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(' '); + const beforeAllCommand = (beforeAllHook.command ?? 
[]).join(' '); if (setupDebug) { console.log( `[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`, diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 7e6e87535..a5a99ef18 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -1754,7 +1754,7 @@ function collectSingleGraderSourceReferences( const references: EvalSourceReference[] = []; if (evaluator.type === 'code-grader') { - const command = evaluator.command ?? evaluator.script ?? []; + const command = evaluator.command ?? []; references.push({ kind: 'code_grader_command', displayPath: evaluator.resolvedScriptPath ?? command.join(' '), @@ -1927,7 +1927,6 @@ function parseCommandArray(source: unknown): string[] | undefined { /** * Parse a WorkspaceScriptConfig from raw YAML value. - * Accepts both `command` (preferred) and `script` (deprecated alias). * Command can be an array of strings or a single string (auto-split on whitespace). * Note: string commands are split naively on whitespace. For arguments containing * spaces, use the array form: command: ["node", "path with spaces/setup.mjs"] @@ -1938,12 +1937,11 @@ function parseWorkspaceScriptConfig( ): WorkspaceScriptConfig | undefined { if (!isJsonObject(raw)) return undefined; const obj = raw as Record; - // Precedence: command > script (deprecated) - if (obj.script !== undefined && obj.command === undefined) { - logWarning("'script' is deprecated. Use 'command' instead."); + if (obj.script !== undefined) { + throw new Error("Workspace hook field 'script' has been removed. Use 'command' instead."); } - const command = parseCommandArray(obj.command ?? obj.script); + const command = parseCommandArray(obj.command); if (!command) return undefined; const timeoutMs = typeof obj.timeout_ms === 'number' ? 
obj.timeout_ms : undefined; @@ -1966,13 +1964,13 @@ function parseWorkspaceHookConfig( evalFileDir: string, ): WorkspaceHookConfig | undefined { if (!isJsonObject(raw)) return undefined; - const script = parseWorkspaceScriptConfig(raw, evalFileDir); + const commandConfig = parseWorkspaceScriptConfig(raw, evalFileDir); const obj = raw as Record; const reset = obj.reset === 'none' || obj.reset === 'fast' || obj.reset === 'strict' ? obj.reset : undefined; - if (!script && !reset) return undefined; + if (!commandConfig && !reset) return undefined; return { - ...(script ?? {}), + ...(commandConfig ?? {}), ...(reset !== undefined && { reset }), }; } diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index ada1cbb8e..b9a12327b 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -182,7 +182,6 @@ export { } from './config-overlays.js'; export { getAgentvConfigDir, - getAgentvHome, getAgentvDataDir, getWorkspacesRoot, getSubagentsRoot, diff --git a/packages/core/src/paths.ts b/packages/core/src/paths.ts index 9a60fe592..03af1dde4 100644 --- a/packages/core/src/paths.ts +++ b/packages/core/src/paths.ts @@ -16,15 +16,6 @@ export function getAgentvConfigDir(): string { return readEnvPath('AGENTV_HOME') ?? path.join(os.homedir(), '.agentv'); } -/** - * Backward-compatible alias for AgentV's home/config directory. - * Prefer getAgentvConfigDir() for lightweight config files and - * getAgentvDataDir() for heavy runtime data. - */ -export function getAgentvHome(): string { - return getAgentvConfigDir(); -} - /** * AgentV's heavy runtime data directory. Stores workspaces, workspace pool, * subagents, trace state, caches, downloaded dependencies, and results clones. 
diff --git a/packages/core/test/evaluation/evaluate-enhanced.test.ts b/packages/core/test/evaluation/evaluate-enhanced.test.ts index 068574f50..6a4fbfe76 100644 --- a/packages/core/test/evaluation/evaluate-enhanced.test.ts +++ b/packages/core/test/evaluation/evaluate-enhanced.test.ts @@ -127,18 +127,24 @@ describe('evaluate() — enhanced features', () => { expect(summary.passed).toBe(2); }); - it('supports legacy expected_output for backwards compatibility', async () => { - const { summary } = await evaluate({ - tests: [ - { - id: 'legacy', - input: 'hello', - expected_output: 'world', - assertions: [{ type: 'equals', value: 'world' }], - }, - ], - target: { name: 'default', provider: 'mock', response: 'world' }, - }); - expect(summary.passed).toBe(1); + it('rejects removed expected_output in inline tests', async () => { + const removedKey = ['expected', 'output'].join('_'); + const removedAliasTest: { + readonly id: string; + readonly input: string; + readonly assertions: readonly { readonly type: string; readonly value: string }[]; + readonly [key: string]: unknown; + } = { + id: 'removed-expected-output', + input: 'hello', + [removedKey]: 'world', + assertions: [{ type: 'equals', value: 'world' }], + }; + await expect( + evaluate({ + tests: [removedAliasTest], + target: { name: 'default', provider: 'mock', response: 'world' }, + }), + ).rejects.toThrow("'expected_output' has been removed"); }); }); diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts index fc894370f..7485f8c2b 100644 --- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -252,6 +252,35 @@ describe('evaluate() — programmatic API extensions', () => { PROGRAMMATIC_API_TIMEOUT_MS, ); + it( + 'rejects removed expected_output on individual turns', + async () => { + const removedKey = ['expected', 'output'].join('_'); + const 
removedAliasTurn: { + readonly input: string; + readonly assertions: readonly { readonly type: string; readonly value: string }[]; + readonly [key: string]: unknown; + } = { + input: 'Say hello', + [removedKey]: 'Hello!', + assertions: [{ type: 'contains', value: 'mock' }], + }; + + await expect( + evaluate({ + tests: [ + { + id: 'turn-removed-expected-output', + turns: [removedAliasTurn], + }, + ], + target: { name: 'default', provider: 'mock', response: 'mock response' }, + }), + ).rejects.toThrow("'expected_output' has been removed"); + }, + PROGRAMMATIC_API_TIMEOUT_MS, + ); + it( 'supports message array input in turns', async () => { diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts index 6ad24e94b..1d9392e0a 100644 --- a/packages/core/test/evaluation/experiment.test.ts +++ b/packages/core/test/evaluation/experiment.test.ts @@ -79,7 +79,7 @@ describe('inline experiment config', () => { expect(() => normalizeExperimentConfig({ repeat: { count: 2, costLimitUsd: 1 } })).toThrow( /repeat.costLimitUsd/, ); - expect(() => normalizeExperimentConfig({ setup: [{ script: 'bun install' }] })).toThrow( + expect(() => normalizeExperimentConfig({ setup: [{ command: 'bun install' }] })).toThrow( /setup is not supported/, ); expect(() => normalizeExperimentConfig({ scripts: ['bun test'] })).toThrow( diff --git a/packages/core/test/evaluation/loaders/grader-parser.test.ts b/packages/core/test/evaluation/loaders/grader-parser.test.ts index 4e2329cfd..6b8e2e9e2 100644 --- a/packages/core/test/evaluation/loaders/grader-parser.test.ts +++ b/packages/core/test/evaluation/loaders/grader-parser.test.ts @@ -467,7 +467,7 @@ describe('parseGraders - code-grader config pass-through', () => { { name: 'fuzzy-matcher', type: 'code-grader', - script: ['bun', 'run', './test_script.ts'], + command: ['bun', 'run', './test_script.ts'], fields: [ { path: 'supplier.name', threshold: 0.85 }, { path: 'importer.name', threshold: 0.9 }, @@ 
-500,7 +500,7 @@ describe('parseGraders - code-grader config pass-through', () => { { name: 'simple-grader', type: 'code-grader', - script: ['bun', 'run', './test_script.ts'], + command: ['bun', 'run', './test_script.ts'], }, ], }; @@ -519,7 +519,7 @@ describe('parseGraders - code-grader config pass-through', () => { { name: 'with-weight', type: 'code-grader', - script: ['bun', 'run', './test_script.ts'], + command: ['bun', 'run', './test_script.ts'], cwd: tempDir, weight: 2.0, required: true, @@ -545,13 +545,13 @@ describe('parseGraders - code-grader config pass-through', () => { expect(config.config).toEqual({ threshold: 0.9, algorithm: 'levenshtein' }); }); - it('converts string scripts into argv using a shell', async () => { + it('converts string commands into argv using a shell', async () => { const rawEvalCase = { evaluators: [ { - name: 'legacy-script', + name: 'shell-command', type: 'code-grader', - script: './test_script.ts', + command: './test_script.ts', }, ], }; @@ -566,6 +566,25 @@ describe('parseGraders - code-grader config pass-through', () => { expect(config.command).toEqual(['sh', '-lc', './test_script.ts']); } }); + + it('rejects removed code-grader script alias', async () => { + await expect( + parseGraders( + { + evaluators: [ + { + name: 'legacy-script', + type: 'code-grader', + script: './test_script.ts', + }, + ], + }, + undefined, + [tempDir], + 'test-case', + ), + ).rejects.toThrow(/'script' has been removed.*command/); + }); }); describe('parseGraders - kebab-case type normalization', () => { @@ -596,7 +615,7 @@ describe('parseGraders - kebab-case type normalization', () => { { name: 'kebab-code', type: 'code-grader', - script: ['bun', 'run', './test_script.ts'], + command: ['bun', 'run', './test_script.ts'], }, ], }; @@ -709,11 +728,37 @@ describe('parseGraders - score_ranges rubrics', () => { expect(rubric?.id).toBe('accuracy'); expect(rubric?.weight).toBe(2.0); expect(rubric?.min_score).toBe(0.7); - 
expect(rubric?.required_min_score).toBe(7); expect(rubric?.score_ranges).toHaveLength(4); } }); + it('rejects removed required_min_score', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'correctness', + type: 'llm-grader', + rubrics: [ + { + id: 'accuracy', + required_min_score: 7, + score_ranges: [ + { score_range: [0, 3], outcome: 'Incorrect' }, + { score_range: [4, 6], outcome: 'Partially correct' }, + { score_range: [7, 9], outcome: 'Mostly correct' }, + { score_range: [10, 10], outcome: 'Fully correct' }, + ], + }, + ], + }, + ], + }; + + await expect( + parseGraders(rawEvalCase, undefined, [process.cwd()], 'test-case'), + ).rejects.toThrow(/required_min_score.*has been removed/i); + }); + it('throws on overlapping score_ranges', async () => { const rawEvalCase = { evaluators: [ @@ -828,7 +873,6 @@ describe('parseGraders - score_ranges shorthand map', () => { const rubric = config.rubrics?.[0]; expect(rubric?.id).toBe('accuracy'); expect(rubric?.min_score).toBe(0.7); - expect(rubric?.required_min_score).toBe(7); expect(rubric?.score_ranges).toHaveLength(4); expect(rubric?.score_ranges?.[0]).toEqual({ score_range: [0, 2], @@ -1759,6 +1803,44 @@ describe('parseGraders - type: rubrics with criteria', () => { expect(evaluators).toHaveLength(1); expect((evaluators?.[0] as LlmGraderConfig).rubrics).toHaveLength(2); }); + + it('preserves score_ranges in rubrics assertion criteria', async () => { + const evaluators = await parseGraders( + { + assertions: [ + { + type: 'rubrics', + criteria: [ + { + id: 'quality', + outcome: 'Answer quality', + min_score: 0.8, + score_ranges: [ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], outcome: 'Strong' }, + ], + }, + ], + }, + ], + }, + undefined, + [tempDir], + 'test-1', + ); + + expect(evaluators).toHaveLength(1); + const config = evaluators?.[0] as LlmGraderConfig; + expect(config.name).toBe('rubrics'); + 
expect(config.type).toBe('llm-grader'); + expect(config.rubrics?.[0]?.min_score).toBe(0.8); + expect(config.rubrics?.[0]?.score_ranges).toEqual([ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], outcome: 'Strong' }, + ]); + }); }); describe('parseGraders - required field', () => { @@ -1788,10 +1870,31 @@ describe('parseGraders - required field', () => { expect(config.required).toBe(true); }); - it('parses required: 0.6 (numeric threshold) on contains evaluator', async () => { + it('rejects required: 0.6 numeric threshold on contains evaluator', async () => { + await expect( + parseGraders( + { + evaluators: [{ name: 'check', type: 'contains', value: 'DENIED', required: 0.6 }], + }, + undefined, + [tempDir], + 'test-1', + ), + ).rejects.toThrow(/numeric 'required: 0\.6' has been removed/i); + }); + + it('parses required: true with min_score on contains evaluator', async () => { const evaluators = await parseGraders( { - evaluators: [{ name: 'check', type: 'contains', value: 'DENIED', required: 0.6 }], + evaluators: [ + { + name: 'check', + type: 'contains', + value: 'DENIED', + required: true, + min_score: 0.6, + }, + ], }, undefined, [tempDir], @@ -1799,7 +1902,8 @@ describe('parseGraders - required field', () => { ); expect(evaluators).toHaveLength(1); const config = evaluators?.[0] as ContainsGraderConfig; - expect(config.required).toBe(0.6); + expect(config.required).toBe(true); + expect(config.min_score).toBe(0.6); }); it('ignores required: false', async () => { @@ -1837,7 +1941,7 @@ describe('parseGraders - required field', () => { { name: 'code-check', type: 'code-grader', - script: ['bun', 'run', './test_script.ts'], + command: ['bun', 'run', './test_script.ts'], required: true, }, ], @@ -1851,10 +1955,10 @@ describe('parseGraders - required field', () => { expect(config.required).toBe(true); }); - it('parses required on llm-grader evaluator', async () => { + it('parses required with 
min_score on llm-grader evaluator', async () => { const evaluators = await parseGraders( { - evaluators: [{ name: 'grader', type: 'llm-grader', required: 0.7 }], + evaluators: [{ name: 'grader', type: 'llm-grader', required: true, min_score: 0.7 }], }, undefined, [tempDir], @@ -1862,25 +1966,55 @@ describe('parseGraders - required field', () => { ); expect(evaluators).toHaveLength(1); const config = evaluators?.[0] as LlmGraderConfig; - expect(config.required).toBe(0.7); + expect(config.required).toBe(true); + expect(config.min_score).toBe(0.7); }); - it('ignores invalid required values (string, negative, > 1)', async () => { + it('rejects numeric required values', async () => { + await expect( + parseGraders( + { + evaluators: [{ name: 'check', type: 'contains', value: 'DENIED', required: 0 }], + }, + undefined, + [tempDir], + 'test-1', + ), + ).rejects.toThrow(/numeric 'required: 0' has been removed/i); + + await expect( + parseGraders( + { + evaluators: [{ name: 'check', type: 'contains', value: 'DENIED', required: 1.5 }], + }, + undefined, + [tempDir], + 'test-1', + ), + ).rejects.toThrow(/numeric 'required: 1\.5' has been removed/i); + + await expect( + parseGraders( + { + evaluators: [{ name: 'check', type: 'contains', value: 'DENIED', required: -0.5 }], + }, + undefined, + [tempDir], + 'test-1', + ), + ).rejects.toThrow(/numeric 'required: -0\.5' has been removed/i); + }); + + it('ignores non-numeric invalid required values', async () => { const evaluators = await parseGraders( { - evaluators: [ - { name: 'c1', type: 'contains', value: 'A', required: 'yes' }, - { name: 'c2', type: 'contains', value: 'B', required: -0.5 }, - { name: 'c3', type: 'contains', value: 'C', required: 1.5 }, - { name: 'c4', type: 'contains', value: 'D', required: 0 }, - ], + evaluators: [{ name: 'c1', type: 'contains', value: 'A', required: 'yes' }], }, undefined, [tempDir], 'test-1', ); - expect(evaluators).toHaveLength(4); - // All invalid required values should be dropped 
(undefined) + expect(evaluators).toHaveLength(1); for (const config of evaluators ?? []) { expect((config as ContainsGraderConfig).required).toBeUndefined(); } diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts index deb73ae68..e3f377943 100644 --- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts +++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts @@ -230,6 +230,50 @@ describe('loadTestsFromJsonl', () => { expect(rubricEvaluator.rubrics).toHaveLength(2); }); + it('supports inline rubrics field with score_ranges', async () => { + const jsonlPath = path.join(tempDir, 'with-score-range-rubrics.jsonl'); + await writeFile( + jsonlPath, + `${JSON.stringify({ + id: 'test-1', + criteria: 'Goal', + input: [{ role: 'user', content: 'Query' }], + rubrics: [ + { + id: 'quality', + outcome: 'Answer quality', + min_score: 0.8, + score_ranges: [ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], outcome: 'Strong' }, + ], + }, + ], + })}\n`, + ); + + const cases = await loadTestsFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].assertions).toHaveLength(1); + expect(cases[0].assertions?.[0]).toMatchObject({ + name: 'rubrics', + type: 'llm-grader', + rubrics: [ + { + id: 'quality', + min_score: 0.8, + score_ranges: [ + { score_range: [0, 4], outcome: 'Weak' }, + { score_range: [5, 7], outcome: 'Adequate' }, + { score_range: [8, 10], outcome: 'Strong' }, + ], + }, + ], + }); + }); + it('filters by pattern (exact match)', async () => { const jsonlPath = path.join(tempDir, 'filter.jsonl'); await writeFile( diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 5b5307f86..73c544135 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -2644,14 
+2644,15 @@ describe('required gates', () => { expectedIndividualScores: undefined, }, { - label: 'numeric required threshold triggers gate when score is below threshold', + label: 'min_score threshold triggers required gate when score is below threshold', output: 'The answer is goodbye', assertions: [ { name: 'must-pass', type: 'contains' as const, value: 'hello', - required: 0.6 as boolean | number, + required: true, + min_score: 0.6, }, { name: 'optional', type: 'contains' as const, value: 'goodbye' }, ], @@ -2659,14 +2660,15 @@ describe('required gates', () => { expectedIndividualScores: undefined, }, { - label: 'numeric required threshold passes when score meets threshold', + label: 'min_score threshold passes when required grader meets threshold', output: 'hello world', assertions: [ { name: 'must-pass', type: 'contains' as const, value: 'hello', - required: 0.6 as boolean | number, + required: true, + min_score: 0.6, }, { name: 'optional', type: 'contains' as const, value: 'foo' }, ], diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index ffb9ab54a..1ec0c2f5c 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -231,7 +231,7 @@ describe('resolveTargetDefinition', () => { expect(target.config.version).toBe('2024-08-01-preview'); }); - it('rejects azure api_format with a migration error', () => { + it('rejects azure api_format with a removed-field error', () => { const env = { AZURE_OPENAI_ENDPOINT: 'https://example.openai.azure.com', AZURE_OPENAI_API_KEY: 'secret', @@ -250,7 +250,97 @@ describe('resolveTargetDefinition', () => { }, env, ), - ).toThrow(/'api_format' field is no longer supported/i); + ).toThrow(/'api_format' field has been removed/i); + }); + + it('rejects removed judge_target alias', () => { + const env = { + OPENAI_API_KEY: 'secret', + } satisfies Record; + + expect(() => + 
resolveTargetDefinition( + { + name: 'openai-with-judge-target', + provider: 'openai', + api_key: '${{ OPENAI_API_KEY }}', + model: 'gpt-5-mini', + judge_target: 'grader', + } as never, + env, + ), + ).toThrow(/judge_target.*has been removed/i); + }); + + it('rejects removed log_format target aliases', () => { + expect(() => + resolveTargetDefinition( + { + name: 'copilot-log-format', + provider: 'copilot-cli', + log_format: 'json', + } as never, + {}, + ), + ).toThrow(/log_format.*has been removed.*stream_log/i); + + expect(() => + resolveTargetDefinition( + { + name: 'claude-log-output-format', + provider: 'claude', + log_output_format: 'summary', + } as never, + {}, + ), + ).toThrow(/log_output_format.*has been removed.*stream_log/i); + }); + + it('maps canonical stream_log values to agent logger config', () => { + const raw = resolveTargetDefinition( + { + name: 'copilot-raw-log', + provider: 'copilot-cli', + stream_log: 'raw', + }, + {}, + ); + expect(raw.kind).toBe('copilot-cli'); + if (raw.kind !== 'copilot-cli') { + throw new Error('expected copilot-cli target'); + } + expect(raw.config.streamLog).toBe('raw'); + expect(raw.config.logFormat).toBe('json'); + + const summary = resolveTargetDefinition( + { + name: 'claude-summary-log', + provider: 'claude', + stream_log: 'summary', + }, + {}, + ); + expect(summary.kind).toBe('claude-cli'); + if (summary.kind !== 'claude-cli') { + throw new Error('expected claude-cli target'); + } + expect(summary.config.streamLog).toBe('summary'); + expect(summary.config.logFormat).toBe('summary'); + + const disabled = resolveTargetDefinition( + { + name: 'pi-disabled-log', + provider: 'pi-cli', + stream_log: false, + }, + {}, + ); + expect(disabled.kind).toBe('pi-cli'); + if (disabled.kind !== 'pi-cli') { + throw new Error('expected pi-cli target'); + } + expect(disabled.config.streamLog).toBe(false); + expect(disabled.config.logFormat).toBeUndefined(); }); it('defaults azure to api version v1', () => { @@ -441,7 +531,7 @@ 
describe('resolveTargetDefinition', () => { const target = resolveTargetDefinition( { name: 'gemini-flash', - provider: 'google-gemini', + provider: 'gemini', api_key: '${{ GOOGLE_API_KEY }}', model: 'gemini-1.5-flash', }, @@ -700,16 +790,21 @@ describe('resolveTargetDefinition', () => { ).toThrow(/reasoning_effort must be one of: minimal, low, medium, high, xhigh/); }); - it('resolves copilot alias to copilot-cli', () => { + it('does not canonicalize removed provider aliases to built-ins', () => { const target = resolveTargetDefinition( { - name: 'copilot-alias', + name: 'copilot-alias-removed', provider: 'copilot', }, {}, ); - expect(target.kind).toBe('copilot-cli'); + expect(target.kind).toBe('cli'); + if (target.kind !== 'cli') { + throw new Error('expected discovered cli target'); + } + + expect(target.config.command).toBe('bun run .agentv/providers/copilot.ts {PROMPT}'); }); it('claude-cli defaults executable to claude', () => { @@ -747,43 +842,6 @@ describe('resolveTargetDefinition', () => { expect(target.config.executable).toBe('claude-zai'); }); - it('cc-mirror with explicit executable resolves to claude-cli kind', () => { - const target = resolveTargetDefinition( - { - name: 'claude-zai', - provider: 'cc-mirror', - executable: '/usr/local/bin/claude-zai', - }, - {}, - ); - - expect(target.kind).toBe('claude-cli'); - if (target.kind !== 'claude-cli') { - throw new Error('expected claude-cli target'); - } - - expect(target.config.executable).toBe('/usr/local/bin/claude-zai'); - }); - - it('cc-mirror with explicit variant and executable', () => { - const target = resolveTargetDefinition( - { - name: 'my-mirror', - provider: 'cc-mirror', - variant: 'claude-zai', - executable: '/opt/bin/zai', - }, - {}, - ); - - expect(target.kind).toBe('claude-cli'); - if (target.kind !== 'claude-cli') { - throw new Error('expected claude-cli target'); - } - - expect(target.config.executable).toBe('/opt/bin/zai'); - }); - it('resolves copilot-cli as its own provider kind', () 
=> { const target = resolveTargetDefinition( { diff --git a/packages/core/test/evaluation/repo-schema-validation.test.ts b/packages/core/test/evaluation/repo-schema-validation.test.ts index 27f55cc48..476d20810 100644 --- a/packages/core/test/evaluation/repo-schema-validation.test.ts +++ b/packages/core/test/evaluation/repo-schema-validation.test.ts @@ -102,6 +102,24 @@ describe('repo lifecycle schema validation', () => { expect(result.success).toBe(false); }); + it('rejects removed repo acquisition fields', () => { + const result = EvalFileSchema.safeParse({ + ...baseEval, + workspace: { + repos: [ + { + path: './repo-a', + repo: 'https://github.com/org/repo.git', + type: 'git', + resolve: 'custom', + resolver: 'custom', + }, + ], + }, + }); + expect(result.success).toBe(false); + }); + it('rejects negative ancestor', () => { const result = EvalFileSchema.safeParse({ ...baseEval, diff --git a/packages/core/test/evaluation/validation/config-validator.test.ts b/packages/core/test/evaluation/validation/config-validator.test.ts index 68328e50e..5d45de69c 100644 --- a/packages/core/test/evaluation/validation/config-validator.test.ts +++ b/packages/core/test/evaluation/validation/config-validator.test.ts @@ -298,7 +298,7 @@ describe('validateConfigFile', () => { ); }); - it('warns on deprecated results_by_project', async () => { + it('treats removed results_by_project as an unexpected field', async () => { const filePath = path.join(tempDir, 'deprecated-results-by-project.yaml'); await writeFile( filePath, @@ -315,7 +315,7 @@ describe('validateConfigFile', () => { expect(result.errors).toContainEqual( expect.objectContaining({ severity: 'warning', - location: 'results_by_project', + message: 'Unexpected fields: results_by_project', }), ); }); diff --git a/packages/core/test/evaluation/validation/eval-file-schema.test.ts b/packages/core/test/evaluation/validation/eval-file-schema.test.ts index c0e4bba2b..5f9a83b11 100644 --- 
a/packages/core/test/evaluation/validation/eval-file-schema.test.ts +++ b/packages/core/test/evaluation/validation/eval-file-schema.test.ts @@ -162,6 +162,97 @@ describe('EvalFileSchema input shorthand', () => { expect(result.success).toBe(true); }); + it('accepts a snake_cased promptfoo-shaped eval config', () => { + const result = EvalFileSchema.safeParse({ + description: 'Promptfoo-compatible authoring shape', + tags: { + suite: 'smoke', + }, + prompts: [ + { + label: 'reviewer', + raw: 'Review {{ vars.diff }}', + }, + ], + targets: [ + { + id: 'local-agent', + provider: 'codex', + config: { + model: 'gpt-5.4-mini', + }, + }, + ], + default_test: { + vars: { + tone: 'concise', + }, + assert: ['Mentions the highest-risk issue'], + options: { + disable_default_asserts: true, + }, + threshold: 0.7, + metadata: { + priority: 'p0', + }, + }, + tests: [ + { + description: 'grades a fixed provider output', + vars: { + diff: 'change', + }, + provider_output: 'Looks safe.', + assert: [ + { + type: 'contains', + value: 'safe', + metric: 'safety_text', + threshold: 0.5, + }, + { + type: 'g-eval', + value: ['Identifies user impact', 'Avoids unsupported claims'], + score_ranges: [{ score_range: [0, 10], outcome: 'overall quality' }], + }, + ], + }, + ], + scenarios: [ + { + description: 'severity variants', + config: [{ vars: { severity: 'high' } }], + tests: [ + { + vars: { diff: 'critical fix' }, + assert: [{ type: 'llm-rubric', value: 'Flags the risk clearly' }], + }, + ], + }, + ], + derived_metrics: [{ name: 'weighted_quality', value: 'safety_text * 0.5' }], + output_path: 'results.json', + env: { + EVAL_MODE: 'local', + }, + nunjucks_filters: { + slug: './filters/slug.ts', + }, + extensions: ['agentv:agent-rules'], + evaluate_options: { + cache: true, + delay: 100, + generate_suggestions: false, + repeat: 2, + timeout_ms: 30_000, + max_eval_time_ms: 120_000, + filter_range: [0, 10], + }, + }); + + expect(result.success).toBe(true); + }); + it('rejects invalid 
default_test values', () => { const invalidThreshold = EvalFileSchema.safeParse({ default_test: { @@ -172,7 +263,7 @@ describe('EvalFileSchema input shorthand', () => { const unknownDefault = EvalFileSchema.safeParse({ default_test: { threshold: 0.6, - assertions: [], + unsupported: true, }, tests: [baseTest], }); @@ -273,7 +364,7 @@ describe('EvalFileSchema input shorthand', () => { it('rejects lifecycle commands under authored policy blocks', () => { const result = EvalFileSchema.safeParse({ policy: { - setup: [{ script: 'bun install' }], + setup: [{ command: 'bun install' }], scripts: ['bun test'], }, tests: [baseTest], @@ -316,7 +407,26 @@ describe('EvalFileSchema input shorthand', () => { criteria: 'Goal', run: { target: 'other-agent', - setup: [{ script: 'bun install' }], + setup: [{ command: 'bun install' }], + }, + }, + ], + }); + + expect(result.success).toBe(false); + }); + + it('rejects removed workspace hook script alias', () => { + const result = EvalFileSchema.safeParse({ + tests: [ + { + ...baseTest, + workspace: { + hooks: { + before_all: { + script: ['bun', 'run', 'setup.ts'], + }, + }, }, }, ], diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index b15ee15f6..9b294058b 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -129,7 +129,7 @@ tests: filePath, `default_test: threshold: 1.2 - assertions: [] + unsupported: true tests: - id: test-1 criteria: Goal @@ -147,7 +147,108 @@ tests: ).toBe(true); expect( result.errors.some( - (error) => error.severity === 'error' && error.location === 'default_test.assertions', + (error) => error.severity === 'error' && error.location === 'default_test.unsupported', + ), + ).toBe(true); + }); + + it('validates promptfoo-shaped assert, default_test, and evaluate_options fields', async () => { + const filePath = 
path.join(tempDir, 'promptfoo-shaped.yaml'); + await writeFile( + filePath, + `description: Promptfoo-compatible shape +tags: + suite: smoke +prompts: + - raw: "Review {{ vars.diff }}" +targets: + - id: local-agent + provider: codex +default_test: + vars: + tone: concise + assert: + - Mentions the main risk + options: + disable_default_asserts: true + threshold: 0.7 +evaluate_options: + cache: true + delay: 100 + generate_suggestions: false + repeat: 2 + timeout_ms: 30000 + max_eval_time_ms: 120000 + filter_range: [0, 10] +tests: + - description: fixed output row + vars: + diff: change + provider_output: "Looks safe." + assert: + - type: contains + value: safe + metric: safety_text + - type: g-eval + value: + - Identifies user impact + - Avoids unsupported claims +scenarios: + - description: severity variants + config: + - vars: + severity: high + tests: + - vars: + diff: critical fix + assert: + - type: llm-rubric + value: Flags the risk clearly +derived_metrics: + - name: weighted_quality + value: safety_text * 0.5 +output_path: results.json +env: + EVAL_MODE: local +nunjucks_filters: + slug: ./filters/slug.ts +extensions: + - agentv:agent-rules +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); + }); + + it('warns rather than accepting top-level providers as a live alias for targets', async () => { + const filePath = path.join(tempDir, 'top-level-providers.yaml'); + await writeFile( + filePath, + `prompts: + - raw: Hello {{ vars.name }} +providers: + - openai:gpt-5.4-mini +tests: + - vars: + name: Ada + assert: + - type: contains + value: Hello +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(true); + expect( + result.errors.some( + (error) => + error.severity === 'warning' && + error.location === 'providers' && + error.message.includes("Unknown field 'providers'"), ), ).toBe(true); }); @@ -1161,7 +1262,7 @@ tests: 
expect(warnings).toHaveLength(0); }); - it('validates required field accepts number between 0 and 1', async () => { + it('warns when required field is numeric', async () => { const filePath = path.join(tempDir, 'assert-required-number.yaml'); await writeFile( filePath, @@ -1177,9 +1278,8 @@ tests: const result = await validateEvalFile(filePath); - expect(result.valid).toBe(true); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings).toHaveLength(0); + expect(warnings.some((e) => e.message.includes("Numeric 'required: 0.8'"))).toBe(true); }); it('warns on invalid required field type', async () => { @@ -1202,7 +1302,7 @@ tests: expect(warnings.some((e) => e.message.includes('required'))).toBe(true); }); - it('warns on required number out of range (0)', async () => { + it('warns on removed required number 0', async () => { const filePath = path.join(tempDir, 'assert-required-zero.yaml'); await writeFile( filePath, @@ -1219,10 +1319,10 @@ tests: const result = await validateEvalFile(filePath); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes('required'))).toBe(true); + expect(warnings.some((e) => e.message.includes("Numeric 'required: 0'"))).toBe(true); }); - it('warns on required number out of range (> 1)', async () => { + it('warns on removed required number greater than 1', async () => { const filePath = path.join(tempDir, 'assert-required-over-one.yaml'); await writeFile( filePath, @@ -1239,7 +1339,7 @@ tests: const result = await validateEvalFile(filePath); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes('required'))).toBe(true); + expect(warnings.some((e) => e.message.includes("Numeric 'required: 1.5'"))).toBe(true); }); it('warns when assertions is not an array', async () => { @@ -1629,6 +1729,49 @@ tests: ).toBe(true); }); + it('errors when removed repo acquisition fields are set', 
async () => { + const filePath = path.join(tempDir, 'workspace-removed-acquisition-fields-error.yaml'); + await writeFile( + filePath, + `workspace: + repos: + - path: ./repo + repo: https://github.com/org/repo.git + type: git + resolve: custom + resolver: custom +tests: + - id: test-1 + criteria: Goal + input: "Query" +`, + ); + + const result = await validateEvalFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (e) => + e.severity === 'error' && e.message.includes('workspace.repos[].type has been removed'), + ), + ).toBe(true); + expect( + result.errors.some( + (e) => + e.severity === 'error' && + e.message.includes('workspace.repos[].resolve has been removed'), + ), + ).toBe(true); + expect( + result.errors.some( + (e) => + e.severity === 'error' && + e.message.includes('workspace.repos[].resolver has been removed'), + ), + ).toBe(true); + }); + it('errors when non-Docker repo omits repo identity', async () => { const filePath = path.join(tempDir, 'workspace-missing-repo-error.yaml'); await writeFile( @@ -1901,15 +2044,14 @@ tests: ).toBe(true); }); - it('errors on removed assertion field at test level', async () => { - const removedKey = ['ass', 'ert'].join(''); - const filePath = path.join(tempDir, 'removed-test-field.yaml'); + it('accepts canonical assert field at test level', async () => { + const filePath = path.join(tempDir, 'test-level-assert.yaml'); await writeFile( filePath, `tests: - id: test-1 input: "Hello" - ${removedKey}: + assert: - type: contains value: "hello" `, @@ -1917,22 +2059,15 @@ tests: const result = await validateEvalFile(filePath); - expect(result.valid).toBe(false); - const errors = result.errors.filter((e) => e.severity === 'error'); - expect( - errors.some( - (e) => - e.message.includes("'assert' has been removed") && e.message.includes("'assertions'"), - ), - ).toBe(true); + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); }); - it('errors on removed assertion field at 
top level', async () => { - const removedKey = ['ass', 'ert'].join(''); - const filePath = path.join(tempDir, 'removed-top-field.yaml'); + it('accepts canonical assert field at top level', async () => { + const filePath = path.join(tempDir, 'top-level-assert.yaml'); await writeFile( filePath, - `${removedKey}: + `assert: - type: contains value: "hello" tests: @@ -1943,14 +2078,8 @@ tests: const result = await validateEvalFile(filePath); - expect(result.valid).toBe(false); - const errors = result.errors.filter((e) => e.severity === 'error'); - expect( - errors.some( - (e) => - e.message.includes("'assert' has been removed") && e.message.includes("'assertions'"), - ), - ).toBe(true); + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); }); }); }); diff --git a/packages/core/test/evaluation/validation/targets-validator.test.ts b/packages/core/test/evaluation/validation/targets-validator.test.ts index 8f87c9431..1342292b6 100644 --- a/packages/core/test/evaluation/validation/targets-validator.test.ts +++ b/packages/core/test/evaluation/validation/targets-validator.test.ts @@ -40,6 +40,62 @@ describe('validateTargetsFile', () => { ).toBe(false); }); + it('warns on removed built-in provider aliases', async () => { + const filePath = path.join(tempDir, 'removed-provider-aliases.yaml'); + await writeFile( + filePath, + `targets: + - name: azure-alias + provider: azure-openai + - name: google-alias + provider: google + - name: google-gemini-alias + provider: google-gemini + - name: codex-cli-alias + provider: codex-cli + - name: copilot-alias + provider: copilot + - name: copilot-sdk-alias + provider: copilot_sdk + - name: pi-alias + provider: pi + - name: claude-code-alias + provider: claude-code + - name: cc-mirror-alias + provider: cc-mirror + - name: bedrock-future + provider: bedrock + - name: vertex-future + provider: vertex +`, + ); + + const result = await validateTargetsFile(filePath); + + for (const provider of [ + 'azure-openai', + 
'google', + 'google-gemini', + 'codex-cli', + 'copilot', + 'copilot_sdk', + 'pi', + 'claude-code', + 'cc-mirror', + 'bedrock', + 'vertex', + ]) { + expect( + result.errors.some( + (error) => + error.severity === 'warning' && + error.location.endsWith('.provider') && + error.message.includes(`Unknown provider '${provider}'`), + ), + ).toBe(true); + } + }); + it('rejects camelCase target aliases', async () => { const filePath = path.join(tempDir, 'camel-case-aliases.yaml'); await writeFile( @@ -262,7 +318,70 @@ describe('validateTargetsFile', () => { } }); - it('rejects azure api_format with a migration error', async () => { + it('rejects removed judge_target alias', async () => { + const filePath = path.join(tempDir, 'judge-target-alias.yaml'); + await writeFile( + filePath, + `targets: + - name: codex-agent + provider: codex + model: gpt-5 + judge_target: grader + - name: grader + provider: openai + model: gpt-5-mini +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[0].judge_target' && + error.message.includes("'judge_target' field has been removed"), + ), + ).toBe(true); + }); + + it('rejects removed log_format target aliases', async () => { + const filePath = path.join(tempDir, 'log-format-aliases.yaml'); + await writeFile( + filePath, + `targets: + - name: copilot-agent + provider: copilot-cli + log_format: json + - name: claude-agent + provider: claude + log_output_format: summary +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[0].log_format' && + error.message.includes("Use 'stream_log: raw'"), + ), + ).toBe(true); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[1].log_output_format' && 
+ error.message.includes("Use 'stream_log: raw'"), + ), + ).toBe(true); + }); + + it('rejects azure api_format with a removed-field error', async () => { const filePath = path.join(tempDir, 'azure-api-format.yaml'); await writeFile( filePath, @@ -283,7 +402,7 @@ describe('validateTargetsFile', () => { (error) => error.severity === 'error' && error.location === 'targets[0].api_format' && - /'api_format' field is no longer supported/i.test(error.message), + /'api_format' field has been removed/i.test(error.message), ), ).toBe(true); }); diff --git a/packages/core/test/evaluation/validation/workspace-path-validator.test.ts b/packages/core/test/evaluation/validation/workspace-path-validator.test.ts index 7735b528e..e164c5987 100644 --- a/packages/core/test/evaluation/validation/workspace-path-validator.test.ts +++ b/packages/core/test/evaluation/validation/workspace-path-validator.test.ts @@ -214,7 +214,7 @@ describe('validateWorkspacePaths', () => { expect(errors.some((e) => e.message.includes('missing-after-all.mjs'))).toBe(true); }); - it('supports deprecated script alias', async () => { + it('reports removed script alias', async () => { const evalFilePath = path.join(tempDir, 'eval-script-alias.yaml'); await writeFile( evalFilePath, @@ -229,6 +229,7 @@ describe('validateWorkspacePaths', () => { const errors = await validateWorkspacePaths(evalFilePath); expect(errors).toHaveLength(1); - expect(errors[0]?.message).toContain('missing-via-alias.mjs'); + expect(errors[0]?.location).toBe('workspace.hooks.before_all.script'); + expect(errors[0]?.message).toContain("field 'script' has been removed"); }); }); diff --git a/packages/core/test/evaluation/workspace-config-parsing.test.ts b/packages/core/test/evaluation/workspace-config-parsing.test.ts index 72ba310e2..4429c6992 100644 --- a/packages/core/test/evaluation/workspace-config-parsing.test.ts +++ b/packages/core/test/evaluation/workspace-config-parsing.test.ts @@ -306,7 +306,6 @@ workspace: repos: - path: ./repo-a 
repo: https://github.com/org/repo.git - resolver: custom commit: main ancestor: 1 sparse: @@ -323,12 +322,34 @@ tests: expect(workspace?.repos).toHaveLength(1); expect(workspace?.repos?.[0].path).toBe('./repo-a'); expect(workspace?.repos?.[0].repo).toBe('https://github.com/org/repo.git'); - expect(workspace?.repos?.[0].resolver).toBe('custom'); expect(workspace?.repos?.[0].commit).toBe('main'); expect(workspace?.repos?.[0].ancestor).toBe(1); expect(workspace?.repos?.[0].sparse).toEqual(['src/**']); }); + it('rejects removed workspace repo resolver field', async () => { + const evalFile = path.join(testDir, 'workspace-repos-resolver.yaml'); + await writeFile( + evalFile, + ` +description: test +workspace: + repos: + - path: ./repo-a + repo: https://github.com/org/repo.git + resolver: custom +tests: + - id: test-1 + input: "hello" + criteria: "world" +`, + ); + + await expect(loadTests(evalFile, testDir)).rejects.toThrow( + 'workspace.repos[].resolver has been removed', + ); + }); + it('parses workspace hooks after_each reset config', async () => { const evalFile = path.join(testDir, 'workspace-reset.yaml'); await writeFile( diff --git a/packages/core/test/evaluation/workspace/repo-manager.test.ts b/packages/core/test/evaluation/workspace/repo-manager.test.ts index f2facc09f..214a3fead 100644 --- a/packages/core/test/evaluation/workspace/repo-manager.test.ts +++ b/packages/core/test/evaluation/workspace/repo-manager.test.ts @@ -398,41 +398,6 @@ describe('RepoManager', () => { ); }, 30_000); - it('uses an explicit workspace repo resolver even when the resolver has no repos pattern', async () => { - const sourceRepo = path.join(tmpDir, 'explicit-source'); - createTestRepo(sourceRepo, { 'explicit.txt': 'selected explicitly' }); - const scriptPath = path.join(tmpDir, 'scripts', 'resolver.ts'); - writeResolverScript(scriptPath); - const projectDir = path.join(tmpDir, 'project-explicit'); - const evalDir = path.join(projectDir, 'evals'); - mkdirSync(path.join(projectDir, 
'.git'), { recursive: true }); - mkdirSync(evalDir, { recursive: true }); - writeRepoResolversConfig(path.join(projectDir, '.agentv', 'config.yaml'), [ - { - name: 'inline_only', - command: ['bun', scriptPath], - config: { source_path: sourceRepo }, - }, - ]); - - const projectManager = new RepoManager(false, { - progress: false, - projectConfigDir: evalDir, - }); - await projectManager.materialize( - { - path: './explicit', - repo: 'https://github.com/other/repo.git', - resolver: 'inline_only', - }, - workspaceDir, - ); - - expect(readFileSync(path.join(workspaceDir, 'explicit', 'explicit.txt'), 'utf-8')).toBe( - 'selected explicitly', - ); - }, 30_000); - it('sends the stable stdin protocol and clones from resolver stdout git source', async () => { const sourceRepo = path.join(tmpDir, 'protocol-source'); const firstCommit = createTestRepo(sourceRepo, { 'src/main.ts': 'first' }); @@ -567,62 +532,6 @@ describe('RepoManager', () => { ); }, 30_000); - it('fails when an explicitly selected resolver returns handled:false', async () => { - const scriptPath = path.join(tmpDir, 'scripts', 'resolver.ts'); - writeResolverScript(scriptPath); - const projectDir = path.join(tmpDir, 'project-explicit-false'); - const evalDir = path.join(projectDir, 'evals'); - mkdirSync(path.join(projectDir, '.git'), { recursive: true }); - mkdirSync(evalDir, { recursive: true }); - writeRepoResolversConfig(path.join(projectDir, '.agentv', 'config.yaml'), [ - { - name: 'explicit_false', - command: ['bun', scriptPath], - config: { handled: false }, - }, - ]); - - const projectManager = new RepoManager(false, { - progress: false, - projectConfigDir: evalDir, - }); - await expect( - projectManager.materialize( - { - path: './explicit-false', - repo: 'https://github.com/example/explicit-false.git', - resolver: 'explicit_false', - }, - workspaceDir, - ), - ).rejects.toThrow( - "Repo resolver 'explicit_false' was selected by workspace.repos[].resolver but returned handled:false.", - ); - }, 30_000); 
- - it('fails clearly when inline resolver names are unknown', async () => { - const projectDir = path.join(tmpDir, 'project-missing-resolver'); - const evalDir = path.join(projectDir, 'evals'); - mkdirSync(path.join(projectDir, '.git'), { recursive: true }); - mkdirSync(evalDir, { recursive: true }); - writeRepoResolversConfig(path.join(projectDir, '.agentv', 'config.yaml'), []); - - const projectManager = new RepoManager(false, { - progress: false, - projectConfigDir: evalDir, - }); - await expect( - projectManager.materialize( - { - path: './missing', - repo: 'https://github.com/example/missing.git', - resolver: 'missing', - }, - workspaceDir, - ), - ).rejects.toThrow("workspace.repos[].resolver 'missing' is not configured."); - }, 30_000); - it('rejects duplicate resolver names and repos on the default resolver', async () => { const scriptPath = path.join(tmpDir, 'scripts', 'resolver.ts'); writeResolverScript(scriptPath); diff --git a/packages/core/test/paths.test.ts b/packages/core/test/paths.test.ts index e3067aa68..fa2eb59e5 100644 --- a/packages/core/test/paths.test.ts +++ b/packages/core/test/paths.test.ts @@ -5,7 +5,6 @@ import path from 'node:path'; import { getAgentvConfigDir, getAgentvDataDir, - getAgentvHome, getSubagentsRoot, getTraceStateRoot, getWorkspacePoolRoot, @@ -36,7 +35,6 @@ describe('paths', () => { it('returns ~/.agentv when AGENTV_HOME is not set', () => { expect(getAgentvConfigDir()).toBe(path.join(os.homedir(), '.agentv')); - expect(getAgentvHome()).toBe(path.join(os.homedir(), '.agentv')); }); it('treats the string "undefined" as unset', () => { @@ -49,7 +47,6 @@ describe('paths', () => { it('uses AGENTV_HOME as the lightweight config/home directory', () => { process.env.AGENTV_HOME = '/custom/agentv-home'; expect(getAgentvConfigDir()).toBe('/custom/agentv-home'); - expect(getAgentvHome()).toBe('/custom/agentv-home'); }); it('defaults heavy data to the config/home directory', () => { diff --git a/packages/sdk/src/eval.ts 
b/packages/sdk/src/eval.ts index 0a04d49cc..970a5eab4 100644 --- a/packages/sdk/src/eval.ts +++ b/packages/sdk/src/eval.ts @@ -33,7 +33,6 @@ const KNOWN_SNAKE_CASE_KEYS = { onDependencyFailure: 'on_dependency_failure', onTurnFailure: 'on_turn_failure', outputPath: 'output_path', - requiredMinScore: 'required_min_score', reasoningEffort: 'reasoning_effort', scoreRange: 'score_range', scoreRanges: 'score_ranges', @@ -85,7 +84,6 @@ export interface EvalPreprocessor { export interface EvalWorkspaceHook { readonly command?: string | readonly string[]; - readonly script?: string | readonly string[]; readonly timeoutMs?: number; readonly cwd?: string; readonly reset?: 'none' | 'fast' | 'strict'; diff --git a/packages/sdk/src/graders.ts b/packages/sdk/src/graders.ts index 5939f5265..c5fc27083 100644 --- a/packages/sdk/src/graders.ts +++ b/packages/sdk/src/graders.ts @@ -5,7 +5,7 @@ export type GraderCommand = string | readonly string[]; export interface GraderHelperOptions { readonly name?: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; readonly minScore?: number; readonly negate?: boolean; } @@ -13,7 +13,7 @@ export interface GraderHelperOptions { export interface GraderCommonConfig { readonly name?: string; readonly weight?: number; - readonly required?: boolean | number; + readonly required?: boolean; readonly minScore?: number; readonly negate?: boolean; } @@ -57,7 +57,6 @@ export interface GraderRubric { readonly weight?: number; readonly required?: boolean; readonly minScore?: number; - readonly requiredMinScore?: number; readonly scoreRanges?: readonly GraderScoreRange[]; } diff --git a/packages/sdk/test/eval-authoring.test.ts b/packages/sdk/test/eval-authoring.test.ts index 96e463308..568cc5ad2 100644 --- a/packages/sdk/test/eval-authoring.test.ts +++ b/packages/sdk/test/eval-authoring.test.ts @@ -44,14 +44,14 @@ describe('YAML-aligned eval authoring helpers', () => { workspace: { hooks: { beforeEach: { - 
script: 'git reset --hard', + command: 'git reset --hard', timeoutMs: 5_000, }, afterEach: { command: ['git', 'status'], }, afterAll: { - script: ['echo', 'done'], + command: ['echo', 'done'], }, }, }, @@ -130,14 +130,14 @@ describe('YAML-aligned eval authoring helpers', () => { workspace: { hooks: { before_each: { - script: 'git reset --hard', + command: 'git reset --hard', timeout_ms: 5_000, }, after_each: { command: ['git', 'status'], }, after_all: { - script: ['echo', 'done'], + command: ['echo', 'done'], }, }, }, diff --git a/packages/sdk/test/grader-helpers.test.ts b/packages/sdk/test/grader-helpers.test.ts index dad99d786..791a1351c 100644 --- a/packages/sdk/test/grader-helpers.test.ts +++ b/packages/sdk/test/grader-helpers.test.ts @@ -108,7 +108,7 @@ describe('grader helper config builders', () => { { id: 'useful', outcome: 'The answer is useful.', - requiredMinScore: 8, + minScore: 0.8, }, ], }), @@ -157,7 +157,7 @@ describe('grader helper config builders', () => { { id: 'useful', outcome: 'The answer is useful.', - required_min_score: 8, + min_score: 0.8, }, ], }, @@ -177,11 +177,12 @@ describe('grader helper config builders', () => { expect(yaml).toContain('type: code-grader'); expect(yaml).toContain('max_steps: 2'); expect(yaml).toContain('max_calls: 2'); - expect(yaml).toContain('required_min_score: 8'); + expect(yaml).toContain('min_score: 0.8'); expect(yaml).toContain('score_range:'); expect(yaml).not.toContain('maxSteps'); expect(yaml).not.toContain('maxCalls'); expect(yaml).not.toContain('requiredMinScore'); + expect(yaml).not.toContain('required_min_score'); expect(yaml).not.toContain('scoreRange'); }); }); diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md index fcdf0217f..d27fdc5db 100644 --- a/skills-data/agentv-eval-writer/SKILL.md +++ b/skills-data/agentv-eval-writer/SKILL.md @@ -327,7 +327,8 @@ assertions: value: "DENIED" required: true # must score >= 0.8 (default) - type: rubrics - required: 0.6 
# must score >= 0.6 (custom threshold) + required: true + min_score: 0.6 # must score >= 0.6 (custom threshold) criteria: - id: accuracy outcome: Identifies the denied party diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index 76d1340cb..d044864f7 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -27,10 +27,20 @@ "type": "string" }, "tags": { - "type": "array", - "items": { - "type": "string" - } + "anyOf": [ + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "object", + "additionalProperties": { + "type": ["string", "number", "boolean"] + } + } + ] }, "license": { "type": "string" @@ -143,6 +153,132 @@ } ] }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": 
{ + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, "input_files": { "type": "array", "items": { @@ -746,6 +882,9 @@ "type": "string", "minLength": 1 }, + "description": { + "type": "string" + }, "vars": { "type": "object", "properties": {}, @@ -754,264 +893,201 @@ "criteria": { "type": "string" }, - "input": { + "provider": { "anyOf": [ { - "type": "string" + "type": "string", + "minLength": 1 }, { "type": "object", "properties": { - "role": { + "id": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "minLength": 1 }, - "content": { + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { "anyOf": [ { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } }, - "value": { - "type": "string" - } + "required": ["command"], + "additionalProperties": false }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] - } - }, - "required": ["role", "content"], - 
"additionalProperties": false - }, - { - "type": "object", - "properties": { - "role": { - "not": {} - } - }, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] - }, - "content": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { + { "type": "object", "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] + "id": { + "type": "string" }, - "value": { + "label": { + "type": "string" + }, + "raw": { "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["type", "value"], - "additionalProperties": false + "additionalProperties": true } - } - ] - } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "input_files": { - "type": "array", - "items": { - "type": "string" - } - }, - "expected_output": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] - }, - "content": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + 
"config": { + "type": "object", + "additionalProperties": {} + } }, - "value": { - "type": "string" - } + "required": ["command"], + "additionalProperties": false }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] - } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - 
"additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { "type": "object", "properties": { - "type": { - "type": "string", - "minLength": 1 - }, "command": { "anyOf": [ { @@ -1024,186 +1100,90 @@ } } ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } }, - "required": ["type", "command"], "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } + } + ] }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - 
"properties": { - "id": { - "type": "string" + "timeout_ms": { + "type": "number" }, - "outcome": { + "timeoutMs": { + "type": "number" + }, + "cwd": { "type": "string" }, - "operator": { + "reset": { "type": "string", - "enum": ["correctness", "contradiction"] + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] }, - "weight": { + "timeout_ms": { "type": "number" }, - "required": { - "type": "boolean" + "timeoutMs": { + "type": "number" }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "cwd": { + "type": "string" }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { + }, + "after_all": { "type": "object", "properties": { - "type": { - "type": "string", - "minLength": 1 - }, "command": { "anyOf": [ { @@ -1216,1463 +1196,1108 @@ } } ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + 
"type": "string", + "enum": ["none", "fast", "strict"] } }, - "required": ["type", "command"], "additionalProperties": false } - } - }, - "required": ["type"], - "additionalProperties": false + }, + "additionalProperties": false + } }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } + "additionalProperties": true + } + ] + }, + "providers": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": 
"string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { "anyOf": [ { - "type": "string", - "const": "any" + "type": "string" }, { "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "properties": { + "command": { + "anyOf": [ + { + 
"type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false }, { - "type": "array", - "items": { - "type": "string" - } + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true } ] }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { + { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" - } - } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + 
}, + "transform": { + "anyOf": [ + { "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { "type": "object", "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false }, - "tolerance": { - "type": "number", - "minimum": 0 + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, 
+ "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false }, - "relative": { - "type": "boolean" + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false }, - "formats": { - "type": "array", - "items": { - "type": "string" - } + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false } }, - "required": ["path", "match"], "additionalProperties": false - }, - "minItems": 1 + } }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - 
"additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "minLength": 1 }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "label": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "extends": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": 
"boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "name": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, 
- "required": { - "anyOf": [ - { - "type": "boolean" + "provider": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "model": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "config": { + "type": "object", + "additionalProperties": {} }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": 
"number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { - "type": "string", - "minLength": 1 + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["score_range", "outcome"], + "required": ["command"], "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] }, - { + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" + "minimum": 0 }, - { - "type": "array", - "items": { + "env": { + "type": "object", + "additionalProperties": { "type": "string" } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" + "reasoning_effort": { + "type": "string", + "minLength": 1 }, - { + "hooks": { "type": "object", "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": 
"number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - { - "type": "array", - "items": { + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + }, + "additionalProperties": false }, - "script": { - "anyOf": [ - { + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } + "reset": { + "type": "string", + "enum": ["none", "fast", 
"strict"] } - ] + }, + "additionalProperties": false }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { + "after_all": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { "type": "string", - "minLength": 1 + "enum": ["none", "fast", "strict"] } }, - "required": ["score_range", "outcome"], "additionalProperties": false } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } } - }, - "additionalProperties": false + ] + }, + "config": { + "type": "object", + "additionalProperties": {} } }, - "model": { - "type": "string" + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": 
"string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } }, - "target": { + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" }, - "config": { + { "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + { + "type": "object", + "properties": { + "id": { + "type": "string" }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true } - }, - "required": ["include"], - "additionalProperties": false + ] }, - { + "minItems": 1 + } + ] + }, + "provider_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { "type": "object", 
"properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { + "role": { "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} + "enum": ["system", "user", "assistant", "tool"] }, - "aggregator": { + "content": { "anyOf": [ { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false + "type": "string" }, { "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false + "properties": {}, + "additionalProperties": {} }, { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", 
"user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { "type": "object", "properties": { "type": { "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" + "enum": ["text", "file", "image"] }, - "model": { + "value": { "type": "string" } }, - "required": ["type"], + "required": ["type", "value"], "additionalProperties": false } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false + } + ] + } }, - { + "required": ["role", "content"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "role": { + "not": {} + } + }, + "additionalProperties": {} + }, + { + "type": "array", + "items": { "type": "object", "properties": { - "name": { - "type": "string" + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "content": { "anyOf": [ { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - 
"minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "type": "string" }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "object", + "properties": {}, + "additionalProperties": {} }, { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false } } ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] }, - "argsMatch": { + "content": { "anyOf": [ { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} }, { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false } } ] } }, - "required": ["type", "mode"], + "required": ["role", "content"], "additionalProperties": false + } + } + ] + 
}, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assert_scoring_function": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "options": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ { "type": "object", "properties": { "name": { "type": "string" }, + "metric": { + "type": "string" + }, "weight": { "type": "number", "minimum": 0 }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -2685,139 +2310,73 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": ["code-grader", "code_grader"] }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": 
"object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "command": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "cwd": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "target": { "anyOf": [ { "type": "boolean" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" + "config": { + "type": "object", + "additionalProperties": {} }, - "budget": { - "type": "number", - "minimum": 0 + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } } }, - "required": ["type", "budget"], + "required": ["type", "command"], "additionalProperties": false }, { @@ -2826,22 +2385,15 @@ "name": { "type": "string" }, + "metric": { + "type": "string" + }, "weight": { "type": "number", "minimum": 0 }, "required": { 
- "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -2854,88 +2406,175 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "enum": ["llm-grader", "llm_grader"] }, - "required": { + "prompt": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + 
}, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } }, - "max_tokens": { - "type": "number", - "minimum": 0 + "model": { + "type": "string" }, - "max_cost_usd": { - "type": "number", - "minimum": 0 + "target": { + "type": "string" }, - "max_duration_ms": { - "type": "number", - "minimum": 0 + "config": { + "type": "object", + "additionalProperties": {} }, - "target_exploration_ratio": { + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { "type": "number", "minimum": 0, - "maximum": 1 + "maximum": 2 }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } } }, "required": ["type"], @@ -2947,47 +2586,7 @@ "name": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - 
"min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "metric": { "type": "string" }, "weight": { @@ -2995,17 +2594,7 @@ "minimum": 0 }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -3018,120 +2607,293 @@ }, "type": { "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } + "enum": [ + "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" ] }, - "min_score": { + "value": {}, + "threshold": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "criteria": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, 
- "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + } + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + 
"type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, { "type": "object", "properties": { "name": { "type": "string" }, + "metric": { + "type": "string" + }, "weight": { "type": "number", "minimum": 0 }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": 
"number", @@ -3144,2904 +2906,675 @@ }, "type": { "type": "string", - "const": "rubrics" + "const": "composite" }, - "criteria": { + "assertions": { "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "execution": { - "type": "object", - "properties": { - "workers": { - "not": {} - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" }, 
- { + "threshold": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] + }, + "required": ["type", "threshold"], + "additionalProperties": false }, - "command": { - "anyOf": [ - { - "type": "string" + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { + "path": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false + "cwd": { + "type": "string" } - ] + }, + "required": ["type", "path"], + "additionalProperties": false }, - "config": { + { "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "properties": { + "type": { + "type": "string", + "const": "llm-grader" }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "prompt": { + "type": "string" }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "model": { + "type": "string" } - ] - }, - 
"min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" + "const": "any" }, - "required": { - "type": "boolean" + { + "type": "object", + 
"additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "score_ranges": { + { "type": "array", "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false + "type": "string" } } - }, - "additionalProperties": false + ] } }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "preprocessors": { + { "type": "array", "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false + "type": "string" } } - }, - "required": ["type"], - "additionalProperties": false + ] }, - { - "type": "object", - "properties": 
{ - "include": { + "argsMatch": { + "anyOf": [ + { "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} + "enum": ["exact", "ignore", "subset", "superset"] }, - "evaluators": { + { "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] + "items": { + "type": "string" + } } - }, - "required": ["type", "aggregator"], - "additionalProperties": false + ] + } + }, + "required": 
["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } } }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - 
"maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "required": ["path", "match"], + "additionalProperties": false }, - "required": ["type", "mode"], - "additionalProperties": false + "minItems": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": 
["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": 
"number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false + "required": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, 
- "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "negate": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false + "type": { + "type": "string", + "const": "latency" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - 
"const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + 
"required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - 
"type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - 
"additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - 
"type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false + "required": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", 
"subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false + "negate": { + "type": "boolean" }, - { - "type": "object", - 
"properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false + "max_total": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } 
- }, - "required": ["type"], - "additionalProperties": false + "max_input": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": 
"object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false + "required": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": 
{ - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "skip_defaults": { - "type": "boolean" - }, - "cache": { - "type": "boolean" - }, - "trials": { - "not": {} - }, - "budget_usd": { - "type": "number", - "minimum": 0 - }, - "budgetUsd": { - "type": "number", - "minimum": 0 - }, - "fail_on_error": { - "type": "boolean" - }, - "failOnError": { - "type": "boolean" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "workspace": { - "not": {} - } - }, - "additionalProperties": false - }, - "run": { - "type": "object", - 
"properties": { - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "repeat": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] - }, - "early_exit": { - "type": "boolean" - }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - } - }, - "required": ["count"], - "additionalProperties": false - }, - "timeout_seconds": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - } - }, - "additionalProperties": false - }, - "workspace": { - "type": "object", - "properties": { - "template": { - "type": "string" - }, - "isolation": { - "type": "string", - "enum": ["shared", "per_case"] - }, - "repos": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" + "negate": { + "type": "boolean" }, - "repo": { + "type": { "type": "string", - "minLength": 1 + "enum": ["execution-metrics", "execution_metrics"] }, - "commit": { - "type": "string", - "minLength": 1 + "max_tool_calls": { + "type": "number", + "minimum": 0 }, - "base_commit": { - "type": "string", - "minLength": 1 + "max_llm_calls": { + "type": "number", + "minimum": 0 }, - "ancestor": { - "type": "integer", + "max_tokens": { + "type": "number", "minimum": 0 }, - "sparse": { - "type": "array", - "items": { - "type": "string" - } + "max_cost_usd": { + "type": "number", + "minimum": 0 }, - "resolver": { - "type": "string", - "minLength": 1 + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 } }, + "required": ["type"], "additionalProperties": false - } - }, - "hooks": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - 
"before_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - "additionalProperties": false - }, - "before_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "metric": { + "type": "string" }, - "additionalProperties": false - }, - "after_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "weight": { + "type": "number", + "minimum": 0 }, - "additionalProperties": false - }, - "after_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - 
{ - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "required": { + "type": "boolean" }, - "additionalProperties": false - } - }, - "additionalProperties": false - }, - "docker": { - "type": "object", - "properties": { - "image": { - "type": "string" - }, - "timeout": { - "type": "integer", - "minimum": 1 - }, - "memory": { - "type": "string" - }, - "cpus": { - "type": "number", - "minimum": 0.1 - } - }, - "required": ["image"], - "additionalProperties": false - }, - "env": { - "type": "object", - "properties": { - "required_commands": { - "type": "array", - "items": { + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { "type": "string", - "minLength": 1 + "const": "contains" + }, + "value": { + "type": "string" } }, - "required_python_modules": { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - } - } + "required": ["type", "value"], + "additionalProperties": false }, - "additionalProperties": false - } - }, - "additionalProperties": false - }, - "metadata": { - "type": "object", - "additionalProperties": {} - }, - "conversation_id": { - "type": "string" - }, - "suite": { - "type": "string" - }, - "depends_on": { - "type": "array", - "items": { - "type": "string" - } - }, - "on_dependency_failure": { - "type": "string", - "enum": ["skip", "fail", "run"] - }, - "mode": { - "type": "string", - "enum": ["conversation"] - }, - "turns": { - "type": "array", - "items": { - "type": "object", - "properties": { - "input": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - 
"additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" } - ] + }, + "required": ["type", "value"], + "additionalProperties": false }, - "expected_output": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] } - ] + }, + "required": ["type"], + "additionalProperties": false }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string" - }, - { + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { "anyOf": [ + { + "type": "string", + "minLength": 1 + }, { "type": "object", "properties": { - "name": { + "id": { + "type": "string" + }, + "outcome": { "type": "string" }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, "weight": { - "type": "number", - "minimum": 0 + "type": "number" }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -6049,437 +3582,526 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { + 
"score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "string" + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "array", - "items": { - "type": "string" - } + "type": "integer", + "minimum": 0, + "maximum": 10 } ] + }, + "outcome": { + "type": "string", + "minLength": 1 } }, - "required": ["type", "command"], + "required": ["score_range", "outcome"], "additionalProperties": false } } }, - "required": ["type", "command"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "workers": { + "not": {} + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - 
"anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { "type": { "type": "string", - "enum": ["llm-grader", "llm_grader"] + "minLength": 1 }, - "prompt": { + "command": { "anyOf": [ { "type": "string" }, { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false + "type": "array", + "items": { + "type": "string" + } } ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + 
"type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" }, - "score_ranges": { + { "type": "array", "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false + "type": "string" } } - }, - "additionalProperties": false + ] + }, + "config": { + "type": "object", + "additionalProperties": {} } }, - "model": { + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { "type": "string" }, - "target": { + "outcome": { "type": "string" }, - "config": { - "type": "object", - "additionalProperties": {} + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" }, - "temperature": { + "min_score": { "type": "number", + "exclusiveMinimum": true, "minimum": 0, - "maximum": 2 + "maximum": 1 }, - "preprocessors": { + "score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "type": { - "type": 
"string", - "minLength": 1 - }, - "command": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "string" + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "array", - "items": { - "type": "string" - } + "type": "integer", + "minimum": 0, + "maximum": 10 } ] + }, + "outcome": { + "type": "string", + "minLength": 1 } }, - "required": ["type", "command"], + "required": ["score_range", "outcome"], "additionalProperties": false } } }, - "required": ["type"], "additionalProperties": false - }, - { + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { "type": "object", "properties": { - "include": { + "type": { "type": "string", "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 }, - "required": { + "command": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } 
- }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" + ] + }, + "value": {}, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "criteria": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 }, { "type": "object", "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { + "id": { "type": "string" }, - "cwd": { + "outcome": { "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { + }, + "operator": { "type": "string", - "const": "llm-grader" + "enum": ["correctness", "contradiction"] }, - "prompt": { - "type": "string" + "weight": { + "type": "number" }, - "model": { - "type": "string" + "required": { + "type": "boolean" + }, + "min_score": { + "type": 
"number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } } }, - "required": ["type"], "additionalProperties": false } ] } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { + } + ] + }, + "rubrics": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "id": { "type": "string" }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, "weight": { - "type": "number", - "minimum": 0 + "type": "number" }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -6487,2508 +4109,1724 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { + "score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 
- }, - "args_match": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "array", - "items": { - "type": "string" - } + "type": "integer", + "minimum": 0, + "maximum": 10 } ] }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + "outcome": { + "type": "string", + "minLength": 1 } }, - "required": ["tool"], + "required": ["score_range", "outcome"], "additionalProperties": false } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] } }, - "required": ["type", "mode"], "additionalProperties": false - }, - { + } + }, + "score_ranges": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "boolean" + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "number", - "exclusiveMinimum": true, + "type": "integer", "minimum": 0, - "maximum": 1 + "maximum": 10 } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { + "outcome": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": 
{ - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "minLength": 1 } }, - "required": ["type", "fields"], + "required": ["score_range", "outcome"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + } + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": {}, + "additionalProperties": {} } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 
- } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": {}, + "additionalProperties": {} } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": 
["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 + } }, - "max_input": { - "type": "number", - "minimum": 0 + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } }, - "max_output": { - "type": "number", - "minimum": 0 - } + "required": ["type", "threshold"], + "additionalProperties": false }, - "required": ["type"], - "additionalProperties": false - }, - { + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "tool": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - 
}, - "required": { + "args": { "anyOf": [ { - "type": "boolean" + "type": "string", + "const": "any" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "object", + "additionalProperties": {} } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, "max_duration_ms": { "type": "number", "minimum": 0 }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { + "maxDurationMs": { "type": "number", "minimum": 0 }, - "required": { + "args_match": { "anyOf": [ { - "type": "boolean" + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "argsMatch": { "anyOf": [ { - "type": "boolean" + "type": "string", + "enum": ["exact", "ignore", "subset", 
"superset"] }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" } }, - "required": ["type", "value"], + "required": ["tool"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "required": ["type"], - "additionalProperties": false - }, - { + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + 
"path": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { "type": "boolean" }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "number" }, - "min_score": { + "tolerance": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "minimum": 0 }, - "negate": { + "relative": { "type": "boolean" }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { + "formats": { "type": "array", "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - 
"minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 + "type": "string" + } } }, - "required": ["type", "criteria"], + "required": ["path", "match"], "additionalProperties": false - } - ] - } - ] - } - } - }, - "required": ["input"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["mean", "min", "max"] - }, - "on_turn_failure": { - "type": "string", - "enum": ["continue", "stop"] - }, - "window_size": { - "type": "integer", - "minimum": 1 - } - }, - "required": ["id"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - }, - "type": { - "type": "string", - "enum": ["suite", "tests"] - }, - "select": { - "anyOf": [ - { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 - } - ] - }, - { - "type": "object", - "properties": { - "test_ids": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 + }, + "minItems": 1 }, - "minItems": 1 - } - ] - }, - "tags": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { + "aggregation": { "type": "string", - "minLength": 1 - }, - "minItems": 1 - } - ] - }, - "metadata": { - "type": "object", - "additionalProperties": { - "anyOf": [ - { + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "number" + "metric": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - { - "type": "array", - "items": { - "type": ["string", "number", 
"boolean"] - }, - "minItems": 1 + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 } - ] - } - } - }, - "additionalProperties": false - } - ] - }, - "run": { - "type": "object", - "properties": { - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "repeat": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] - }, - "early_exit": { - "type": "boolean" - }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - } - }, - "required": ["count"], - "additionalProperties": false - }, - "timeout_seconds": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - } - }, - "additionalProperties": false - } - }, - "required": ["include", "type"], - "additionalProperties": false - }, - { - "type": "string", - "minLength": 1 - } - ] - } - }, - { - "type": "string", - "minLength": 1 - } - ] - }, - "eval_cases": { - "anyOf": [ - { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "id": { - "type": "string", - "minLength": 1 - }, - "vars": { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - "criteria": { - "type": "string" - }, - "input": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] - }, - "content": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", 
"file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] - } - }, - "required": ["role", "content"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "role": { - "not": {} - } - }, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "required": ["type", "threshold"], + "additionalProperties": false }, - "content": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} + "metric": { + "type": "string" }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 } - ] - } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "input_files": { - "type": "array", - "items": { - "type": "string" - } - }, - "expected_output": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string", - "enum": ["system", "user", "assistant", "tool"] + }, + "required": ["type", "budget"], + "additionalProperties": false }, - "content": { - "anyOf": [ - { + { + "type": "object", + "properties": { + 
"name": { "type": "string" }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} + "metric": { + "type": "string" }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] - } - }, - "required": ["role", "content"], - "additionalProperties": false - } - } - ] - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] + }, + "required": ["type"], + "additionalProperties": false }, - "command": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { + "metric": { "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - { - "type": "object", - "properties": { 
- "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "negate": { "type": "boolean" }, - { + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] + }, + "required": ["type"], + "additionalProperties": false }, - "prompt": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - 
} - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } + "metric": { + "type": "string" }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false }, - "config": { + { "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - 
"type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "properties": { + "name": { + "type": "string" }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" + }, + "required": ["type", "value"], + "additionalProperties": false }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - 
"required": ["type", "threshold"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "required": { "type": "boolean" }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "required": ["type"], + "additionalProperties": false }, - "minimums": { + { "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": 
"number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "properties": { + "name": { + "type": "string" }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "metric": { + "type": "string" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "required": { "type": "boolean" }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - 
"type": "string" - } - } }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "negate": { "type": "boolean" }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["type", "value"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { + "metric": { + "type": "string" + }, + "weight": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + "minimum": 0 + }, + 
"required": { "type": "boolean" }, - { + "min_score": { "type": "number", "exclusiveMinimum": true, "minimum": 0, "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { + }, + "negate": { "type": "boolean" }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { 
+ "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "not": {} + }, + "budget_usd": { + "type": "number", + "minimum": 0 + }, + "budgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "workspace": { + "not": {} + } + }, + "additionalProperties": false + }, + "run": { + "type": "object", + "properties": { + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "repeat": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 }, - "required": ["type"], - "additionalProperties": false + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } }, - { + "required": ["count"], + "additionalProperties": false + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + } + }, + "additionalProperties": false + }, + "workspace": { 
+ "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_case"] + }, + "repos": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "path": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "repo": { + "type": "string", + "minLength": 1 }, - "negate": { - "type": "boolean" + "commit": { + "type": "string", + "minLength": 1 }, - "type": { + "base_commit": { "type": "string", - "const": "contains" + "minLength": 1 }, - "value": { - "type": "string" + "ancestor": { + "type": "integer", + "minimum": 0 + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } } }, - "required": ["type", "value"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + 
"command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - "negate": { - "type": "boolean" + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - "type": { - "type": "string", - "const": "regex" + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - "value": { - "type": "string" + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + }, + "env": { + "type": "object", + "properties": { + "required_commands": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 } }, - "required": ["type", "value"], - "additionalProperties": false + "required_python_modules": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } }, - { - "type": "object", - 
"properties": { - "name": { + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "type": "object", + "properties": {}, + "additionalProperties": {} }, { - "type": "number", - 
"exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { + "type": "array", + "items": { "type": "object", "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { + "type": { "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "enum": ["text", "file", "image"] }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } + "value": { + "type": "string" } }, + "required": ["type", "value"], "additionalProperties": false } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 } ] + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - 
"maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { + { "anyOf": [ { "type": "string" }, { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" + "type": "object", + "properties": {}, + "additionalProperties": {} }, { "type": "array", "items": { - "type": "string" + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false } } ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false + "properties": {}, + "additionalProperties": {} } - } - }, - "required": ["type", "command"], - "additionalProperties": false + ] + } }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + } + }, + "required": ["input"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + 
}, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + }, + "type": { + "type": "string", + "enum": ["suite", "tests"] + }, + "select": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 }, - "required": { + "minItems": 1 + } + ] + }, + { + "type": "object", + "properties": { + "test_ids": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + ] + }, + "tags": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + ] + }, + "metadata": { + "type": "object", + "additionalProperties": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ + "type": "number" + }, { - "type": "string" + "type": "boolean" }, { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } + "type": "array", + "items": { + "type": ["string", "number", "boolean"] }, - "additionalProperties": false + "minItems": 1 } ] - }, - "rubrics": { - 
"type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + } + } + }, + "additionalProperties": false + } + ] + }, + "run": { + "type": "object", + "properties": { + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "repeat": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + } + }, + "additionalProperties": false + } + }, + "required": ["include", "type"], + "additionalProperties": false + }, + { + "type": "string", + "minLength": 1 + } + ] + } + }, + { + "type": "string", + "minLength": 1 + } + ] + }, + "eval_cases": { + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "description": { + "type": "string" + }, + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "criteria": { + "type": "string" + }, + "provider": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + 
"properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { - "type": "string", - "minLength": 1 + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["score_range", "outcome"], + "required": ["command"], "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + 
"config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true } - } + ] }, - "additionalProperties": false + "minItems": 1 } - }, - "model": { - "type": "string" - }, - "target": { + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { "type": "object", "properties": { - "type": { - "type": "string", - "minLength": 1 - }, "command": { "anyOf": [ { @@ -9001,226 +5839,62 @@ } } ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { 
- "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } + "timeout_ms": { + "type": "number" }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } + "timeoutMs": { + "type": "number" }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } + "cwd": { + "type": "string" }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - 
}, - "expected": { - "type": "array", - "items": { + "additionalProperties": false + }, + "before_each": { "type": "object", "properties": { - "tool": { - "type": "string" - }, - "args": { + "command": { "anyOf": [ { - "type": "string", - "const": "any" + "type": "string" }, { - "type": "object", - "additionalProperties": {} + "type": "array", + "items": { + "type": "string" + } } ] }, - "max_duration_ms": { - "type": "number", - "minimum": 0 + "timeout_ms": { + "type": "number" }, - "maxDurationMs": { - "type": "number", - "minimum": 0 + "timeoutMs": { + "type": "number" }, - "args_match": { + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { "anyOf": [ { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "string" }, { "type": "array", @@ -9230,11 +5904,29 @@ } ] }, - "argsMatch": { + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { "anyOf": [ { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "string" }, { "type": "array", @@ -9243,785 +5935,588 @@ } } ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } }, - "required": ["tool"], "additionalProperties": false } }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", 
"superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "mode"], - "additionalProperties": false + "additionalProperties": false + } }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { + "additionalProperties": true + } + ] + }, + "providers": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] }, - "formats": { + { "type": "array", "items": { - "type": "string" - } + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 } - }, - "required": ["path", "match"], - "additionalProperties": false + ] }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + 
"additionalProperties": { + "type": "string" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + 
"properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" + "additionalProperties": false + } }, - "type": { + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + "minLength": 1 }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "label": { + "type": "string", + 
"minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "extends": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "name": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": 
{ - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "provider": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "model": { + "type": "string", + "minLength": 1 }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + "config": { + "type": "object", + "additionalProperties": {} }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - 
"items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ + "command": { + "anyOf": [ { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "string" }, { - "type": "integer", - "minimum": 0, - "maximum": 10 + "type": "array", + "items": { + "type": "string" + } } ] }, - "outcome": { - "type": "string", - "minLength": 1 + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["score_range", "outcome"], + "required": ["command"], "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true } - } + ] }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "execution": { - "type": "object", - "properties": { - "workers": { - "not": {} - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + { + "type": "array", + "items": { + "anyOf": [ 
+ { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] }, - "required": { + "transform": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "object", + "properties": {}, + "additionalProperties": {} } ] }, - "min_score": { + "delay": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "minimum": 0 }, - "negate": { - "type": "boolean" + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } }, - "type": { + "reasoning_effort": { "type": "string", - "enum": ["code-grader", "code_grader"] + "minLength": 1 }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { + "hooks": { + "type": "object", + "properties": { + "before_all": { "type": "object", "properties": { - "max_calls": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { 
"type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" }, - { + "before_each": { "type": "object", "properties": { "command": { @@ -10037,7 +6532,26 @@ } ] }, - "script": { + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { "anyOf": [ { "type": "string" @@ -10050,900 +6564,823 @@ } ] }, - "config": { - "type": "object", - "additionalProperties": {} + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false - 
} - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" }, - "outcome": { - "type": "string", - "minLength": 1 + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } + "additionalProperties": false + } + }, + "additionalProperties": 
false } }, - "required": ["type"], - "additionalProperties": false + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" }, - "required": ["include"], - "additionalProperties": false + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" }, { "type": "object", "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "command": { "anyOf": [ { - "type": "boolean" + "type": "string" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - 
"type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] + "config": { + "type": "object", + "additionalProperties": {} } }, - "required": ["type", "aggregator"], + "required": ["command"], "additionalProperties": false }, { "type": "object", "properties": { - "name": { + "id": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 + "label": { + "type": "string" }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "raw": { + "type": "string" }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "path": { + "type": "string" }, - "negate": { - "type": "boolean" + "prefix": { + "type": "string" }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "suffix": { + "type": "string" }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "provider_output": { + "anyOf": [ + { + "type": "string" + }, + { + 
"type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" }, - "minimums": { + { "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } + "properties": {}, + "additionalProperties": {} }, - "expected": { + { "type": "array", "items": { "type": "object", "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + "type": { + "type": "string", + "enum": ["text", "file", "image"] }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + "value": { + "type": "string" } }, - "required": ["tool"], + "required": ["type", "value"], "additionalProperties": false } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + 
"additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": ["text", "file", "image"] }, - { - "type": "array", - "items": { - "type": "string" - } + "value": { + "type": "string" } - ] + }, + "required": ["type", "value"], + "additionalProperties": false } - }, - "required": ["type", "mode"], - "additionalProperties": false + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "role": { + "not": {} + } + }, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] }, - { - "type": "object", - "properties": { - "name": { + "content": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + { + "type": "object", + "properties": {}, + "additionalProperties": {} }, - "fields": { + { "type": "array", "items": { "type": "object", "properties": { - "path": { - "type": "string" - }, - "match": { + "type": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" + "enum": ["text", "file", "image"] }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } + "value": { + "type": "string" } }, - "required": ["path", "match"], + 
"required": ["type", "value"], "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + } } - }, - "required": ["type", "fields"], - "additionalProperties": false + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] }, - { - "type": "object", - "properties": { - "name": { + "content": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": {}, + "additionalProperties": {} }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 + "required": ["type", "value"], + "additionalProperties": false + } } - }, - "required": ["type", "threshold"], - "additionalProperties": false + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": 
"array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assert_scoring_function": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "options": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { "type": "boolean" }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false } - }, - "required": ["type", "budget"], - "additionalProperties": false + ] }, - { + "config": { "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - 
"minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "config": { + "type": "object", 
+ "additionalProperties": {} } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 + }, + "required": ["command"], + "additionalProperties": false }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": { + "id": { + "type": "string" }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" + }, + "additionalProperties": true } - }, - "required": ["type", "value"], - "additionalProperties": false + ] }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - 
"type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" + } }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "additionalProperties": false + } }, - { + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + 
"preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" + ] + }, + "value": {}, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "criteria": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - 
"name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { + { "type": "array", "items": { "anyOf": [ @@ -11011,1783 +7448,872 @@ "additionalProperties": false } ] - }, - "minItems": 1 + } } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - }, - "evaluators": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + 
"minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { "type": "object", "properties": { - "max_calls": { - "type": "number" + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 } }, + "required": ["score_range", "outcome"], "additionalProperties": false } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} + } }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 + "additionalProperties": false + } + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + { + "type": "integer", + "minimum": 0, + "maximum": 10 } - }, - "required": ["type", "command"], - "additionalProperties": false + ] + }, + "outcome": { + "type": "string", + "minLength": 1 } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 + "required": ["score_range", "outcome"], + "additionalProperties": false + } + }, + "provider": { + "anyOf": [ + { + "type": "string" }, - "required": { - "anyOf": [ - { - "type": "boolean" + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + 
"additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" }, - { + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] + }, + "required": ["type", "threshold"], + "additionalProperties": false }, - "prompt": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "type": { + 
"type": "string", + "const": "code-grader" + }, + "path": { "type": "string" }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false + "cwd": { + "type": "string" } - ] + }, + "required": ["type", "path"], + "additionalProperties": false }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + 
"min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" } } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { + ] + }, + "argsMatch": { + "anyOf": [ + { "type": "string", - "minLength": 1 + "enum": ["exact", "ignore", "subset", "superset"] }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["type", "command"], - "additionalProperties": false + ] } - } - }, - "required": ["type"], - "additionalProperties": false + }, + "required": ["tool"], + "additionalProperties": false + } }, - { - "type": "object", - 
"properties": { - "include": { + "args_match": { + "anyOf": [ + { "type": "string", - "minLength": 1 + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } } - }, - "required": ["include"], - "additionalProperties": false + ] }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { + "argsMatch": { + "anyOf": [ + { "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} + "enum": ["exact", "ignore", "subset", "superset"] }, - "evaluators": { + { "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - 
"additionalProperties": false - } - ] + "items": { + "type": "string" + } } - }, - "required": ["type", "aggregator"], - "additionalProperties": false + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - 
"max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" } - ] + } }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } + "required": ["path", "match"], + "additionalProperties": false }, - "required": ["type", "mode"], - "additionalProperties": false + "minItems": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { 
- "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "required": ["path", "match"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 }, - { - "type": "object", - 
"properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false + "required": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - 
"minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "negate": { + "type": "boolean" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "type": { + "type": "string", + "const": "latency" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - 
}, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "metric": { + "type": "string" }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - 
"additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 } - ] - } - }, - "skip_defaults": { - "type": "boolean" - }, - "cache": { - "type": "boolean" - }, - "trials": { - "not": {} - }, - "budget_usd": { - "type": "number", - "minimum": 0 - }, - "budgetUsd": { - "type": "number", - "minimum": 0 - }, - "fail_on_error": { - "type": "boolean" - }, - "failOnError": { - "type": "boolean" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "workspace": { - "not": {} - } - }, - "additionalProperties": false - }, - "run": { - "type": "object", - "properties": { - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "repeat": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] - }, - "early_exit": { - "type": "boolean" }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - } + "required": ["type", "budget"], + "additionalProperties": false }, - "required": ["count"], - "additionalProperties": false - }, - "timeout_seconds": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - } - }, - "additionalProperties": false - }, - "workspace": { - "type": "object", - "properties": { - "template": { - "type": "string" - }, - "isolation": { - "type": "string", - "enum": ["shared", "per_case"] - }, - "repos": { - "type": "array", - "items": { + { "type": "object", "properties": { - 
"path": { + "name": { "type": "string" }, - "repo": { - "type": "string", - "minLength": 1 + "metric": { + "type": "string" }, - "commit": { - "type": "string", - "minLength": 1 + "weight": { + "type": "number", + "minimum": 0 }, - "base_commit": { + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { "type": "string", - "minLength": 1 + "enum": ["token-usage", "token_usage"] }, - "ancestor": { - "type": "integer", + "max_total": { + "type": "number", "minimum": 0 }, - "sparse": { - "type": "array", - "items": { - "type": "string" - } + "max_input": { + "type": "number", + "minimum": 0 }, - "resolver": { - "type": "string", - "minLength": 1 + "max_output": { + "type": "number", + "minimum": 0 } }, + "required": ["type"], "additionalProperties": false - } - }, - "hooks": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - }, - "before_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" }, - "additionalProperties": false - }, - "before_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - 
"type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "metric": { + "type": "string" }, - "additionalProperties": false - }, - "after_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "weight": { + "type": "number", + "minimum": 0 }, - "additionalProperties": false - }, - "after_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } + "required": { + "type": "boolean" }, - "additionalProperties": false - } - }, - "additionalProperties": false - }, - "docker": { - "type": "object", - "properties": { - "image": { - "type": "string" - }, - "timeout": { - "type": "integer", - "minimum": 1 - }, - "memory": { - "type": "string" - }, - "cpus": { - "type": "number", - "minimum": 0.1 - } - }, - "required": ["image"], - "additionalProperties": false - }, - "env": { - "type": "object", - "properties": { - "required_commands": { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - } - }, - "required_python_modules": { - "type": "array", - "items": { + "min_score": { + "type": "number", + "exclusiveMinimum": true, + 
"minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { "type": "string", - "minLength": 1 - } - } - }, - "additionalProperties": false - } - }, - "additionalProperties": false - }, - "metadata": { - "type": "object", - "additionalProperties": {} - }, - "conversation_id": { - "type": "string" - }, - "suite": { - "type": "string" - }, - "depends_on": { - "type": "array", - "items": { - "type": "string" - } - }, - "on_dependency_failure": { - "type": "string", - "enum": ["skip", "fail", "run"] - }, - "mode": { - "type": "string", - "enum": ["conversation"] - }, - "turns": { - "type": "array", - "items": { - "type": "object", - "properties": { - "input": { - "anyOf": [ - { + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": 
{ + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" } - ] + }, + "required": ["type", "value"], + "additionalProperties": false }, - "expected_output": { - "anyOf": [ - { + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": {}, - "additionalProperties": {} - }, - { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["text", "file", "image"] - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - } - } - ] + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" } - ] + }, + "required": ["type", "value"], + "additionalProperties": false }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - 
] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": 
true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 }, { "type": "object", "properties": { - "name": { + "id": { "type": "string" }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, "weight": { - "type": "number", - "minimum": 0 + "type": "number" }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -12795,321 +8321,280 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - } - }, - "additionalProperties": false - } - ] - }, - "rubrics": { + "score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "id": { - "type": "string" + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] }, "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, 
- "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { "type": "string", "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] } }, - "required": ["type", "command"], + "required": ["score_range", "outcome"], "additionalProperties": false } } }, - "required": ["type"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "execution": { + "type": "object", + "properties": { + "workers": { + "not": {} + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + 
"min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" }, - "required": ["include"], - "additionalProperties": false - }, - { + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, "type": { "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} + "minLength": 1 }, - "aggregator": { + "command": { "anyOf": [ { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": 
"code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false + "type": "string" }, { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" - }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] + "type": "array", + "items": { + "type": "string" + } + } + ] } }, - "required": ["type", "aggregator"], + "required": ["type", "command"], "additionalProperties": false - }, - { + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "rubrics": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "id": { + "type": "string" + }, + "outcome": { 
"type": "string" }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, "weight": { - "type": "number", - "minimum": 0 + "type": "number" }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, "min_score": { "type": "number", @@ -13117,120 +8602,74 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { + "score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "tool": { - "type": "string" - }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" - }, - { - "type": "object", - "additionalProperties": {} - } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 - }, - "args_match": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "array", - "items": { - "type": "string" - } + "type": "integer", + "minimum": 0, + "maximum": 10 } ] }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] + "outcome": { + "type": "string", + "minLength": 1 } }, - "required": ["tool"], + "required": ["score_range", "outcome"], "additionalProperties": false } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { 
+ "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { + "command": { "anyOf": [ { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "type": "string" }, { "type": "array", @@ -13241,31 +8680,167 @@ ] } }, - "required": ["type", "mode"], + "required": ["type", "command"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" + ] + }, + "value": {}, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + 
"criteria": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false } ] + } + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" }, "min_score": { "type": "number", @@ -13273,2195 +8848,9886 @@ "minimum": 0, "maximum": 1 }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { + "score_ranges": { "type": "array", "items": { "type": "object", "properties": { - "path": { - "type": "string" + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] }, - "match": { + "outcome": { "type": "string", - "enum": ["exact", "numeric_tolerance", 
"date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" - } + "minLength": 1 } }, - "required": ["path", "match"], + "required": ["score_range", "outcome"], "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + } } }, - "required": ["type", "fields"], "additionalProperties": false - }, - { + } + }, + "score_ranges": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ { - "type": "boolean" + "type": "integer", + "minimum": 0, + "maximum": 10 }, { - "type": "number", - "exclusiveMinimum": true, + "type": "integer", "minimum": 0, - "maximum": 1 + "maximum": 10 } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { + "outcome": { "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 + "minLength": 1 } }, - "required": ["type", "threshold"], + "required": ["score_range", "outcome"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + } + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 
true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 + { + "type": "object", + "properties": {}, + "additionalProperties": {} } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": 
"number" } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] + } }, - "max_total": { - "type": "number", - "minimum": 0 + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } }, - "max_input": { - "type": "number", - "minimum": 0 + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } }, - "max_output": { - "type": "number", - "minimum": 0 - } + "required": ["type", "path"], + "additionalProperties": false }, - "required": ["type"], - "additionalProperties": false - }, - { + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + 
"minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "tool": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "args": { "anyOf": [ { - "type": "boolean" + "type": "string", + "const": "any" }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "object", + "additionalProperties": {} } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { + "max_duration_ms": { "type": "number", "minimum": 0 }, - "max_cost_usd": { + "maxDurationMs": { "type": "number", "minimum": 0 }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "args_match": { "anyOf": [ { - "type": "boolean" + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - 
"name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { + "argsMatch": { "anyOf": [ { - "type": "boolean" + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "type": "array", + "items": { + "type": "string" + } } ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "regex" - }, - "value": { - "type": "string" } }, - "required": ["type", "value"], + "required": ["tool"], "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] }, - "required": ["type"], - "additionalProperties": false - }, - { + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + 
"negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { "type": "object", "properties": { - "name": { + "path": { "type": "string" }, - "weight": { - "type": "number", - "minimum": 0 + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] + "type": "boolean" }, - "min_score": { + "weight": { + "type": "number" + }, + "tolerance": { "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + "minimum": 0 }, - "negate": { + "relative": { "type": "boolean" }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" + "formats": { + "type": "array", + "items": { + "type": "string" + } } }, - "required": ["type", "value"], + "required": ["path", "match"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, 
- "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", "criteria"], - "additionalProperties": false - } - ] - } - ] - } - } - }, - "required": ["input"], - "additionalProperties": false - }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["mean", "min", "max"] - }, - "on_turn_failure": { - "type": "string", - "enum": ["continue", "stop"] - }, - "window_size": { - "type": "integer", - "minimum": 1 - } - }, - "required": ["id"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - }, - "type": { - "type": "string", - "enum": ["suite", "tests"] - }, - "select": { - "anyOf": [ - { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 - } - ] - }, - { - "type": "object", - "properties": { - "test_ids": { - "anyOf": [ - { - "type": "string", - "minLength": 1 + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } }, - { - "type": "array", - "items": { + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + 
"negate": { + "type": "boolean" + }, + "type": { "type": "string", - "minLength": 1 + "const": "latency" }, - "minItems": 1 - } - ] - }, - "tags": { - "anyOf": [ - { - "type": "string", - "minLength": 1 + "threshold": { + "type": "number", + "minimum": 0 + } }, - { - "type": "array", - "items": { + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { "type": "string", - "minLength": 1 + "const": "cost" }, - "minItems": 1 - } - ] - }, - "metadata": { - "type": "object", - "additionalProperties": { - "anyOf": [ - { + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, - { - "type": "number" + "metric": { + "type": "string" }, - { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - { - "type": "array", - "items": { - "type": ["string", "number", "boolean"] - }, - "minItems": 1 + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 } - ] - } - } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + 
"required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 
+ }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + 
"minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] + } + }, + "skip_defaults": { + "type": "boolean" + }, + "cache": { + "type": "boolean" + }, + "trials": { + "not": {} + }, + "budget_usd": { + "type": "number", + "minimum": 0 + }, + "budgetUsd": { + "type": "number", + "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "workspace": { + "not": {} + } + }, + "additionalProperties": false + }, + "run": { + "type": "object", + "properties": { + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "repeat": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + } + }, + "additionalProperties": false + }, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_case"] + }, + "repos": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "repo": { + "type": "string", + "minLength": 1 + }, + "commit": { + "type": "string", + "minLength": 1 + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + 
"ancestor": { + "type": "integer", + "minimum": 0 + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + } + }, + "hooks": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "docker": { + 
"type": "object", + "properties": { + "image": { + "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 + } + }, + "required": ["image"], + "additionalProperties": false + }, + "env": { + "type": "object", + "properties": { + "required_commands": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "required_python_modules": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "depends_on": { + "type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } 
+ ] + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + } + }, + "required": ["input"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + }, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } + }, + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + }, + "type": { + "type": "string", + "enum": ["suite", "tests"] + }, + "select": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + ] + }, + { + "type": "object", + "properties": { + "test_ids": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + ] + }, + "tags": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 1 + } + ] + }, + "metadata": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "type": ["string", "number", "boolean"] + }, + "minItems": 1 + } + ] + } + } + }, + "additionalProperties": false + } + ] + }, + "run": { + "type": "object", + "properties": { + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "repeat": { + "type": "object", + "properties": { + "count": { + 
"type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + } + }, + "additionalProperties": false + } + }, + "required": ["include", "type"], + "additionalProperties": false + }, + { + "type": "string", + "minLength": 1 + } + ] + } + }, + { + "type": "string", + "minLength": 1 + } + ] + }, + "target": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + 
"type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + 
"additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "targets": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + 
"type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + 
"items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + 
"type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { 
+ "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "model": { + "not": {} + }, + "experiment": { + "type": "string", + "minLength": 1 + }, + "repeat": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "runs": { + "not": {} + }, + "early_exit": { + 
"not": {} + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "evaluate_options": { + "type": "object", + "properties": { + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "max_concurrency": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "cache": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "generate_suggestions": { + "type": "boolean" + }, + "repeat": { + "anyOf": [ + { + "type": "integer", + "minimum": 1 + }, + { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + } + ] + }, + "timeout_ms": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "max_eval_time_ms": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "filter_range": { + "anyOf": [ + { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "number" + }, + { + "type": "number" + } + ] + }, + { + "type": "string" + } + ] + } + }, + "additionalProperties": false + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "default_test": { + "type": "object", + "properties": { + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "provider": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { 
+ "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + 
"minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "providers": { 
+ "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + 
"additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + 
"cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": 
{ + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + 
"type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + 
"additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "provider_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": 
{} + } + ] + } + }, + "assert_scoring_function": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "options": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "metadata": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + }, + "scenarios": { + "type": "array", + "items": { + "type": "object", + "properties": { + "description": { + "type": "string" + }, + "config": { + "type": "array", + "items": { + "type": "object", + "properties": { + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "provider": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + 
"additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": 
["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "providers": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { 
+ "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": 
"string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + 
"cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": 
"string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "provider_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + 
"additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "options": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "metadata": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + }, + "tests": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "description": { + "type": "string" + }, + "vars": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "criteria": { + "type": "string" + }, + "provider": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + 
"additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + 
"reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "providers": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + 
"id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + 
"command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "label": { + "type": "string", + "minLength": 1 + }, + "extends": { + "type": "string", + "minLength": 1 + }, + "name": { + "type": "string", + "minLength": 1 + }, + "provider": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + 
"type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "delay": { + "type": "number", + "minimum": 0 + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "reasoning_effort": { + "type": "string", + "minLength": 1 + }, + "hooks": { + "type": "object", + "properties": { + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + 
"timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "prompts": { + "anyOf": [ + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + 
"type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "minItems": 1 + } + ] + }, + "provider_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": 
{ + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "role": { + "not": {} + } + }, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "input_files": { + "type": "array", + "items": { + "type": "string" + } + }, + "expected_output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": ["system", "user", "assistant", "tool"] + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + 
"value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + }, + "required": ["role", "content"], + "additionalProperties": false + } + } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assert_scoring_function": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "options": { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "evaluators": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + 
"type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + 
"items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" + ] + }, + "value": {}, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "criteria": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": 
"object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + } + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + 
"minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + 
"additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", 
"superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } }, + "required": ["type", "fields"], "additionalProperties": false - } - ] - }, - "run": { - "type": "object", - "properties": { - 
"threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 }, - "repeat": { + { "type": "object", "properties": { - "count": { - "type": "integer", - "minimum": 1 + "name": { + "type": "string" }, - "strategy": { - "type": "string", - "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + "metric": { + "type": "string" }, - "early_exit": { + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { "type": "boolean" }, - "cost_limit_usd": { + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { "type": "number", "minimum": 0 } }, - "required": ["count"], + "required": ["type", "threshold"], "additionalProperties": false }, - "timeout_seconds": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - } - }, - "additionalProperties": false - } - }, - "required": ["include", "type"], - "additionalProperties": false - }, - { - "type": "string", - "minLength": 1 - } - ] - } - }, - { - "type": "string", - "minLength": 1 - } - ] - }, - "target": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "extends": { - "type": "string", - "minLength": 1 - }, - "name": { - "type": "string", - "minLength": 1 - }, - "provider": { - "type": "string", - "minLength": 1 - }, - "model": { - "type": "string", - "minLength": 1 - }, - "reasoning_effort": { - "type": "string", - "minLength": 1 - }, - "hooks": { - "type": "object", - "properties": { - "before_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - 
"timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "before_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_each": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + { + "type": "object", + "properties": { + "name": { "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { + }, + "metric": { "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - }, - "after_all": { - "type": "object", - "properties": { - "command": { - "anyOf": [ - { - "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } }, - { - "type": "array", - "items": { + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + 
"name": { + "type": "string" + }, + "metric": { "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" }, - { - "type": "array", - "items": { + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" - } - } - ] - }, - "timeout_ms": { - "type": "number" - }, - "timeoutMs": { - "type": "number" - }, - "cwd": { - "type": "string" - }, - "reset": { - "type": "string", - "enum": ["none", "fast", "strict"] - } - }, - "additionalProperties": false - } - }, - "additionalProperties": false - } - }, - "additionalProperties": true - } - ] - }, - "model": { - "not": {} - }, - "experiment": { - "type": "string", - "minLength": 1 - }, - "repeat": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 1 - }, - "strategy": { - "type": "string", - "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] - }, - "early_exit": { - "type": "boolean" - }, - "cost_limit_usd": { - "type": "number", - "minimum": 0 - } - }, - "required": ["count"], - "additionalProperties": false - }, - "runs": { - "not": {} - }, - "early_exit": { - "not": {} - }, - "timeout_seconds": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "evaluate_options": { - "type": "object", - "properties": { - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "max_concurrency": { - "type": "integer", - "minimum": 1, - "maximum": 50 - } - }, - 
"additionalProperties": false - }, - "budget_usd": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0 - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "default_test": { - "type": "object", - "properties": { - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "additionalProperties": false - }, - "on_run_complete": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - } - } - ] - }, - "policy": { - "not": {} - }, - "execution": { - "not": {} - }, - "assertions": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["code-grader", "code_grader"] - }, - "command": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "cwd": { - "type": "string" - }, - "target": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "object", - "properties": { - "max_calls": { - "type": "number" - } - }, - "additionalProperties": false - } - ] - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 - }, - "command": { - "anyOf": [ - { - "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + 
"minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } }, - { - "type": "array", - "items": { + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { "type": "string" } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type", "command"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + 
"type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": 
"string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false + } + ] } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["llm-grader", "llm_grader"] - }, - "prompt": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "command": { + }, + "execution": { + "type": "object", + "properties": { + "workers": { + "not": {} + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "evaluators": { + "type": "array", + "items": { "anyOf": [ { - "type": "string" + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["code-grader", "code_grader"] + }, + "command": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type", "command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["llm-grader", "llm_grader"] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": ["command"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "raw": { + "type": "string" + }, + "path": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": true + } + ] + }, + "rubrics": { + "type": 
"array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "preprocessors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "minLength": 1 + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "command"], + "additionalProperties": false + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ 
+ "assert-set", + "g-eval", + "llm-rubric", + "javascript", + "python", + "webhook", + "similar", + "select-best", + "human", + "contains", + "contains-any", + "contains-all", + "icontains", + "icontains-any", + "icontains-all", + "starts-with", + "ends-with", + "regex", + "is-json", + "equals" + ] + }, + "value": {}, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "criteria": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + } + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 
2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "transform": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "required": ["type"], + "additionalProperties": true + }, + { + "type": "object", + "properties": { + "include": { + "type": "string", + "minLength": 1 + } + }, + "required": ["include"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + 
"exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-grader" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": ["type", "path"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-grader" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": ["type"], + "additionalProperties": false + } + ] + } + }, + "required": ["type", "aggregator"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["tool-trajectory", "tool_trajectory"] + }, + "mode": { + "type": "string", + "enum": ["any_order", "in_order", "exact", "subset", "superset"] + }, + "minimums": { + "type": "object", + 
"additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["tool"], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": ["exact", "ignore", "subset", "superset"] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": ["type", "mode"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["field-accuracy", "field_accuracy"] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": ["exact", "numeric_tolerance", "date"] + }, + "required": { + "type": 
"boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["path", "match"], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": ["weighted_average", "all_or_nothing"] + } + }, + "required": ["type", "fields"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "threshold"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type", "budget"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["token-usage", "token_usage"] + }, + 
"max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["execution-metrics", "execution_metrics"] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": ["type"], + "additionalProperties": false }, { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "script": { - "anyOf": [ + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, { - "type": "string" + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + 
}, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false }, { - "type": "array", - "items": { - "type": "string" - } + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": ["is-json", "is_json"] + } + }, + "required": ["type"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "metric": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "object", + "properties": { + "id": { + "type": "string" + }, 
+ "outcome": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": ["correctness", "contradiction"] + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "min_score": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0, + "maximum": 1 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": ["score_range", "outcome"], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + ] + }, + "minItems": 1 + } + }, + "required": ["type", "criteria"], + "additionalProperties": false } ] - }, - "config": { - "type": "object", - "additionalProperties": {} } }, - "additionalProperties": false - } - ] - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" + "skip_defaults": { + "type": "boolean" }, - "outcome": { - "type": "string" + "cache": { + "type": "boolean" + }, + "trials": { + "not": {} + }, + "budget_usd": { + "type": "number", + "minimum": 0 }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] + "budgetUsd": { + "type": "number", + "minimum": 0 }, - "weight": { - "type": "number" + "fail_on_error": { + "type": "boolean" }, - "required": { + "failOnError": { "type": "boolean" }, - "min_score": { + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "workspace": { + "not": {} + } + }, + "additionalProperties": false + }, + "run": { + "type": "object", + "properties": { + "threshold": { "type": "number", - "exclusiveMinimum": true, "minimum": 0, "maximum": 1 }, - "score_ranges": { + "repeat": { + "type": "object", + "properties": { + "count": { + 
"type": "integer", + "minimum": 1 + }, + "strategy": { + "type": "string", + "enum": ["pass_any", "pass_all", "mean", "confidence_interval"] + }, + "early_exit": { + "type": "boolean" + }, + "cost_limit_usd": { + "type": "number", + "minimum": 0 + } + }, + "required": ["count"], + "additionalProperties": false + }, + "timeout_seconds": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + }, + "budget_usd": { + "type": "number", + "exclusiveMinimum": true, + "minimum": 0 + } + }, + "additionalProperties": false + }, + "workspace": { + "type": "object", + "properties": { + "template": { + "type": "string" + }, + "isolation": { + "type": "string", + "enum": ["shared", "per_case"] + }, + "repos": { "type": "array", "items": { "type": "object", "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] + "path": { + "type": "string" + }, + "repo": { + "type": "string", + "minLength": 1 }, - "outcome": { + "commit": { "type": "string", "minLength": 1 + }, + "base_commit": { + "type": "string", + "minLength": 1 + }, + "ancestor": { + "type": "integer", + "minimum": 0 + }, + "sparse": { + "type": "array", + "items": { + "type": "string" + } } }, - "required": ["score_range", "outcome"], "additionalProperties": false } - } - }, - "additionalProperties": false - } - }, - "model": { - "type": "string" - }, - "target": { - "type": "string" - }, - "config": { - "type": "object", - "additionalProperties": {} - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "preprocessors": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "minLength": 1 }, - "command": { - "anyOf": [ - { - "type": "string" + "hooks": { + "type": "object", + "properties": 
{ + "enabled": { + "type": "boolean" }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - } - }, - "required": ["type", "command"], - "additionalProperties": false - } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "include": { - "type": "string", - "minLength": 1 - } - }, - "required": ["include"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "composite" - }, - "assertions": { - "type": "array", - "items": {} - }, - "evaluators": { - "type": "array", - "items": {} - }, - "aggregator": { - "anyOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "weighted_average" - }, - "weights": { - "type": "object", - "additionalProperties": { - "type": "number" + "before_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "before_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + 
"enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_each": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false + }, + "after_all": { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "timeout_ms": { + "type": "number" + }, + "timeoutMs": { + "type": "number" + }, + "cwd": { + "type": "string" + }, + "reset": { + "type": "string", + "enum": ["none", "fast", "strict"] + } + }, + "additionalProperties": false } - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "threshold" - }, - "threshold": { - "type": "number", - "minimum": 0, - "maximum": 1 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "code-grader" - }, - "path": { - "type": "string" - }, - "cwd": { - "type": "string" - } - }, - "required": ["type", "path"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm-grader" }, - "prompt": { - "type": "string" - }, - "model": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - } - ] - } - }, - "required": ["type", "aggregator"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - 
}, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] - }, - "mode": { - "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] - }, - "minimums": { - "type": "object", - "additionalProperties": { - "type": "integer", - "minimum": 0 - } - }, - "expected": { - "type": "array", - "items": { - "type": "object", - "properties": { - "tool": { - "type": "string" + "additionalProperties": false }, - "args": { - "anyOf": [ - { - "type": "string", - "const": "any" + "docker": { + "type": "object", + "properties": { + "image": { + "type": "string" }, - { - "type": "object", - "additionalProperties": {} + "timeout": { + "type": "integer", + "minimum": 1 + }, + "memory": { + "type": "string" + }, + "cpus": { + "type": "number", + "minimum": 0.1 } - ] - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "maxDurationMs": { - "type": "number", - "minimum": 0 + }, + "required": ["image"], + "additionalProperties": false }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { + "env": { + "type": "object", + "properties": { + "required_commands": { "type": "array", "items": { - "type": "string" + "type": "string", + "minLength": 1 } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] }, - { + "required_python_modules": { "type": "array", "items": { + "type": "string", + "minLength": 1 + } + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "metadata": { + "type": "object", + "additionalProperties": {} + }, + "conversation_id": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "depends_on": { + 
"type": "array", + "items": { + "type": "string" + } + }, + "on_dependency_failure": { + "type": "string", + "enum": ["skip", "fail", "run"] + }, + "mode": { + "type": "string", + "enum": ["conversation"] + }, + "turns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] + } + ] + }, + "expected_output": { + "anyOf": [ + { "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["text", "file", "image"] + }, + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + } + } + ] } + ] + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + ] } - ] - } - }, - "required": ["tool"], - "additionalProperties": false - } - }, - "args_match": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "argsMatch": { - "anyOf": [ - { - "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] 
- } - }, - "required": ["type", "mode"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["field-accuracy", "field_accuracy"] - }, - "fields": { - "type": "array", - "items": { - "type": "object", - "properties": { - "path": { - "type": "string" - }, - "match": { - "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] - }, - "required": { - "type": "boolean" - }, - "weight": { - "type": "number" - }, - "tolerance": { - "type": "number", - "minimum": 0 - }, - "relative": { - "type": "boolean" - }, - "formats": { - "type": "array", - "items": { - "type": "string" } - } + }, + "required": ["input"], + "additionalProperties": false }, - "required": ["path", "match"], - "additionalProperties": false + "minItems": 1 }, - "minItems": 1 - }, - "aggregation": { - "type": "string", - "enum": ["weighted_average", "all_or_nothing"] - } - }, - "required": ["type", "fields"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "latency" - }, - "threshold": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "threshold"], - "additionalProperties": false - 
}, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "cost" - }, - "budget": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type", "budget"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["token-usage", "token_usage"] - }, - "max_total": { - "type": "number", - "minimum": 0 - }, - "max_input": { - "type": "number", - "minimum": 0 - }, - "max_output": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["execution-metrics", "execution_metrics"] - }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - 
"max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" + "aggregation": { + "type": "string", + "enum": ["mean", "min", "max"] + }, + "on_turn_failure": { + "type": "string", + "enum": ["continue", "stop"] + }, + "window_size": { + "type": "integer", + "minimum": 1 + } }, - "type": { + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "derived_metrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "value": { + "anyOf": [ + { "type": "string", - "const": "regex" + "minLength": 1 }, - "value": { - "type": "string" 
+ { + "type": "object", + "properties": {}, + "additionalProperties": {} } - }, - "required": ["type", "value"], - "additionalProperties": false + ] + } + }, + "required": ["name", "value"], + "additionalProperties": false + } + }, + "output_path": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "env": { + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "nunjucks_filters": { + "anyOf": [ + { + "type": "object", + "properties": {}, + "additionalProperties": {} + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "extensions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "minLength": 1 }, { "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["is-json", "is_json"] - } - }, - "required": ["type"], - "additionalProperties": false + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "on_run_complete": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "policy": { + "not": {} + }, + "execution": { + "not": {} + }, + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" }, { "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - 
"maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "equals" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false + "properties": {}, + "additionalProperties": {} + } + ] + } + }, + "assertions": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" }, { "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - } - ] - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "rubrics" - }, - "criteria": { - "type": "array", - "items": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "operator": { - "type": "string", - "enum": ["correctness", "contradiction"] - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "min_score": { - "type": "number", - "exclusiveMinimum": true, - "minimum": 0, - "maximum": 1 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - ] - }, - "minItems": 1 - } - }, - "required": ["type", 
"criteria"], - "additionalProperties": false + "properties": {}, + "additionalProperties": {} } ] } @@ -15534,10 +18800,6 @@ "items": { "type": "string" } - }, - "resolver": { - "type": "string", - "minLength": 1 } }, "additionalProperties": false @@ -15565,19 +18827,6 @@ } ] }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, "timeout_ms": { "type": "number" }, @@ -15610,19 +18859,6 @@ } ] }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, "timeout_ms": { "type": "number" }, @@ -15655,19 +18891,6 @@ } ] }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, "timeout_ms": { "type": "number" }, @@ -15700,19 +18923,6 @@ } ] }, - "script": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, "timeout_ms": { "type": "number" }, diff --git a/skills-data/agentv-eval-writer/references/rubric-evaluator.md b/skills-data/agentv-eval-writer/references/rubric-evaluator.md index 141df7a77..d0afd6225 100644 --- a/skills-data/agentv-eval-writer/references/rubric-evaluator.md +++ b/skills-data/agentv-eval-writer/references/rubric-evaluator.md @@ -20,7 +20,6 @@ Rubrics are defined as `assertions` entries with `type: rubrics`. They support b | `weight` | number | 1.0 | Relative importance | | `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) | | `min_score` | number | - | Minimum score (0–1) to pass this criterion | -| `required_min_score` | integer | - | **Deprecated.** Use `min_score` instead. Legacy 0–10 scale. | | `score_ranges` | map or array | - | Score range definitions for analytic scoring | ## String Shorthand (Recommended)