From f65a26f3ba94e7377b34c98bff655a2251026c61 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 12:04:48 +0200 Subject: [PATCH 1/3] feat(core): canonicalize targets around labels --- apps/cli/src/commands/eval/targets.ts | 4 +- apps/cli/src/commands/eval/task-bundle.ts | 63 +++++++- apps/cli/src/commands/runs/rerun.ts | 3 + apps/cli/test/commands/eval/bundle.test.ts | 10 +- apps/cli/test/commands/runs/rerun.test.ts | 6 +- apps/cli/test/eval.integration.test.ts | 10 +- .../docs/docs/targets/cli-provider.mdx | 10 +- .../docs/docs/targets/coding-agents.mdx | 26 +-- .../docs/docs/targets/configuration.mdx | 31 ++-- .../docs/docs/targets/custom-providers.mdx | 4 +- .../docs/docs/targets/llm-providers.mdx | 14 +- .../src/content/docs/docs/targets/retry.mdx | 2 +- .../src/evaluation/loaders/config-loader.ts | 70 ++++++-- .../src/evaluation/providers/targets-file.ts | 19 ++- .../core/src/evaluation/providers/targets.ts | 57 ++++++- .../core/src/evaluation/providers/types.ts | 10 ++ packages/core/src/evaluation/types.ts | 6 +- .../evaluation/validation/eval-file.schema.ts | 5 +- .../evaluation/validation/eval-validator.ts | 4 + .../validation/targets-validator.ts | 76 +++++++-- packages/core/src/evaluation/yaml-parser.ts | 19 ++- .../evaluation/loaders/config-loader.test.ts | 51 +++++- .../evaluation/providers/targets-file.test.ts | 60 +++++++ .../test/evaluation/providers/targets.test.ts | 41 +++++ .../validation/eval-validator.test.ts | 10 +- .../validation/targets-validator.test.ts | 153 ++++++++++++++---- packages/sdk/src/eval.ts | 11 +- packages/sdk/test/eval-authoring.test.ts | 18 ++- skills-data/agentv-eval-writer/SKILL.md | 2 +- .../references/eval.schema.json | 75 +-------- 30 files changed, 652 insertions(+), 218 deletions(-) create mode 100644 packages/core/test/evaluation/providers/targets-file.test.ts diff --git a/apps/cli/src/commands/eval/targets.ts b/apps/cli/src/commands/eval/targets.ts index 44921953e..7621e6ebe 100644 --- a/apps/cli/src/commands/eval/targets.ts +++ b/apps/cli/src/commands/eval/targets.ts @@ -307,7 +307,9 @@ export async function selectMultipleTargets( const definitions = [...fileDefinitions]; if (targetRefs) { for (const ref of targetRefs) { - if (ref.use_target && !fileDefinitions.some((d) => d.name === ref.name)) { + if (ref.definition && !fileDefinitions.some((d) => d.name === ref.name)) { + definitions.push(ref.definition); + } else if (ref.use_target && !fileDefinitions.some((d) => d.name === ref.name)) { definitions.push({ name: ref.name, use_target: ref.use_target } as TargetDefinition); } } diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts index 89d458fc0..9aec62e11 100644 --- a/apps/cli/src/commands/eval/task-bundle.ts +++ b/apps/cli/src/commands/eval/task-bundle.ts @@ -43,6 +43,27 @@ const SKIPPED_DIR_NAMES = new Set([ '.beads', '.DS_Store', ]); +const AUTHORING_TOP_LEVEL_TARGET_FIELDS = new Set([ + 'label', + 'provider', + 'prompts', + 'transform', + 'delay', + 'env', + 'model', + 'use_target', + 'fallback_targets', + 'grader_target', + 'max_budget_usd', + 'workers', + 'provider_batching', + 'subagent_mode_allowed', + 'max_retries', + 'retry_initial_delay_ms', + 'retry_max_delay_ms', + 'retry_backoff_factor', + 'retry_status_codes', +]); export interface TaskBundleTargetSelection { readonly evalFileAbsolutePath?: string; @@ -591,6 +612,44 @@ function uniqueTargetDefinitions( return selected; } +function serializeTargetDefinition(definition: TargetDefinition): Record { + const target: Record = { label: definition.name }; + if (definition.id !== undefined) { + target.id = definition.id; + } + const config: Record = {}; + + for (const [key, value] of Object.entries(definition)) { + if (value === undefined || key === 'name' || key === 'id' || key === 'config') { + continue; + } + if (AUTHORING_TOP_LEVEL_TARGET_FIELDS.has(key)) { + target[key] = value; + } else { + config[key] = value; + } + } + + if (isRecord(definition.config)) { + for (const [key, value] of Object.entries(definition.config)) { + if (value !== undefined) { + config[key] = value; + } + } + } + if (Object.keys(config).length > 0) { + target.config = config; + } + + return target; +} + +function serializeTargetDefinitions( + definitions: readonly TargetDefinition[], +): readonly Record[] { + return definitions.map((definition) => serializeTargetDefinition(definition)); +} + function uniqueTargetNames(selections: readonly TaskBundleTargetSelection[]): readonly string[] { const names: string[] = []; const seen = new Set(); @@ -962,7 +1021,7 @@ export async function materializeTaskBundle( target: options.targetName, tests: [evalCase], }); - await writeYamlFile(targetsPath, { targets: targetDefinitions }); + await writeYamlFile(targetsPath, { targets: serializeTargetDefinitions(targetDefinitions) }); return { testDir, @@ -1033,7 +1092,7 @@ export async function materializeEvalBundle( tests: options.tests.map((test) => buildPortableEvalCase(test, rewrites)), }); await writeYamlFile(targetsPath, { - targets: uniqueTargetDefinitions(options.targetSelections), + targets: serializeTargetDefinitions(uniqueTargetDefinitions(options.targetSelections)), }); const manifest = bundleManifest({ diff --git a/apps/cli/src/commands/runs/rerun.ts b/apps/cli/src/commands/runs/rerun.ts index 864b9c6e3..bcf4b3b3f 100644 --- a/apps/cli/src/commands/runs/rerun.ts +++ b/apps/cli/src/commands/runs/rerun.ts @@ -122,6 +122,9 @@ async function readTargetDefinitions( } function targetName(definition: Record): string | undefined { + if (typeof definition.label === 'string' && definition.label.trim().length > 0) { + return definition.label.trim(); + } return typeof definition.name === 'string' && definition.name.trim().length > 0 ? definition.name.trim() : undefined; diff --git a/apps/cli/test/commands/eval/bundle.test.ts b/apps/cli/test/commands/eval/bundle.test.ts index ef1974a12..8952e9de7 100644 --- a/apps/cli/test/commands/eval/bundle.test.ts +++ b/apps/cli/test/commands/eval/bundle.test.ts @@ -54,11 +54,11 @@ describe('agentv eval bundle', () => { await writeFile( path.join(sourceDir, '.agentv', 'targets.yaml'), `targets: - - name: inherited + - label: inherited provider: mock response: '{"answer":"Mock provider response from inherited target"}' fallback_targets: [backup] - - name: backup + - label: backup provider: mock response: '{"answer":"Backup mock response"}' `, @@ -153,8 +153,8 @@ tests: ../data/cases.yaml expect(input[0]?.content[0]).toEqual({ type: 'file', value: 'files/data/input.txt' }); const bundledTargets = await readFile(path.join(bundleDir, 'targets.yaml'), 'utf8'); - expect(bundledTargets).toContain('name: inherited'); - expect(bundledTargets).toContain('name: backup'); + expect(bundledTargets).toContain('label: inherited'); + expect(bundledTargets).toContain('label: backup'); await rm(sourceDir, { recursive: true, force: true }); const run = await runCli(bundleDir, [ @@ -177,7 +177,7 @@ tests: ../data/cases.yaml await writeFile( path.join(sourceDir, '.agentv', 'targets.yaml'), `targets: - - name: default + - label: default provider: mock `, 'utf8', diff --git a/apps/cli/test/commands/runs/rerun.test.ts b/apps/cli/test/commands/runs/rerun.test.ts index dba3b32cd..facc3c3f1 100644 --- a/apps/cli/test/commands/runs/rerun.test.ts +++ b/apps/cli/test/commands/runs/rerun.test.ts @@ -30,7 +30,7 @@ interface CliResult { } const DEFAULT_TARGETS = `targets: - - name: captured + - label: captured provider: mock `; @@ -121,7 +121,7 @@ async function createBundleFixture( await writeFile( overrideTargetsPath, `targets: - - name: local + - label: local provider: mock `, 'utf8', @@ -272,7 +272,7 @@ describe('agentv runs rerun', () => { it('fails clearly for missing env and accepts an explicit env file', async () => { const created = await fixture(`targets: - - name: captured + - label: captured provider: cli command: \${{ LOCAL_AGENT_COMMAND }} `); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 00d49a159..54b5053aa 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -32,13 +32,13 @@ async function createFixture(): Promise { const targetsPath = path.join(agentvDir, 'targets.yaml'); const targetsContent = `$schema: agentv-targets-v2.2 targets: - - name: default + - label: default provider: mock - - name: file-target + - label: file-target provider: mock - - name: cli-target + - label: cli-target provider: mock - - name: codex-target + - label: codex-target provider: codex model: gpt-5-default `; @@ -90,7 +90,7 @@ async function createNestedEnvFixture(): Promise { const targetsPath = path.join(agentvDir, 'targets.yaml'); const targetsContent = `$schema: agentv-targets-v2.2 targets: - - name: default + - label: default provider: mock `; await writeFile(targetsPath, targetsContent, 'utf8'); diff --git a/apps/web/src/content/docs/docs/targets/cli-provider.mdx b/apps/web/src/content/docs/docs/targets/cli-provider.mdx index 049952fc3..bb70fb3c1 100644 --- a/apps/web/src/content/docs/docs/targets/cli-provider.mdx +++ b/apps/web/src/content/docs/docs/targets/cli-provider.mdx @@ -14,7 +14,7 @@ Because the contract is "we invoke a command and read a file," almost any useful ```yaml # .agentv/targets.yaml targets: - - name: my_agent + - label: my_agent provider: cli command: python agent.py --prompt {PROMPT} --out {OUTPUT_FILE} grader_target: azure-base # required if your evals use LLM graders @@ -70,7 +70,7 @@ echo "Hello, world!" > {OUTPUT_FILE} | Field | Type | Required | Default | Description | |---|---|---|---|---| -| `name` | string | yes | — | Target identifier used in eval configs. | +| `label` | string | yes | — | AgentV target name used by eval `target`, CLI `--target`, and comparisons. | | `provider` | literal `"cli"` | yes | — | Selects this provider. | | `command` | string | yes | — | Shell command template. | | `timeout_seconds` | number | no | — | Kill the process if it runs longer than this. | @@ -89,7 +89,7 @@ For targets where spin-up cost dominates per-case work (e.g. loading a model, au ```yaml targets: - - name: batched_agent + - label: batched_agent provider: cli provider_batching: true command: python agent.py --batch-in {PROMPT_FILE} --batch-out {OUTPUT_FILE} @@ -106,12 +106,12 @@ AgentV has no dedicated "oracle" feature because the `cli` provider already comp ```yaml # .agentv/targets.yaml targets: - - name: my_agent + - label: my_agent provider: cli command: python agent.py --prompt {PROMPT} --out {OUTPUT_FILE} grader_target: azure-base - - name: oracle + - label: oracle provider: cli command: cp fixtures/{EVAL_ID}.expected.txt {OUTPUT_FILE} grader_target: azure-base diff --git a/apps/web/src/content/docs/docs/targets/coding-agents.mdx b/apps/web/src/content/docs/docs/targets/coding-agents.mdx index 6f1ce13d4..d6a4a4000 100644 --- a/apps/web/src/content/docs/docs/targets/coding-agents.mdx +++ b/apps/web/src/content/docs/docs/targets/coding-agents.mdx @@ -62,7 +62,7 @@ The preread block instructs the agent to read input files before processing the ```yaml targets: - - name: claude_agent + - label: claude_agent provider: claude grader_target: azure-base ``` @@ -102,7 +102,7 @@ part of the canonical target schema. ```yaml targets: - - name: codex_target + - label: codex_target provider: codex executable: codex-eng model: ${{ CODEX_MODEL }} @@ -122,7 +122,7 @@ targets: ```yaml targets: - - name: copilot + - label: copilot provider: copilot-cli model: gpt-5-mini grader_target: azure-base @@ -144,7 +144,7 @@ Route Copilot through an OpenAI-compatible endpoint: ```yaml targets: - - name: copilot-openai + - label: copilot-openai provider: copilot-cli subprovider: openai base_url: ${{ OPENAI_ENDPOINT }} @@ -159,7 +159,7 @@ Values can come from environment variables through `${{ ... }}` interpolation. F ```yaml targets: - - name: pi_target + - label: pi_target provider: pi-coding-agent subprovider: openai-codex model: gpt-5.5 @@ -184,7 +184,7 @@ configuration. This works for OpenAI-compatible endpoints: ```yaml targets: - - name: pi-sdk-openai + - label: pi-sdk-openai provider: pi-coding-agent subprovider: openai base_url: ${{ OPENAI_ENDPOINT }} @@ -210,7 +210,7 @@ is compatible with Azure OpenAI Responses: ```yaml targets: - - name: pi-cli-gateway + - label: pi-cli-gateway provider: pi-cli subprovider: azure base_url: ${{ OPENAI_ENDPOINT }} @@ -223,7 +223,7 @@ targets: ```yaml targets: - - name: vscode_dev + - label: vscode_dev provider: vscode grader_target: azure-base ``` @@ -237,7 +237,7 @@ Using a custom executable path: ```yaml targets: - - name: vscode_dev + - label: vscode_dev provider: vscode executable: ${{ VSCODE_CMD }} grader_target: azure-base @@ -247,7 +247,7 @@ targets: ```yaml targets: - - name: vscode_insiders + - label: vscode_insiders provider: vscode-insiders grader_target: azure-base ``` @@ -260,7 +260,7 @@ Evaluate any command-line agent: ```yaml targets: - - name: local_agent + - label: local_agent provider: cli command: 'python agent.py --prompt-file {PROMPT_FILE} --output {OUTPUT_FILE}' grader_target: azure-base @@ -278,7 +278,7 @@ For testing the evaluation harness without calling real providers: ```yaml targets: - - name: mock_target + - label: mock_target provider: mock ``` @@ -299,7 +299,7 @@ The VS Code provider uses a **subagent file-messaging architecture**. AgentV pro ```yaml targets: - - name: copilot + - label: copilot provider: copilot-cli executable: ${{ COPILOT_EXE }} grader_target: azure-base diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index 50de9994d..0f5f9014b 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -11,22 +11,30 @@ Targets define which agent or LLM provider to evaluate. They are configured in ` ```yaml targets: - - name: azure-base + - label: azure-base provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} + config: + endpoint: ${{ AZURE_OPENAI_ENDPOINT }} + api_key: ${{ AZURE_OPENAI_API_KEY }} + model: ${{ AZURE_DEPLOYMENT_NAME }} - - name: vscode_dev + - label: vscode_dev provider: vscode grader_target: azure-base - - name: local_agent + - label: local_agent provider: cli - command: 'python agent.py --prompt {PROMPT}' + config: + command: 'python agent.py --prompt {PROMPT}' grader_target: azure-base ``` +Use `label` for AgentV target references and comparison names. Use `id` only when you need to carry a promptfoo provider/backend identifier. The +`provider` field selects the backend kind. Provider-specific settings belong in +`config`; AgentV target extensions such as `grader_target`, `use_target`, +`fallback_targets`, `workers`, and `provider_batching` remain top-level fields on +the target object. + ## Environment Variables Use `${{ VARIABLE_NAME }}` syntax to reference values from your environment. AgentV reads @@ -35,10 +43,11 @@ eval directory hierarchy when present: ```yaml targets: - - name: my_target + - label: my_target provider: anthropic - api_key: ${{ ANTHROPIC_API_KEY }} - model: ${{ ANTHROPIC_MODEL }} + config: + api_key: ${{ ANTHROPIC_API_KEY }} + model: ${{ ANTHROPIC_MODEL }} ``` This keeps secrets out of version-controlled files and avoids requiring a CI step that rewrites @@ -80,7 +89,7 @@ Agent targets that need LLM-based evaluation specify a `grader_target` — the L ```yaml targets: - - name: codex_target + - label: codex_target provider: codex grader_target: azure-base # LLM used for grading ``` diff --git a/apps/web/src/content/docs/docs/targets/custom-providers.mdx b/apps/web/src/content/docs/docs/targets/custom-providers.mdx index 737ba57b0..1310747d8 100644 --- a/apps/web/src/content/docs/docs/targets/custom-providers.mdx +++ b/apps/web/src/content/docs/docs/targets/custom-providers.mdx @@ -177,7 +177,7 @@ Then reference it in your targets file: ```yaml # .agentv/targets.yaml targets: - - name: my_http_agent + - label: my_http_agent provider: http-agent grader_target: azure-base ``` @@ -208,7 +208,7 @@ Use `provider: cli` when: ```yaml targets: - - name: python_agent + - label: python_agent provider: cli command: 'python agent.py --prompt-file {PROMPT_FILE} --output {OUTPUT_FILE}' ``` diff --git a/apps/web/src/content/docs/docs/targets/llm-providers.mdx b/apps/web/src/content/docs/docs/targets/llm-providers.mdx index 4e7a5ec71..9071c8e5d 100644 --- a/apps/web/src/content/docs/docs/targets/llm-providers.mdx +++ b/apps/web/src/content/docs/docs/targets/llm-providers.mdx @@ -11,7 +11,7 @@ LLM provider targets call language model APIs directly. These are used both as e ```yaml targets: - - name: openai-target + - label: openai-target provider: openai api_key: ${{ OPENAI_API_KEY }} model: gpt-4o @@ -38,7 +38,7 @@ Most users should leave this unset. The default `chat` format is universally sup ```yaml # OpenAI-compatible endpoint (default chat format works) targets: - - name: github-models + - label: github-models provider: openai api_format: chat base_url: https://models.github.ai/inference/v1 @@ -46,7 +46,7 @@ targets: model: ${{ GH_MODELS_MODEL }} # Opt in to Responses API for api.openai.com - - name: openai-responses + - label: openai-responses provider: openai api_format: responses api_key: ${{ OPENAI_API_KEY }} @@ -57,7 +57,7 @@ targets: ```yaml targets: - - name: azure-base + - label: azure-base provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} api_key: ${{ AZURE_OPENAI_API_KEY }} @@ -79,7 +79,7 @@ If your Azure deployment only exposes `/chat/completions` (older deployments, ce ```yaml targets: - - name: azure-chat + - label: azure-chat provider: openai base_url: https://.openai.azure.com/openai/deployments/ api_key: ${{ AZURE_OPENAI_API_KEY }} @@ -93,7 +93,7 @@ The `api_format` field was previously available on `provider: azure` but has bee ```yaml targets: - - name: claude_target + - label: claude_target provider: anthropic api_key: ${{ ANTHROPIC_API_KEY }} model: claude-sonnet-4-20250514 @@ -108,7 +108,7 @@ targets: ```yaml targets: - - name: gemini_target + - label: gemini_target provider: gemini api_key: ${{ GEMINI_API_KEY }} model: gemini-2.0-flash diff --git a/apps/web/src/content/docs/docs/targets/retry.mdx b/apps/web/src/content/docs/docs/targets/retry.mdx index b01769520..05fb3638a 100644 --- a/apps/web/src/content/docs/docs/targets/retry.mdx +++ b/apps/web/src/content/docs/docs/targets/retry.mdx @@ -13,7 +13,7 @@ Add retry fields to any target: ```yaml targets: - - name: azure-base + - label: azure-base provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} api_key: ${{ AZURE_OPENAI_API_KEY }} diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index c3629288f..ab45dbe00 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -11,6 +11,8 @@ import { } from '../../config-overlays.js'; import { getAgentvConfigDir } from '../../paths.js'; import { interpolateEnv } from '../interpolation.js'; +import { normalizeTargetDefinition } from '../providers/targets.js'; +import type { TargetDefinition } from '../providers/types.js'; import type { EvalTargetRef, FailOnError, @@ -335,10 +337,15 @@ export function extractTargetFromSuite(suite: JsonObject): string | undefined { return targetValue.trim(); } if (isJsonObject(targetValue)) { - const name = targetValue.name; + const label = targetValue.label; const extendsTarget = targetValue.extends; - if (typeof name === 'string' && name.trim().length > 0) { - return name.trim(); + if (typeof targetValue.name === 'string' && targetValue.name.trim().length > 0) { + throw new Error( + "Top-level target object field 'name' has been removed. Use 'label' instead.", + ); + } + if (typeof label === 'string' && label.trim().length > 0) { + return label.trim(); } if (typeof extendsTarget === 'string' && extendsTarget.trim().length > 0) { return extendsTarget.trim(); @@ -348,19 +355,22 @@ export function extractTargetFromSuite(suite: JsonObject): string | undefined { return undefined; } -/** - * Matrix target refs are not authored in eval YAML. The CLI keeps this helper - * as an internal no-op for call sites that still handle runtime-only matrices. - */ export function extractTargetRefsFromSuite( suite: JsonObject, ): readonly EvalTargetRef[] | undefined { rejectAuthoredRuntimeContainers(suite); - return undefined; + const rawTargets = suite.targets; + if (rawTargets === undefined) { + return undefined; + } + + const entries = Array.isArray(rawTargets) ? rawTargets : [rawTargets]; + const refs = entries.map((entry, index) => parseEvalTargetRef(entry, `targets[${index}]`)); + return refs.length > 0 ? refs : undefined; } /** - * Extract runtime-only matrix target names from parsed eval suite. + * Extract live matrix target names from parsed eval suite. */ export function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined { const refs = extractTargetRefsFromSuite(suite); @@ -369,6 +379,48 @@ export function extractTargetsFromSuite(suite: JsonObject): readonly string[] | return names.length > 0 ? names : undefined; } +function parseEvalTargetRef(raw: unknown, location: string): EvalTargetRef { + if (typeof raw === 'string') { + const name = raw.trim(); + if (name.length === 0) { + throw new Error(`Invalid ${location}: target reference must be non-empty.`); + } + return { name }; + } + + if (!isJsonObject(raw)) { + throw new Error(`Invalid ${location}: use a target label string or target object.`); + } + if (typeof raw.name === 'string' && raw.name.trim().length > 0) { + throw new Error( + `Invalid ${location}: target field 'name' has been removed. Use 'label' instead.`, + ); + } + + const rawLabel = raw.label; + const label = + typeof rawLabel === 'string' && rawLabel.trim().length > 0 ? rawLabel.trim() : undefined; + if (!label) { + throw new Error(`Invalid ${location}: target object requires a 'label' field.`); + } + + const hooks = parseTargetHooks(raw.hooks); + const definition = normalizeTargetDefinition( + Object.fromEntries(Object.entries(raw).filter(([key]) => key !== 'hooks')), + ) as TargetDefinition; + const useTarget = + typeof raw.use_target === 'string' && raw.use_target.trim().length > 0 + ? raw.use_target.trim() + : undefined; + + return { + name: label, + ...(useTarget !== undefined ? { use_target: useTarget } : {}), + definition, + ...(hooks !== undefined ? { hooks } : {}), + }; +} + /** * Parse a single workspace hook config from a raw object. * Accepts both string shorthand (shell command) and object form. diff --git a/packages/core/src/evaluation/providers/targets-file.ts b/packages/core/src/evaluation/providers/targets-file.ts index f17729db7..c64a93266 100644 --- a/packages/core/src/evaluation/providers/targets-file.ts +++ b/packages/core/src/evaluation/providers/targets-file.ts @@ -3,6 +3,7 @@ import { access, readFile } from 'node:fs/promises'; import path from 'node:path'; import { parseYamlValue } from '../yaml-loader.js'; +import { normalizeTargetDefinition } from './targets.js'; import { TARGETS_SCHEMA_V2 } from './types.js'; import type { TargetDefinition } from './types.js'; @@ -23,25 +24,29 @@ function assertTargetDefinition(value: unknown, index: number, filePath: string) throw new Error(`targets.yaml entry at index ${index} in ${filePath} must be an object`); } - const name = value.name; + const label = value.label; const provider = value.provider; - if (typeof name !== 'string' || name.trim().length === 0) { + if (typeof label !== 'string' || label.trim().length === 0) { throw new Error( - `targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`, + `targets.yaml entry at index ${index} in ${filePath} is missing a valid 'label'`, + ); + } + + if (typeof value.name === 'string' && value.name.trim().length > 0) { + throw new Error( + `targets.yaml entry '${label}' in ${filePath} uses removed field 'name'. Use 'label' for the AgentV target name.`, ); } const hasUseTarget = typeof value.use_target === 'string' && value.use_target.trim().length > 0; if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { throw new Error( - `targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`, + `targets.yaml entry '${label}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`, ); } - // Pass through all properties from the YAML to support the flattened schema - // This includes all provider-specific settings at the top level - return value as unknown as TargetDefinition; + return normalizeTargetDefinition(value); } async function fileExists(filePath: string): Promise { diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 6a6bfdd91..d997940fe 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -654,6 +654,51 @@ const DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = new Map([ ['timeoutSeconds', 'timeout_seconds'], ]); +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +export interface NormalizeTargetDefinitionOptions { + readonly defaultName?: string; +} + +/** + * Converts the authored promptfoo-shaped target object into AgentV's internal + * target definition. Authored YAML uses `label` for AgentV's target/comparison + * name and `id` for the promptfoo provider/backend identifier. The runtime + * continues to use `name` as its stable resolver and artifact key. + */ +export function normalizeTargetDefinition( + definition: unknown, + options: NormalizeTargetDefinitionOptions = {}, +): TargetDefinition { + if (!isRecord(definition)) { + throw new Error('Target definition must be an object'); + } + + const rawId = definition.id; + const rawLabel = definition.label; + const rawName = definition.name; + const id = typeof rawId === 'string' && rawId.trim().length > 0 ? rawId.trim() : undefined; + const label = + typeof rawLabel === 'string' && rawLabel.trim().length > 0 ? rawLabel.trim() : undefined; + const legacyName = + typeof rawName === 'string' && rawName.trim().length > 0 ? rawName.trim() : undefined; + const name = label ?? legacyName ?? options.defaultName; + if (!name || name.trim().length === 0) { + throw new Error("Target definition is missing a valid 'label' field"); + } + + const config = isRecord(definition.config) ? definition.config : {}; + return { + ...config, + ...definition, + ...(id !== undefined ? { id } : {}), + label: label ?? name, + name, + } as unknown as TargetDefinition; +} + function collectDeprecatedCamelCaseWarnings( value: unknown, location: string, @@ -751,6 +796,7 @@ export type CliHealthcheck = Readonly; /** Base fields shared by all resolved targets. */ interface ResolvedTargetBase { readonly name: string; + readonly label?: string; readonly graderTarget?: string; readonly workers?: number; readonly providerBatching?: boolean; @@ -824,8 +870,11 @@ const SECRET_ENV_TEMPLATE_PATTERN = /^\s*\{\{\s*env\.([A-Za-z_][A-Za-z0-9_]*)\s* const BASE_TARGET_SCHEMA = z .object({ + id: z.string().optional(), name: z.string().min(1, 'target name is required'), + label: z.string().optional(), provider: z.string().optional(), + config: z.record(z.unknown()).optional(), use_target: z.string().optional(), grader_target: z.string().optional(), workers: z.number().int().min(1).optional(), @@ -962,10 +1011,11 @@ export function resolveTargetDefinition( options?: { readonly emitDeprecationWarnings?: boolean }, ): ResolvedTarget { void options; - assertNoRemovedTargetFields(definition); - assertNoDeprecatedCamelCaseTargetFields(definition); + const normalizedDefinition = normalizeTargetDefinition(definition); + assertNoRemovedTargetFields(normalizedDefinition); + assertNoDeprecatedCamelCaseTargetFields(normalizedDefinition); - const parsed = BASE_TARGET_SCHEMA.parse(definition); + const parsed = BASE_TARGET_SCHEMA.parse(normalizedDefinition); if (!parsed.provider) { throw new Error( `${parsed.name}: 'provider' is required (targets with use_target must be resolved before calling resolveTargetDefinition)`, @@ -984,6 +1034,7 @@ export function resolveTargetDefinition( const fallbackTargets = parsed.fallback_targets; const base = { name: parsed.name, + label: parsed.label, graderTarget: parsed.grader_target, workers: parsed.workers, providerBatching, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 7a7947be5..10b10c5a4 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -342,7 +342,17 @@ export interface Provider { export type EnvLookup = Readonly>; export interface TargetDefinition { + /** Internal canonical target identity. Authored YAML uses `label`; loaders map label -> name. */ readonly name: string; + /** Promptfoo provider/backend identifier, not the AgentV target reference key. */ + readonly id?: string | undefined; + /** Authored promptfoo-shaped label; used as AgentV's authored target/comparison name. */ + readonly label?: string | undefined; + /** Promptfoo-shaped provider options bag. Provider settings are flattened at the boundary. */ + readonly config?: unknown | undefined; + readonly prompts?: unknown | undefined; + readonly transform?: unknown | undefined; + readonly delay?: number | unknown | undefined; readonly provider?: ProviderKind | string; // Delegation: resolve this target as another named target. // Supports ${{ ENV_VAR }} syntax (e.g., use_target: ${{ AGENT_TARGET }}). diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index bdb4b8aac..953912a19 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -296,13 +296,15 @@ export type TargetHooksConfig = { * Extended target reference from eval file. * Allows eval files to define per-target hooks and delegation alongside target names. * - * String targets are shorthand for `{ name: "target-name" }` (no hooks). + * String targets are shorthand for `{ label: "target-name" }` (no hooks). */ export type EvalTargetRef = { - /** Target name (must match a target in targets.yaml or be defined inline with use_target) */ + /** Internal target identity (authored as `label` in object form). */ readonly name: string; /** Delegate to another named target (same as use_target in targets.yaml) */ readonly use_target?: string; + /** Inline target definition normalized from a promptfoo-shaped target object. */ + readonly definition?: import('./providers/types.js').TargetDefinition; /** Per-target hooks for workspace customization */ readonly hooks?: TargetHooksConfig; }; diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index a289a3f2a..9983cac8d 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -404,7 +404,8 @@ const TargetHooksSchema = z /** Eval target reference: string shorthand or object with hooks */ const EvalTargetRefSchema = z .object({ - name: z.string().min(1), + label: z.string().min(1), + id: z.string().min(1).optional(), use_target: z.string().optional(), hooks: TargetHooksSchema.optional(), }) @@ -415,7 +416,6 @@ const EvalLocalTargetSchema = z id: z.string().min(1).optional(), label: z.string().min(1).optional(), extends: z.string().min(1).optional(), - name: z.string().min(1).optional(), provider: z.string().min(1).optional(), model: z.string().min(1).optional(), config: JsonRecordSchema.optional(), @@ -670,6 +670,7 @@ export const EvalFileSchema: z.ZodType = z // Target target: z.union([z.string().min(1), EvalLocalTargetSchema]).optional(), targets: EvalTargetsSchema.optional(), + providers: z.never().optional(), model: z.never().optional(), // Run/result grouping label and flat run controls experiment: z.string().min(1).optional(), diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 561231864..0edc477ef 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -166,6 +166,10 @@ const REMOVED_TOP_LEVEL_FIELDS = new Map([ 'execution', "Top-level 'execution' is not part of eval YAML. Put target and run controls at the top level, authored concurrency under evaluate_options.max_concurrency, and operational defaults in CLI flags or project config.", ], + [ + 'providers', + "Top-level 'providers' is not a runtime alias in AgentV eval YAML. Use 'targets' for systems under test; provider names backend kind inside each target.", + ], ['runs', "Top-level 'runs' has been removed. Use repeat.count instead."], ['early_exit', "Top-level 'early_exit' has been removed. Use repeat.early_exit instead."], ]); diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index bd847fde3..f8a39cf1d 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -6,6 +6,7 @@ import { CLI_PLACEHOLDERS, COMMON_TARGET_SETTINGS, findDeprecatedCamelCaseTargetWarnings, + normalizeTargetDefinition, } from '../providers/targets.js'; import { KNOWN_PROVIDERS } from '../providers/types.js'; import { parseYamlValue } from '../yaml-loader.js'; @@ -266,8 +267,15 @@ function validateUnknownSettings( // Known base target fields that aren't settings const baseFields = new Set([ + 'id', 'name', + 'label', 'provider', + 'config', + 'prompts', + 'transform', + 'delay', + 'env', 'grader_target', 'judge_target', 'workers', @@ -502,6 +510,16 @@ export async function validateTargetsFile(filePath: string): Promise 0) { errors.push({ severity: 'error', filePath: absolutePath, location: `${location}.name`, - message: "Missing or invalid 'name' field (must be a non-empty string)", + message: + "The target 'name' field has been removed. Use 'label' for the AgentV target name.", }); } // Required field: provider - const provider = target.provider; + const effectiveTarget = normalizedTarget ?? target; + const provider = effectiveTarget.provider; const rawTarget = rawTargets[i]; const rawUseTarget = isObject(rawTarget) ? rawTarget.use_target : undefined; - const hasUseTarget = isNonEmptyString(target.use_target) || isNonEmptyString(rawUseTarget); + const hasUseTarget = + isNonEmptyString(effectiveTarget.use_target) || isNonEmptyString(rawUseTarget); const providerValue = typeof provider === 'string' ? provider.trim().toLowerCase() : undefined; const isTemplated = typeof provider === 'string' && /^\$\{\{.+\}\}$/.test(provider.trim()); if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { @@ -589,11 +634,11 @@ export async function validateTargetsFile(filePath: string): Promise 0) { + throw new Error( + "Invalid top-level 'target': field 'name' has been removed. Use 'label' instead.", + ); + } const rawExtends = rawTarget.extends; const extendsTarget = typeof rawExtends === 'string' && rawExtends.trim().length > 0 ? rawExtends.trim() : undefined; - const rawName = rawTarget.name; + const rawLabel = rawTarget.label; const name = - typeof rawName === 'string' && rawName.trim().length > 0 - ? rawName.trim() + typeof rawLabel === 'string' && rawLabel.trim().length > 0 + ? rawLabel.trim() : (extendsTarget ?? 'eval-local-target'); const hooks = parseTargetHooks(rawTarget.hooks); const definitionEntries = Object.entries(rawTarget).filter( ([key]) => key !== 'extends' && key !== 'hooks', ); - const definition = { - ...Object.fromEntries(definitionEntries), - name, - } as unknown as TargetDefinition; + const definition = normalizeTargetDefinition(Object.fromEntries(definitionEntries), { + defaultName: name, + }); return { name, diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index 4966ea4df..c6a0ae034 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -566,11 +566,23 @@ describe('extractTargetFromSuite', () => { expect(extractTargetFromSuite(suite)).toBe('codex-gpt5'); }); - it('extracts target object name from name or extends', () => { + it('extracts target object identity from label or extends', () => { const suite: JsonObject = { - target: { extends: 'codex-gpt5', model: 'gpt-5.1' }, + target: { + label: 'codex-local', + id: 'codex:gpt-5.1', + extends: 'codex-gpt5', + config: { model: 'gpt-5.1' }, + }, }; - expect(extractTargetFromSuite(suite)).toBe('codex-gpt5'); + expect(extractTargetFromSuite(suite)).toBe('codex-local'); + }); + + it('rejects target object name in favor of label', () => { + const suite: JsonObject = { + target: { name: 'legacy-target', provider: 'mock' }, + }; + expect(() => extractTargetFromSuite(suite)).toThrow(/Use 'label'/); }); it('returns undefined when no target specified', () => { @@ -585,12 +597,43 @@ describe('extractTargetFromSuite', () => { }); describe('extractTargetsFromSuite and extractTargetRefsFromSuite', () => { - it('return undefined for authored eval YAML', () => { + it('return undefined when no targets are authored', () => { const suite: JsonObject = { tests: [] }; expect(extractTargetsFromSuite(suite)).toBeUndefined(); expect(extractTargetRefsFromSuite(suite)).toBeUndefined(); }); + it('extracts live targets strings and promptfoo-shaped target objects', () => { + const suite: JsonObject = { + targets: [ + 'registry-agent', + { + label: 'inline-agent', + id: 'mock', + provider: 'mock', + config: { response: 'ok' }, + fallback_targets: ['registry-agent'], + }, + ], + }; + + expect(extractTargetsFromSuite(suite)).toEqual(['registry-agent', 'inline-agent']); + expect(extractTargetRefsFromSuite(suite)).toEqual([ + { name: 'registry-agent' }, + { + name: 'inline-agent', + definition: expect.objectContaining({ + id: 'mock', + name: 'inline-agent', + label: 'inline-agent', + provider: 'mock', + response: 'ok', + fallback_targets: ['registry-agent'], + }), + }, + ]); + }); + it('reject top-level target arrays through execution', () => { const suite: JsonObject = { execution: { targets: ['copilot', 'claude'] } }; expect(() => extractTargetsFromSuite(suite)).toThrow(/Top-level 'execution'/); diff --git a/packages/core/test/evaluation/providers/targets-file.test.ts b/packages/core/test/evaluation/providers/targets-file.test.ts new file mode 100644 index 000000000..502d67236 --- /dev/null +++ b/packages/core/test/evaluation/providers/targets-file.test.ts @@ -0,0 +1,60 @@ +import { afterEach, describe, expect, it } from 'bun:test'; +import { mkdir, rm, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { readTargetDefinitions } from '../../../src/evaluation/providers/targets-file.js'; + +describe('readTargetDefinitions', () => { + let tempDir: string | undefined; + + afterEach(async () => { + if (tempDir) { + await rm(tempDir, { recursive: true, force: true }); + tempDir = undefined; + } + }); + + async function writeTargetsYaml(content: string): Promise { + tempDir = path.join(os.tmpdir(), `agentv-targets-file-test-${Date.now()}`); + await mkdir(tempDir, { recursive: true }); + const filePath = path.join(tempDir, 'targets.yaml'); + await writeFile(filePath, content); + return filePath; + } + + it('normalizes promptfoo-shaped label, backend id, and config fields', async () => { + const filePath = await writeTargetsYaml(`targets: + - label: candidate-agent + id: openai:gpt-5-codex + provider: codex + config: + model: gpt-5-codex + reasoning_effort: low + grader_target: grader +`); + + const definitions = await readTargetDefinitions(filePath); + + expect(definitions).toEqual([ + expect.objectContaining({ + id: 'openai:gpt-5-codex', + name: 'candidate-agent', + label: 'candidate-agent', + provider: 'codex', + model: 'gpt-5-codex', + reasoning_effort: 'low', + grader_target: 'grader', + }), + ]); + }); + + it('rejects authored name in favor of label', async () => { + const filePath = await writeTargetsYaml(`targets: + - name: legacy-agent + provider: mock +`); + + await expect(readTargetDefinitions(filePath)).rejects.toThrow(/missing a valid 'label'/); + }); +}); diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index cdd5c4c20..cec0262bb 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -87,6 +87,47 @@ describe('resolveTargetDefinition', () => { piGetModelMock.mockClear(); }); + it('uses promptfoo-shaped label as AgentV target identity and id as backend metadata', () => { + const target = resolveTargetDefinition( + { + label: 'primary-sut', + id: 'mock', + provider: 'mock', + config: { + response: 'ok', + }, + } as never, + {}, + ); + + expect(target.name).toBe('primary-sut'); + expect(target.label).toBe('primary-sut'); + expect(target.kind).toBe('mock'); + expect(target.config.response).toBe('ok'); + }); + + it('treats provider as backend kind while id remains a provider locator', () => { + const target = resolveTargetDefinition( + { + label: 'candidate-agent', + id: 'openai:gpt-5-mini', + provider: 'openai', + config: { + api_key: '${{ OPENAI_API_KEY }}', + model: '${{ OPENAI_MODEL }}', + }, + } as never, + { OPENAI_API_KEY: 'secret', OPENAI_MODEL: 'gpt-5-mini' }, + ); + + expect(target.name).toBe('candidate-agent'); + expect(target.kind).toBe('openai'); + if (target.kind !== 'openai') { + throw new Error('expected openai target'); + } + expect(target.config.model).toBe('gpt-5-mini'); + }); + it("throws when settings don't use ${{ }} syntax", () => { const env = { AZURE_OPENAI_ENDPOINT: 'https://example.openai.azure.com', diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index 9b294058b..97236de02 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -162,7 +162,7 @@ tags: prompts: - raw: "Review {{ vars.diff }}" targets: - - id: local-agent + - label: local-agent provider: codex default_test: vars: @@ -223,7 +223,7 @@ extensions: expect(result.errors).toHaveLength(0); }); - it('warns rather than accepting top-level providers as a live alias for targets', async () => { + it('rejects top-level providers as a live alias for targets', async () => { const filePath = path.join(tempDir, 'top-level-providers.yaml'); await writeFile( filePath, @@ -242,13 +242,13 @@ tests: const result = await validateEvalFile(filePath); - expect(result.valid).toBe(true); + expect(result.valid).toBe(false); expect( result.errors.some( (error) => - error.severity === 'warning' && + error.severity === 'error' && error.location === 'providers' && - error.message.includes("Unknown field 'providers'"), + error.message.includes("Top-level 'providers' is not a runtime alias"), ), ).toBe(true); }); diff --git a/packages/core/test/evaluation/validation/targets-validator.test.ts b/packages/core/test/evaluation/validation/targets-validator.test.ts index 1342292b6..e0f768370 100644 --- a/packages/core/test/evaluation/validation/targets-validator.test.ts +++ b/packages/core/test/evaluation/validation/targets-validator.test.ts @@ -22,7 +22,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: openrouter-target + - label: openrouter-target provider: openrouter api_key: \${{ OPENROUTER_API_KEY }} model: openai/gpt-5-mini @@ -40,32 +40,123 @@ describe('validateTargetsFile', () => { ).toBe(false); }); + it('accepts promptfoo-shaped id, label, and config fields', async () => { + const filePath = path.join(tempDir, 'promptfoo-shaped-target.yaml'); + await writeFile( + filePath, + `targets: + - label: candidate-agent + id: openai:gpt-5-codex + provider: codex + config: + model: \${{ CODEX_MODEL }} + reasoning_effort: low + base_url: \${{ OPENAI_BASE_URL }} + api_key: \${{ OPENAI_API_KEY }} + api_format: responses + grader_target: grader + fallback_targets: [backup-agent] + - label: grader + provider: openai + config: + api_key: \${{ OPENAI_API_KEY }} + model: gpt-5-mini + - label: backup-agent + provider: mock + config: + response: backup +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(true); + expect(result.errors.filter((error) => error.severity === 'warning')).toEqual([]); + }); + + it('rejects authored target name in favor of label', async () => { + const filePath = path.join(tempDir, 'legacy-name-target.yaml'); + await writeFile( + filePath, + `targets: + - name: legacy-agent + provider: mock +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[0].label' && + error.message.includes("Missing or invalid 'label' field"), + ), + ).toBe(true); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[0].name' && + error.message.includes("Use 'label'"), + ), + ).toBe(true); + }); + + it('rejects top-level providers as a targets.yaml runtime alias', async () => { + const filePath = path.join(tempDir, 'top-level-providers.yaml'); + await writeFile( + filePath, + `providers: + - label: candidate-agent + provider: mock +targets: + - label: candidate-agent + provider: mock +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'providers' && + error.message.includes("Top-level 'providers' is not a runtime alias"), + ), + ).toBe(true); + }); + it('warns on removed built-in provider aliases', async () => { const filePath = path.join(tempDir, 'removed-provider-aliases.yaml'); await writeFile( filePath, `targets: - - name: azure-alias + - label: azure-alias provider: azure-openai - - name: google-alias + - label: google-alias provider: google - - name: google-gemini-alias + - label: google-gemini-alias provider: google-gemini - - name: codex-cli-alias + - label: codex-cli-alias provider: codex-cli - - name: copilot-alias + - label: copilot-alias provider: copilot - - name: copilot-sdk-alias + - label: copilot-sdk-alias provider: copilot_sdk - - name: pi-alias + - label: pi-alias provider: pi - - name: claude-code-alias + - label: claude-code-alias provider: claude-code - - name: cc-mirror-alias + - label: cc-mirror-alias provider: cc-mirror - - name: bedrock-future + - label: bedrock-future provider: bedrock - - name: vertex-future + - label: vertex-future provider: vertex `, ); @@ -101,13 +192,13 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: codex-target + - label: codex-target provider: codex timeoutSeconds: 30 logDir: ./logs systemPrompt: Be precise. modelReasoningEffort: low - - name: cli-target + - label: cli-target provider: cli command: echo {PROMPT} healthcheck: @@ -166,7 +257,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: codex-target + - label: codex-target provider: codex model: \${{ CODEX_MODEL }} reasoning_effort: \${{ CODEX_REASONING_EFFORT }} @@ -183,7 +274,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: copilot-sdk-custom-provider + - label: copilot-sdk-custom-provider provider: copilot-sdk model: gpt-5 subprovider: openai @@ -192,7 +283,7 @@ describe('validateTargetsFile', () => { api_format: responses model_id: gpt-5 wire_model: \${{ OPENAI_MODEL }} - - name: copilot-cli-custom-provider + - label: copilot-cli-custom-provider provider: copilot-cli subprovider: openai base_url: \${{ OPENAI_ENDPOINT }} @@ -212,7 +303,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: codex-local-openai + - label: codex-local-openai provider: codex model: \${{ CODEX_MODEL }} reasoning_effort: medium @@ -236,19 +327,19 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: copilot-sdk-custom + - label: copilot-sdk-custom provider: copilot-sdk custom_provider: type: openai base_url: \${{ OPENAI_ENDPOINT }} api_key: \${{ OPENAI_API_KEY }} - - name: copilot-sdk-byok + - label: copilot-sdk-byok provider: copilot-sdk byok: type: openai base_url: \${{ OPENAI_ENDPOINT }} api_key: \${{ OPENAI_API_KEY }} - - name: copilot-cli-custom + - label: copilot-cli-custom provider: copilot-cli custom_provider: type: openai @@ -278,11 +369,11 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: default + - label: default use_target: \${{ AGENT_TARGET }} - - name: grader + - label: grader use_target: \${{ GRADER_TARGET }} - - name: codex-agent + - label: codex-agent provider: codex grader_target: grader `, @@ -323,11 +414,11 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: codex-agent + - label: codex-agent provider: codex model: gpt-5 judge_target: grader - - name: grader + - label: grader provider: openai model: gpt-5-mini `, @@ -351,10 +442,10 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: copilot-agent + - label: copilot-agent provider: copilot-cli log_format: json - - name: claude-agent + - label: claude-agent provider: claude log_output_format: summary `, @@ -386,7 +477,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: azure-responses + - label: azure-responses provider: azure endpoint: \${{ AZURE_OPENAI_ENDPOINT }} api_key: \${{ AZURE_OPENAI_API_KEY }} @@ -412,7 +503,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: replay-execution-trace + - label: replay-execution-trace provider: replay execution_traces: ./fixtures/execution-traces.jsonl source_target: live-agent @@ -432,7 +523,7 @@ describe('validateTargetsFile', () => { await writeFile( filePath, `targets: - - name: replay-ambiguous + - label: replay-ambiguous provider: replay fixtures: ./fixtures/target-output.jsonl execution_traces: ./fixtures/execution-traces.jsonl diff --git a/packages/sdk/src/eval.ts b/packages/sdk/src/eval.ts index 970a5eab4..c31e7866c 100644 --- a/packages/sdk/src/eval.ts +++ b/packages/sdk/src/eval.ts @@ -125,16 +125,23 @@ export interface EvalWorkspace { } export interface EvalTargetRef { - readonly name: string; + readonly label: string; + readonly id?: string; readonly useTarget?: string; readonly hooks?: EvalWorkspaceHooks; } export interface EvalTargetConfig { readonly extends?: string; - readonly name?: string; + readonly id?: string; + readonly label?: string; readonly provider?: string; readonly model?: string; + readonly config?: Readonly>; + readonly prompts?: unknown; + readonly transform?: unknown; + readonly delay?: number; + readonly env?: Readonly>; readonly reasoningEffort?: string; readonly hooks?: EvalWorkspaceHooks; readonly [key: string]: unknown; diff --git a/packages/sdk/test/eval-authoring.test.ts b/packages/sdk/test/eval-authoring.test.ts index 568cc5ad2..e71c4fff9 100644 --- a/packages/sdk/test/eval-authoring.test.ts +++ b/packages/sdk/test/eval-authoring.test.ts @@ -9,9 +9,14 @@ describe('YAML-aligned eval authoring helpers', () => { inputFiles: ['fixtures/shared-system.md'], experiment: 'sdk-yaml-run', target: { + label: 'sdk-codex', + id: 'codex:gpt-5-codex', extends: 'mock-target', - model: 'gpt-5-codex', - reasoningEffort: 'high', + provider: 'codex', + config: { + model: 'gpt-5-codex', + reasoningEffort: 'high', + }, hooks: { beforeAll: { command: ['bun', 'run', 'scripts/setup.ts'], @@ -93,9 +98,14 @@ describe('YAML-aligned eval authoring helpers', () => { input_files: ['fixtures/shared-system.md'], experiment: 'sdk-yaml-run', target: { + label: 'sdk-codex', + id: 'codex:gpt-5-codex', extends: 'mock-target', - model: 'gpt-5-codex', - reasoning_effort: 'high', + provider: 'codex', + config: { + model: 'gpt-5-codex', + reasoning_effort: 'high', + }, hooks: { before_all: { command: ['bun', 'run', 'scripts/setup.ts'], diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md index d27fdc5db..7449431d2 100644 --- a/skills-data/agentv-eval-writer/SKILL.md +++ b/skills-data/agentv-eval-writer/SKILL.md @@ -631,7 +631,7 @@ agentv compare \ agentv validate ``` -**Replay targets:** Add `provider: replay`, `fixtures: `, and `source_target: ` in `.agentv/targets.yaml`. Optional `suite`, `eval_path`, and `variant` tighten lookup. The eval YAML and graders stay unchanged; replay only substitutes recorded target output, and graders run fresh. +**Replay targets:** Add `provider: replay`, `fixtures: `, and `source_target: ` in `.agentv/targets.yaml`. Optional `suite`, `eval_path`, and `variant` tighten lookup. The eval YAML and graders stay unchanged; replay only substitutes recorded target output, and graders run fresh. ## TypeScript SDK Helpers diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index d044864f7..1420fda94 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -914,10 +914,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -1244,10 +1240,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -1574,10 +1566,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -5653,10 +5641,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -5983,10 +5967,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -6313,10 +6293,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -10368,10 +10344,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -10698,10 +10670,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -11028,10 +10996,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -11339,6 +11303,9 @@ } ] }, + "providers": { + "not": {} + }, "model": { "not": {} }, @@ -11513,10 +11480,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -11843,10 +11806,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -12173,10 +12132,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -12819,10 +12774,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -13149,10 +13100,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -13479,10 +13426,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -14060,10 +14003,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -14390,10 +14329,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 @@ -14720,10 +14655,6 @@ "type": "string", "minLength": 1 }, - "name": { - "type": "string", - "minLength": 1 - }, "provider": { "type": "string", "minLength": 1 From 0b4d23f19ff1179c410cc459d45c43ed95e5fd4a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 12:25:01 +0200 Subject: [PATCH 2/3] test(cli): update prepared target fixtures for labels --- apps/cli/test/commands/grade/grade-prepared.test.ts | 2 +- apps/cli/test/commands/prepare/prepare.test.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cli/test/commands/grade/grade-prepared.test.ts b/apps/cli/test/commands/grade/grade-prepared.test.ts index f0c0abc02..4b8a174fa 100644 --- a/apps/cli/test/commands/grade/grade-prepared.test.ts +++ b/apps/cli/test/commands/grade/grade-prepared.test.ts @@ -83,7 +83,7 @@ console.log(JSON.stringify({ path.join(root, '.agentv', 'targets.yaml'), ` targets: - - name: codex + - label: codex provider: cli command: bun ./scripts/target.ts `, diff --git a/apps/cli/test/commands/prepare/prepare.test.ts b/apps/cli/test/commands/prepare/prepare.test.ts index 4e8a9fe65..e27432f34 100644 --- a/apps/cli/test/commands/prepare/prepare.test.ts +++ b/apps/cli/test/commands/prepare/prepare.test.ts @@ -60,7 +60,7 @@ await Bun.write(\`\${payload.workspace_path}/\${step}.txt\`, \`\${payload.test_i path.join(root, '.agentv', 'targets.yaml'), ` targets: - - name: codex + - label: codex provider: cli command: bun ./scripts/target.ts `, From 60e8c7f31ccab7c8d4d304597c64a6869dbb364c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 16:18:21 +0200 Subject: [PATCH 3/3] fix(eval): close target contract review gaps --- apps/cli/src/commands/eval/commands/bundle.ts | 4 +- apps/cli/test/commands/eval/bundle.test.ts | 37 +++++++++++++++++++ packages/core/src/evaluation/yaml-parser.ts | 6 +++ .../evaluation/eval-inline-experiment.test.ts | 21 +++++++++++ .../interpolation-integration.test.ts | 2 +- 5 files changed, 68 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/bundle.ts b/apps/cli/src/commands/eval/commands/bundle.ts index 799a7e8e3..8b0a2d3dc 100644 --- a/apps/cli/src/commands/eval/commands/bundle.ts +++ b/apps/cli/src/commands/eval/commands/bundle.ts @@ -91,7 +91,9 @@ function definitionsWithEvalTargetRefs( const result = [...definitions]; for (const ref of targetRefs) { - if (ref.use_target && !result.some((definition) => definition.name === ref.name)) { + if (ref.definition && !result.some((definition) => definition.name === ref.name)) { + result.push(ref.definition); + } else if (ref.use_target && !result.some((definition) => definition.name === ref.name)) { result.push({ name: ref.name, use_target: ref.use_target } as TargetDefinition); } } diff --git a/apps/cli/test/commands/eval/bundle.test.ts b/apps/cli/test/commands/eval/bundle.test.ts index 8952e9de7..624ed736c 100644 --- a/apps/cli/test/commands/eval/bundle.test.ts +++ b/apps/cli/test/commands/eval/bundle.test.ts @@ -169,6 +169,43 @@ tests: ../data/cases.yaml await expectFileExists(path.join(bundleDir, 'run', 'index.jsonl')); }, 60_000); + it('preserves inline eval target object definitions in the bundled target graph', async () => { + const sourceDir = path.join(tempDir, 'inline-source'); + const bundleDir = path.join(tempDir, 'inline-bundle'); + await mkdir(path.join(sourceDir, '.agentv'), { recursive: true }); + await mkdir(path.join(sourceDir, 'evals'), { recursive: true }); + await writeFile(path.join(sourceDir, '.agentv', 'targets.yaml'), 'targets: []\n', 'utf8'); + await writeFile( + path.join(sourceDir, 'evals', 'inline.eval.yaml'), + `targets: + - label: candidate + provider: mock + response: '{"answer":"inline bundled response"}' +tests: + - id: inline-case + input: hello + assertions: + - type: contains + value: inline +`, + 'utf8', + ); + + const bundle = await runCli(sourceDir, [ + 'eval', + 'bundle', + 'evals/inline.eval.yaml', + '--out', + bundleDir, + ]); + + expect(bundle.exitCode).toBe(0); + const bundledTargets = await readFile(path.join(bundleDir, 'targets.yaml'), 'utf8'); + expect(bundledTargets).toContain('label: candidate'); + expect(bundledTargets).toContain('provider: mock'); + expect(bundledTargets).toContain('inline bundled response'); + }, 30_000); + it('reports unbundleable workspace references with their eval location', async () => { const sourceDir = path.join(tempDir, 'missing-source'); const bundleDir = path.join(tempDir, 'missing-bundle'); diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 64d9ca049..52348e1f7 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -185,6 +185,7 @@ type RawTestSuite = JsonObject & { /** @deprecated Use `tests` instead */ readonly evalcases?: JsonValue; readonly target?: JsonValue; + readonly providers?: JsonValue; readonly model?: JsonValue; readonly experiment?: JsonValue; readonly execution?: JsonValue; @@ -1612,6 +1613,11 @@ function readSuiteRuntimeBlock(suite: RawTestSuite, evalFilePath: string): JsonO `Invalid eval runtime config in ${evalFilePath}: top-level 'execution' is not part of eval YAML. Put target and run controls at the top level, authored concurrency under evaluate_options.max_concurrency, and operational defaults in CLI flags or project config.`, ); } + if (suite.providers !== undefined) { + throw new Error( + `Invalid eval runtime config in ${evalFilePath}: top-level 'providers' is not a runtime alias in AgentV eval YAML. Use 'targets' for systems under test; provider names backend kind inside each target.`, + ); + } if (suite.model !== undefined) { throw new Error( `Invalid eval runtime config in ${evalFilePath}: top-level 'model' is not part of eval YAML. Put model inside the target object.`, diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts index 8027d5f9c..52312c0a8 100644 --- a/packages/core/test/evaluation/eval-inline-experiment.test.ts +++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts @@ -226,6 +226,27 @@ describe('eval.yaml flat runtime controls and tests imports', () => { await expect(loadTestSuite(evalPath, tempDir)).rejects.toThrow(/top-level 'policy'/); }); + it('rejects top-level providers during runtime suite loading', async () => { + const evalPath = path.join(tempDir, 'top-level-providers.eval.yaml'); + await writeFile( + evalPath, + [ + 'providers:', + ' - label: legacy', + ' provider: mock', + 'tests:', + ' - id: one', + ' input: hello', + ' criteria: ok', + '', + ].join('\n'), + ); + + await expect(loadTestSuite(evalPath, tempDir)).rejects.toThrow( + /top-level 'providers' is not a runtime alias/, + ); + }); + it('rejects removed top-level runs and early_exit controls', async () => { const evalPath = path.join(tempDir, 'removed-repeat-controls.eval.yaml'); await writeFile( diff --git a/packages/core/test/evaluation/interpolation-integration.test.ts b/packages/core/test/evaluation/interpolation-integration.test.ts index fef7fefbc..031882ed9 100644 --- a/packages/core/test/evaluation/interpolation-integration.test.ts +++ b/packages/core/test/evaluation/interpolation-integration.test.ts @@ -164,7 +164,7 @@ describe('env interpolation in YAML loading', () => { evalFile, [ 'target:', - ' name: local-shell', + ' label: local-shell', ' provider: cli', ' command: "echo $RUNTIME ${RUNTIME} {{ env.AGENTV_TEST_PATH }}"', 'tests:',