From dbe07e76991e68c46985e63c8414e9822c52cf8c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 13:27:50 +0200 Subject: [PATCH 1/4] Add prompt instance expansion --- .../commands/eval/artifact-writer.test.ts | 32 +- .../docs/docs/evaluation/eval-files.mdx | 33 +- .../docs/docs/reference/result-artifacts.mdx | 11 +- .../src/evaluation/loaders/config-loader.ts | 64 +- packages/core/src/evaluation/orchestrator.ts | 42 +- packages/core/src/evaluation/run-artifacts.ts | 27 +- packages/core/src/evaluation/types.ts | 35 +- packages/core/src/evaluation/yaml-parser.ts | 272 ++++++- .../evaluation/eval-inline-experiment.test.ts | 107 +++ .../agentv-bench/references/eval-yaml-spec.md | 10 +- .../references/eval.schema.json | 702 ++++++++++++++++++ 11 files changed, 1308 insertions(+), 27 deletions(-) diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 8ab705f0e..15c342d1f 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1103,17 +1103,24 @@ describe('writeArtifactsFromResults', () => { }); it('writes repeat runs in AgentV case and run folders', async () => { + const prompt = { id: 'direct', label: 'Direct prompt', kind: 'string' as const }; const results = [ makeResult({ testId: 'repeat-case', + prompt, score: 1, trials: [ { attempt: 0, + sampleIndex: 0, + retryIndex: 0, score: 0.25, verdict: 'fail', result: makeResult({ testId: 'repeat-case', + prompt, + sampleIndex: 0, + retryIndex: 0, score: 0.25, output: 'first attempt', durationMs: 2000, @@ -1122,10 +1129,15 @@ describe('writeArtifactsFromResults', () => { }, { attempt: 1, + sampleIndex: 1, + retryIndex: 0, score: 1, verdict: 'pass', result: makeResult({ testId: 'repeat-case', + prompt, + sampleIndex: 1, + retryIndex: 0, score: 1, output: 'second attempt', durationMs: 4000, @@ -1159,9 +1171,25 @@ describe('writeArtifactsFromResults', () => { const [indexEntry] = await readIndexLines(paths.indexPath); const repeatRowDir = expectRowDir(indexEntry, 'repeat-case'); + expect(indexEntry?.prompt_id).toBe('direct'); + expect(indexEntry?.prompt_label).toBe('Direct prompt'); expect(indexEntry?.trials).toEqual([ - { attempt: 0, run_path: 'run-1', score: 0.25, verdict: 'fail' }, - { attempt: 1, run_path: 'run-2', score: 1, verdict: 'pass' }, + { + attempt: 0, + sample_index: 0, + retry_index: 0, + run_path: 'run-1', + score: 0.25, + verdict: 'fail', + }, + { + attempt: 1, + sample_index: 1, + retry_index: 0, + run_path: 'run-2', + score: 1, + verdict: 'pass', + }, ]); expect(indexEntry?.aggregation).toEqual({ strategy: 'confidence_interval', diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 9dc940b97..2b6e5d819 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -41,12 +41,15 @@ workspace: - path: ./support-app repo: acme/support-app commit: main -input: Answer using the refund policy in the workspace. +prompts: + - id: refund-policy + prompt: Answer using the refund policy in the workspace. Customer: {{ customer_question }} assertions: - Applies the refund policy correctly tests: - id: missing-receipt - input: Can this customer get a refund without a receipt? + vars: + customer_question: Can this customer get a refund without a receipt? ``` Raw cases are just case data: @@ -96,7 +99,13 @@ The primary format. A single file contains metadata, inline runtime config, and ```yaml description: Math problem solving evaluation -target: default +targets: + - id: default + label: default + +prompts: + - id: math + prompt: "{{ question }}" assertions: - Correctly calculates the answer @@ -104,7 +113,8 @@ assertions: tests: - id: addition - input: What is 15 + 27? + vars: + question: What is 15 + 27? expected_output: "42" ``` @@ -116,7 +126,9 @@ tests: | `suite` | Optional suite identifier | | `category` | Optional slash-delimited analytics taxonomy path. Overrides the category derived from the eval file path. | | `target` | Named system under test from `.agentv/targets.yaml` or `--targets` | +| `targets` | Promptfoo-style target matrix. `id` is the provider/backend locator identity and `label` is the display/comparison name. | | `experiment` | Optional run/result grouping label | +| `prompts` | Top-level prompt matrix. Supports string prompts, chat message arrays, and file prompt objects. Prompts combine with `targets`, `tests`, and `repeat.count` into deterministic execution instances. | | `repeat` | Optional repeat policy with `count`, `strategy`, and `early_exit` | | `timeout_seconds` | Optional per-case timeout | | `evaluate_options` | Optional evaluation runtime options such as `budget_usd` and `max_concurrency` | @@ -125,7 +137,7 @@ tests: | `imports` | Optional import groups. `imports.suites` imports full child eval suites with their task context. `imports.tests` imports raw test rows into this file's context. Import entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. | | `tests` | Inline raw tests or a string path to an external raw-case file or directory. Legacy `tests[].include` entries still load with a migration warning; prefer `imports.suites` or `imports.tests`. | | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test | -| `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test | +| `input` | Deprecated compatibility input. Prefer top-level `prompts` plus per-test `vars`. | `workspace` is what the agent can inspect or modify through tools, not prompt input. Put instructions in `input`; put repos, templates, and lifecycle setup in @@ -450,10 +462,12 @@ MY_REPO_COMMIT=main ## Per-Test Template Variables -Eval YAML also supports per-test `vars` for data-driven prompt templates. Use `{{name}}` placeholders in test-facing text fields, and AgentV resolves them when the suite loads. +Eval YAML supports per-test `vars` for data-driven prompt templates. Prefer top-level `prompts` as the authored input surface, then use `{{name}}` placeholders to vary each test row. ```yaml -input: "Answer clearly: {{question}}" +prompts: + - id: clear-answer + prompt: "Answer clearly: {{question}}" tests: - id: capital @@ -461,9 +475,6 @@ tests: question: What is the capital of France? expected_answer: Paris criteria: "Answers {{question}} correctly" - input: - - role: user - content: "Question: {{question}}" expected_output: "{{expected_answer}}" ``` @@ -471,7 +482,7 @@ tests: - `vars` is defined per test as an object - `{{name}}` and dotted paths like `{{ user.name }}` are supported -- Substitution applies to suite-level `input`, test `input`, `input_files`, `criteria`, `expected_output`, and conversation turn `input` / `expected_output` +- Substitution applies to `prompts`, `criteria`, `expected_output`, and conversation turn `input` / `expected_output` - When the whole string is a single placeholder, the original JSON value is preserved - Missing variables are left unchanged, so unrelated template syntax is not silently blanked out - `vars` interpolation is separate from environment interpolation: `{{question}}` uses test data, `${{ PROJECT_NAME }}` uses environment variables diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx index f6f132412..9ce20484b 100644 --- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx @@ -125,6 +125,10 @@ Example row: "tags": { "experiment": "with_skills", "team": "support" }, "eval_path": "evals/support/refunds.eval.yaml", "test_id": "refund-eligibility", + "prompt_id": "refund-policy", + "prompt_label": "Refund policy prompt", + "sample_index": 0, + "retry_index": 0, "target": "codex-gpt5", "variant": "skills-v2", "attempt": 1, @@ -147,9 +151,10 @@ Example row: Rows can represent repeated attempts, multi-target runs, imported suites, manual `prepare`/`grade` attempts, or imported provider sessions. That is why -`experiment`, `eval_path`, `test_id`, `target`, `variant`, `attempt`, and -source metadata belong in `index.jsonl`: tools can filter dynamically without -requiring every run to be pre-split into semantic folders. +`experiment`, `eval_path`, `test_id`, `prompt_id`, `target`, `variant`, +`sample_index`, `retry_index`, `attempt`, and source metadata belong in +`index.jsonl`: tools can filter dynamically without requiring every run to be +pre-split into semantic folders. When a run resolves a promptfoo-shaped tags map (from suite `tags`, project config `tags`, or `--tag key=value`), the resolved map is emitted as `tags` on diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index c3629288f..ec64e213f 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -15,6 +15,7 @@ import type { EvalTargetRef, FailOnError, JsonObject, + JsonValue, TargetHooksConfig, WorkspaceHookConfig, } from '../types.js'; @@ -356,7 +357,24 @@ export function extractTargetRefsFromSuite( suite: JsonObject, ): readonly EvalTargetRef[] | undefined { rejectAuthoredRuntimeContainers(suite); - return undefined; + if (suite.providers !== undefined) { + throw new Error("Top-level 'providers' has been removed. Use 'targets' instead."); + } + if (suite.target !== undefined && suite.targets !== undefined) { + throw new Error("Use either top-level 'target' or 'targets', not both."); + } + + const rawTargets = suite.targets; + if (rawTargets === undefined || rawTargets === null) { + return undefined; + } + + const rawEntries = Array.isArray(rawTargets) ? rawTargets : [rawTargets]; + const refs = rawEntries + .map((entry, index) => parseTargetRef(entry, index)) + .filter((entry): entry is EvalTargetRef => entry !== undefined); + + return refs.length > 0 ? refs : undefined; } /** @@ -369,6 +387,50 @@ export function extractTargetsFromSuite(suite: JsonObject): readonly string[] | return names.length > 0 ? names : undefined; } +function parseTargetRef(raw: JsonValue, index: number): EvalTargetRef | undefined { + if (typeof raw === 'string') { + const targetId = raw.trim(); + return targetId ? { name: targetId, id: targetId } : undefined; + } + + if (!isJsonObject(raw)) { + logWarning(`Invalid targets[${index}]: expected string or object. Ignoring.`); + return undefined; + } + + const rawId = raw.id; + const rawLabel = raw.label; + const legacyName = raw.name; + const useTarget = raw.use_target; + const id = typeof rawId === 'string' && rawId.trim().length > 0 ? rawId.trim() : undefined; + const label = + typeof rawLabel === 'string' && rawLabel.trim().length > 0 ? rawLabel.trim() : undefined; + const name = + label ?? + id ?? + (typeof legacyName === 'string' && legacyName.trim().length > 0 + ? legacyName.trim() + : undefined); + + if (!name) { + logWarning(`Invalid targets[${index}]: expected id or label. Ignoring.`); + return undefined; + } + if (legacyName !== undefined) { + logWarning('targets[].name is deprecated. Use targets[].id and targets[].label instead.'); + } + + return { + name, + ...(id ? { id } : {}), + ...(label ? { label } : {}), + ...(typeof useTarget === 'string' && useTarget.trim().length > 0 + ? { use_target: useTarget.trim() } + : {}), + ...(raw.hooks !== undefined ? { hooks: parseTargetHooks(raw.hooks) } : {}), + }; +} + /** * Parse a single workspace hook config from a raw object. * Accepts both string shorthand (shell command) and object form. diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 93e15cab5..26fbda0f6 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -439,6 +439,8 @@ export interface RunEvalCaseOptions { readonly evalFilePath?: string; /** Repo root used to serialize replay fixture eval_path as a stable relative path. */ readonly repoRoot?: string; + /** Zero-based sample index produced by repeat.count. */ + readonly sampleIndex?: number; } export interface ProgressEvent { @@ -1652,6 +1654,7 @@ async function runBatchEvaluation(options: { promptInputs, nowFn, attempt: 0, + sampleIndex: 0, graderProvider: await resolveGraderProvider(target), agentTimeoutMs, output, @@ -1693,6 +1696,7 @@ async function runBatchEvaluation(options: { 'evaluator', 'evaluator_error', verbose, + { sampleIndex: 0, retryIndex: 0 }, ); results.push(errorResult); if (onResult) { @@ -1802,6 +1806,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Date; readonly attempt: number; + readonly sampleIndex: number; readonly graderProvider?: Provider; readonly agentTimeoutMs?: number; readonly output?: readonly Message[]; @@ -2464,6 +2480,7 @@ async function evaluateCandidate(options: { promptInputs, nowFn, attempt, + sampleIndex, graderProvider, agentTimeoutMs, output, @@ -2567,7 +2584,10 @@ async function evaluateCandidate(options: { : undefined; return { timestamp: completedAt.toISOString(), - testId: evalCase.id, + testId: evalCase.testId ?? evalCase.id, + prompt: evalCase.prompt, + sampleIndex, + retryIndex: attempt, source: evalCase.source, suite: evalCase.suite, category: evalCase.category, @@ -3035,6 +3055,7 @@ async function runConversationMode(options: { readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; readonly evalFilePath?: string; + readonly sampleIndex?: number; }): Promise { const { evalCase, @@ -3055,6 +3076,7 @@ async function runConversationMode(options: { targetResolver, availableTargets, evalFilePath, + sampleIndex = 0, } = options; // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate) @@ -3183,6 +3205,7 @@ async function runConversationMode(options: { }, nowFn, attempt: 0, + sampleIndex, graderProvider, agentTimeoutMs, output: response.output, @@ -3245,6 +3268,7 @@ async function runConversationMode(options: { }, nowFn, attempt: 0, + sampleIndex, graderProvider, agentTimeoutMs, verbose, @@ -3288,7 +3312,7 @@ async function runConversationMode(options: { durationMs: totalDurationMs, provider: provider.kind, target: target.name, - testId: evalCase.id, + testId: evalCase.testId ?? evalCase.id, conversationId: evalCase.conversation_id, }); @@ -3296,7 +3320,10 @@ async function runConversationMode(options: { return { timestamp: nowFn().toISOString(), - testId: evalCase.id, + testId: evalCase.testId ?? evalCase.id, + prompt: evalCase.prompt, + sampleIndex, + retryIndex: 0, suite: evalCase.suite, category: evalCase.category, score: finalScore, @@ -3475,6 +3502,10 @@ function buildErrorResult( failureStage: FailureStage, failureReasonCode: string, verbose?: boolean, + identity?: { + readonly sampleIndex?: number; + readonly retryIndex?: number; + }, ): EvaluationResult { const message = extractErrorMessage(error); @@ -3521,7 +3552,10 @@ function buildErrorResult( return { timestamp: timestamp.toISOString(), - testId: evalCase.id, + testId: evalCase.testId ?? evalCase.id, + prompt: evalCase.prompt, + sampleIndex: identity?.sampleIndex, + retryIndex: identity?.retryIndex, suite: evalCase.suite, category: evalCase.category, conversationId: evalCase.conversation_id, diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index b59c80d80..40dd3eb45 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -120,13 +120,14 @@ export function buildEvaluationResultTargetKey(result: EvaluationResult): string null, suite: stringField(dimensions, 'suite') ?? getSuite(result) ?? null, test_id: stringField(dimensions, 'testId') ?? result.testId ?? 'unknown', + prompt_id: result.prompt?.id ?? null, target: stringField(dimensions, 'target') ?? result.target ?? 'unknown', variant: stringField(dimensions, 'variant') ?? result.variant ?? null, }); } export function buildEvalTestTargetKey( - test: Pick, + test: Pick, target?: string, variant?: string, ): string { @@ -134,6 +135,7 @@ export function buildEvalTestTargetKey( eval_path: evalSourcePath(test.source) ?? null, suite: test.suite ?? null, test_id: test.id ?? 'unknown', + prompt_id: test.prompt?.id ?? null, target: target ?? 'unknown', variant: variant ?? null, }); @@ -352,6 +354,8 @@ export interface GradingArtifact { export type TrialResultArtifact = { readonly attempt: number; + readonly sample_index?: number; + readonly retry_index?: number; readonly run_path?: string; readonly score: number; readonly verdict: string; @@ -471,6 +475,10 @@ export interface AggregateGradingArtifact { export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; + readonly prompt_id?: string; + readonly prompt_label?: string; + readonly sample_index?: number; + readonly retry_index?: number; readonly suite?: string; readonly category?: string; readonly conversation_id?: string; @@ -742,6 +750,8 @@ function toTrialArtifacts( } return trials.map((trial) => ({ attempt: trial.attempt, + sample_index: trial.sampleIndex, + retry_index: trial.retryIndex, run_path: trial.result ? trialRunDirName(trial.attempt) : undefined, score: trial.score, verdict: trial.verdict, @@ -888,6 +898,7 @@ function fallbackRepeatFingerprint(result: EvaluationResult): string { .update( JSON.stringify({ test_id: result.testId ?? 'unknown', + prompt_id: result.prompt?.id, target: result.target ?? 'unknown', trial_count: result.trials?.length ?? 0, aggregation: result.aggregation, @@ -986,6 +997,8 @@ function buildAgentVRunResultArtifact(params: { function singleRunTrial(result: EvaluationResult): TrialResult { return { attempt: 0, + sampleIndex: result.sampleIndex, + retryIndex: result.retryIndex, score: result.score, verdict: result.executionStatus !== 'execution_error' && result.score >= DEFAULT_THRESHOLD @@ -1545,6 +1558,8 @@ function buildRowArtifactHashInput( readonly eval_path: string | null; readonly suite: string | null; readonly test_id: string; + readonly prompt_id: string | null; + readonly sample_index: number | null; readonly target: string; readonly variant: string | null; } { @@ -1553,6 +1568,8 @@ function buildRowArtifactHashInput( eval_path: dimensions?.evalPath ?? sourceEvalPath(result, sourceTest) ?? null, suite: dimensions?.suite ?? getSuite(result) ?? null, test_id: dimensions?.testId ?? result.testId ?? 'unknown', + prompt_id: result.prompt?.id ?? sourceTest?.prompt?.id ?? null, + sample_index: result.sampleIndex ?? null, target: dimensions?.target ?? result.target ?? 'unknown', variant: dimensions?.variant ?? result.variant ?? null, }; @@ -1724,6 +1741,10 @@ export function buildIndexArtifactEntry( return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', + prompt_id: result.prompt?.id, + prompt_label: result.prompt?.label, + sample_index: result.sampleIndex, + retry_index: result.retryIndex, suite: getSuite(result), category: result.category, conversation_id: result.conversationId, @@ -1813,6 +1834,10 @@ export function buildResultIndexArtifact( return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', + prompt_id: result.prompt?.id, + prompt_label: result.prompt?.label, + sample_index: result.sampleIndex, + retry_index: result.retryIndex, suite: getSuite(result), category: result.category, conversation_id: result.conversationId, diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 1964d1336..a4b85bd16 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -299,8 +299,16 @@ export type TargetHooksConfig = { * String targets are shorthand for `{ name: "target-name" }` (no hooks). */ export type EvalTargetRef = { - /** Target name (must match a target in targets.yaml or be defined inline with use_target) */ + /** + * Internal target selection name. Authored YAML should prefer `id` and + * `label`; this field remains the runtime bridge to existing targets.yaml + * resolution until target-provider locator work lands. + */ readonly name: string; + /** Provider/backend locator identity from authored eval YAML. */ + readonly id?: string; + /** Display/comparison label from authored eval YAML. */ + readonly label?: string; /** Delegate to another named target (same as use_target in targets.yaml) */ readonly use_target?: string; /** Per-target hooks for workspace customization */ @@ -972,14 +980,30 @@ export type ConversationAggregation = 'mean' | 'min' | 'max'; */ export type TurnFailurePolicy = 'continue' | 'stop'; +export type EvalPromptKind = 'string' | 'chat' | 'file' | 'function'; + +/** + * Stable identity for an authored top-level prompt. The prompt content itself + * is rendered into EvalTest.input; this metadata keeps the matrix dimension + * visible to reports, artifacts, and future flat-instance workers. + */ +export interface EvalPromptIdentity { + readonly id: string; + readonly label?: string; + readonly kind: EvalPromptKind; +} + /** * Eval test definition sourced from AgentV specs. */ export interface EvalTest { readonly id: string; + /** Original authored test id before prompt expansion rewrites duplicate internal ids. */ + readonly testId?: string; readonly suite?: string; readonly category?: string; readonly conversation_id?: string; + readonly prompt?: EvalPromptIdentity; readonly question: string; readonly input: readonly TestMessage[]; readonly expected_output: readonly JsonObject[]; @@ -1056,6 +1080,10 @@ export interface TrialsConfig { */ export interface TrialResult { readonly attempt: number; + /** Zero-based sample index produced from repeat.count. */ + readonly sampleIndex?: number; + /** Provider retry index for the attempt that produced this trial result. */ + readonly retryIndex?: number; readonly score: number; readonly verdict: EvaluationVerdict; readonly scores?: readonly GraderResult[]; @@ -1164,6 +1192,11 @@ export type FailOnError = boolean; export interface EvaluationResult { readonly timestamp: string; readonly testId: string; + readonly prompt?: EvalPromptIdentity; + /** Zero-based sample index produced from repeat.count. */ + readonly sampleIndex?: number; + /** Provider retry index for the attempt that produced this result. */ + readonly retryIndex?: number; readonly source?: EvalTestSource; readonly suite?: string; readonly category?: string; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index a5a99ef18..8c3c9e1e0 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -1,3 +1,4 @@ +import { createHash } from 'node:crypto'; import { readFile, realpath, stat } from 'node:fs/promises'; import path from 'node:path'; import fg from 'fast-glob'; @@ -31,7 +32,11 @@ import { loadConfig, parseTargetHooks, } from './loaders/config-loader.js'; -import { buildSearchRoots, resolveToAbsolutePath } from './loaders/file-resolver.js'; +import { + buildSearchRoots, + resolveFileReference, + resolveToAbsolutePath, +} from './loaders/file-resolver.js'; import { coerceEvaluator, collectAssertionTemplateSourceReferences, @@ -56,6 +61,7 @@ import type { ConversationTurn, DockerWorkspaceConfig, EvalGraderSource, + EvalPromptIdentity, EvalRunOverride, EvalSourceReference, EvalTest, @@ -185,6 +191,7 @@ type RawTestSuite = JsonObject & { readonly policy?: JsonValue; readonly repeat?: JsonValue; readonly runs?: JsonValue; + readonly prompts?: JsonValue; readonly early_exit?: JsonValue; readonly timeout_seconds?: JsonValue; readonly evaluate_options?: JsonValue; @@ -238,6 +245,17 @@ type RawEvalCase = JsonObject & { readonly window_size?: JsonValue; }; +type PromptDefinition = { + readonly identity: EvalPromptIdentity; + readonly input: JsonValue; +}; + +type PromptExpansionResult = { + readonly rawCases: readonly JsonValue[]; + readonly promptById: ReadonlyMap; + readonly sourceTestIdById: ReadonlyMap; +}; + function resolveTests(suite: RawTestSuite): JsonValue | undefined { if (suite.tests !== undefined) return suite.tests; if (suite.eval_cases !== undefined) { @@ -304,6 +322,250 @@ function interpolateRawEvalCase(raw: RawEvalCase, vars: JsonObject | undefined): }; } +function stablePromptId(value: unknown): string { + return createHash('sha256').update(JSON.stringify(value)).digest('hex').slice(0, 12); +} + +function safePromptId(value: string): string { + const safe = value + .trim() + .replace(/[^A-Za-z0-9_.-]+/g, '-') + .replace(/^-+|-+$/g, ''); + return safe.length > 0 ? safe.slice(0, 48) : stablePromptId(value); +} + +function stripFileProtocol(value: string): string { + return value.startsWith('file://') ? value.slice('file://'.length) : value; +} + +function isChatPromptArray(value: readonly JsonValue[]): boolean { + return value.length > 0 && value.every((entry) => isJsonObject(entry) && isTestMessage(entry)); +} + +async function readPromptFile( + rawPath: string, + searchRoots: readonly string[], +): Promise<{ + readonly displayPath: string; + readonly text: string; +}> { + const filePath = stripFileProtocol(rawPath); + const { displayPath, resolvedPath, attempted } = await resolveFileReference( + filePath, + searchRoots, + ); + if (!resolvedPath) { + const attempts = attempted.length + ? [' Tried:', ...attempted.map((candidate) => ` ${candidate}`)] + : undefined; + logError(`Prompt file not found: ${displayPath}`, attempts); + throw new Error(`Prompt file not found: ${displayPath}`); + } + return { + displayPath, + text: (await readFile(resolvedPath, 'utf8')).replace(/\r\n/g, '\n'), + }; +} + +async function parsePromptDefinition( + rawPrompt: JsonValue, + searchRoots: readonly string[], + index: number, +): Promise { + if (typeof rawPrompt === 'string') { + if (rawPrompt.startsWith('file://')) { + const { displayPath, text } = await readPromptFile(rawPrompt, searchRoots); + return { + identity: { id: displayPath, label: displayPath, kind: 'file' }, + input: text, + }; + } + return { + identity: { id: `prompt-${stablePromptId(rawPrompt)}`, kind: 'string' }, + input: rawPrompt, + }; + } + + if (Array.isArray(rawPrompt)) { + if (!isChatPromptArray(rawPrompt)) { + throw new Error( + 'Invalid prompts entry: arrays must be chat messages or a top-level list of prompt entries.', + ); + } + return { + identity: { id: `chat-${stablePromptId(rawPrompt)}`, kind: 'chat' }, + input: rawPrompt, + }; + } + + if (!isJsonObject(rawPrompt)) { + throw new Error(`Invalid prompts[${index}]: expected string, chat array, or object.`); + } + + if (rawPrompt.function !== undefined || rawPrompt.function_file !== undefined) { + throw new Error( + 'Function prompt sources are not supported by the YAML loader yet. Use a string, chat-array, or file prompt.', + ); + } + + const label = asString(rawPrompt.label)?.trim(); + const explicitId = asString(rawPrompt.id)?.trim(); + + if (rawPrompt.file !== undefined) { + const fileRef = asString(rawPrompt.file); + if (!fileRef) { + throw new Error(`Invalid prompts[${index}].file: expected non-empty string.`); + } + const { displayPath, text } = await readPromptFile(fileRef, searchRoots); + return { + identity: { + id: explicitId ?? displayPath, + ...(label ? { label } : { label: displayPath }), + kind: 'file', + }, + input: text, + }; + } + + if (rawPrompt.messages !== undefined) { + if (!Array.isArray(rawPrompt.messages) || !isChatPromptArray(rawPrompt.messages)) { + throw new Error(`Invalid prompts[${index}].messages: expected chat message array.`); + } + return { + identity: { + id: explicitId ?? `chat-${stablePromptId(rawPrompt.messages)}`, + ...(label ? { label } : {}), + kind: 'chat', + }, + input: rawPrompt.messages, + }; + } + + if (rawPrompt.prompt !== undefined) { + const promptValue = rawPrompt.prompt; + if ( + typeof promptValue !== 'string' && + !(Array.isArray(promptValue) && isChatPromptArray(promptValue)) + ) { + throw new Error(`Invalid prompts[${index}].prompt: expected string or chat message array.`); + } + const kind = Array.isArray(promptValue) ? 'chat' : 'string'; + return { + identity: { + id: explicitId ?? `${kind}-${stablePromptId(promptValue)}`, + ...(label ? { label } : {}), + kind, + }, + input: promptValue, + }; + } + + if (isTestMessage(rawPrompt)) { + return { + identity: { + id: explicitId ?? `chat-${stablePromptId(rawPrompt)}`, + ...(label ? { label } : {}), + kind: 'chat', + }, + input: [rawPrompt], + }; + } + + throw new Error(`Invalid prompts[${index}]: expected prompt, messages, or file.`); +} + +async function parseSuitePrompts( + rawPrompts: JsonValue | undefined, + searchRoots: readonly string[], +): Promise { + if (rawPrompts === undefined || rawPrompts === null) { + return undefined; + } + + const entries = + Array.isArray(rawPrompts) && !isChatPromptArray(rawPrompts) ? rawPrompts : [rawPrompts]; + const prompts: PromptDefinition[] = []; + for (let index = 0; index < entries.length; index++) { + prompts.push(await parsePromptDefinition(entries[index] as JsonValue, searchRoots, index)); + } + return prompts; +} + +function renderPromptInput(prompt: PromptDefinition, vars: JsonObject | undefined): JsonValue { + return interpolateCaseField(prompt.input, vars); +} + +function expandPromptMatrix( + rawCases: readonly JsonValue[], + prompts: readonly PromptDefinition[] | undefined, + suite: RawTestSuite, +): PromptExpansionResult { + const promptById = new Map(); + const sourceTestIdById = new Map(); + + if (!prompts) { + if (suite.input !== undefined || suite.input_files !== undefined) { + logWarning( + "Top-level 'input' and 'input_files' are deprecated. Use top-level 'prompts' plus tests[].vars instead.", + ); + } else if ( + rawCases.some( + (rawCase) => + isJsonObject(rawCase) && + (rawCase.input !== undefined || rawCase.input_files !== undefined), + ) + ) { + logWarning("tests[].input is deprecated. Use top-level 'prompts' plus tests[].vars instead."); + } + return { rawCases, promptById, sourceTestIdById }; + } + + if (suite.input !== undefined || suite.input_files !== undefined) { + throw new Error("Top-level 'input' and 'input_files' cannot be combined with 'prompts'."); + } + + const expandedCases: JsonValue[] = []; + for (const rawCase of rawCases) { + if (!isJsonObject(rawCase)) { + expandedCases.push(rawCase); + continue; + } + if (rawCase.input !== undefined || rawCase.input_files !== undefined) { + throw new Error( + "tests[].input and tests[].input_files have been removed from the preferred prompt contract. Use top-level 'prompts' plus tests[].vars.", + ); + } + + const sourceTestId = asString(rawCase.id); + const vars = isJsonObject(rawCase.vars) ? rawCase.vars : undefined; + for (const prompt of prompts) { + const promptId = safePromptId(prompt.identity.id); + const expandedId = + sourceTestId && prompts.length > 1 ? `${sourceTestId}__prompt_${promptId}` : sourceTestId; + const expandedDependsOn = Array.isArray(rawCase.depends_on) + ? rawCase.depends_on.map((dep) => + typeof dep === 'string' && prompts.length > 1 ? `${dep}__prompt_${promptId}` : dep, + ) + : rawCase.depends_on; + const expandedCase: JsonObject = { + ...rawCase, + ...(expandedId ? { id: expandedId } : {}), + ...(expandedDependsOn !== undefined ? { depends_on: expandedDependsOn } : {}), + input: renderPromptInput(prompt, vars), + }; + expandedCases.push(expandedCase); + if (expandedId) { + promptById.set(expandedId, prompt.identity); + if (sourceTestId) { + sourceTestIdById.set(expandedId, sourceTestId); + } + } + } + } + + return { rawCases: expandedCases, promptById, sourceTestIdById }; +} + /** * Read metadata from a test suite file (like target name). * This is a convenience function for CLI tools that need metadata without loading all tests. @@ -564,6 +826,10 @@ async function loadTestsFromParsedYamlValue( throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`); } + const promptDefinitions = await parseSuitePrompts(suite.prompts, searchRoots); + const promptExpansion = expandPromptMatrix(expandedTestCases, promptDefinitions, suite); + expandedTestCases = promptExpansion.rawCases; + const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir); const rawSuiteInput = suite.input; @@ -586,6 +852,8 @@ async function loadTestsFromParsedYamlValue( const testCaseConfig = rawTestCase as RawEvalCase; const id = asString(testCaseConfig.id); + const promptIdentity = id ? promptExpansion.promptById.get(id) : undefined; + const sourceTestId = id ? promptExpansion.sourceTestIdById.get(id) : undefined; // Skip tests that don't match the filter pattern (glob supported) if (filterPattern && (!id || !matchesFilter(id, filterPattern))) { @@ -836,9 +1104,11 @@ async function loadTestsFromParsedYamlValue( const testCase: EvalTest = { id, + ...(sourceTestId ? { testId: sourceTestId } : {}), suite: suiteName, category, conversation_id: conversationId, + ...(promptIdentity ? { prompt: promptIdentity } : {}), question: question, input: inputMessages, expected_output: outputSegments, diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts index 8027d5f9c..a310ddeb9 100644 --- a/packages/core/test/evaluation/eval-inline-experiment.test.ts +++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts @@ -90,6 +90,113 @@ describe('eval.yaml flat runtime controls and tests imports', () => { expect(suite.experimentConfig?.threshold).toBe(0.9); }); + it('expands top-level prompts across tests with per-test vars', async () => { + const evalPath = path.join(tempDir, 'prompt-matrix.eval.yaml'); + await writeFile( + evalPath, + [ + 'name: prompt-matrix-suite', + 'prompts:', + ' - id: direct', + ' label: Direct', + ' prompt: "Summarize {{ topic }}."', + ' - id: terse', + ' label: Terse', + ' prompt: "In one sentence, summarize {{ topic }}."', + 'targets:', + ' - id: openai:gpt-5.4-mini', + ' label: mini', + ' - id: local-codex', + 'tests:', + ' - id: docs', + ' vars:', + ' topic: release notes', + ' expected_output: concise release-note summary', + '', + ].join('\n'), + ); + + const suite = await loadTestSuite(evalPath, tempDir); + + expect(suite.tests.map((test) => test.id)).toEqual([ + 'docs__prompt_direct', + 'docs__prompt_terse', + ]); + expect(suite.tests.map((test) => test.testId)).toEqual(['docs', 'docs']); + expect(suite.tests.map((test) => test.prompt)).toEqual([ + { id: 'direct', label: 'Direct', kind: 'string' }, + { id: 'terse', label: 'Terse', kind: 'string' }, + ]); + expect(suite.tests.map((test) => test.question)).toEqual([ + 'Summarize release notes.', + 'In one sentence, summarize release notes.', + ]); + expect(suite.targets).toEqual(['mini', 'local-codex']); + expect(suite.targetRefs).toEqual([ + { name: 'mini', id: 'openai:gpt-5.4-mini', label: 'mini' }, + { name: 'local-codex', id: 'local-codex' }, + ]); + }); + + it('loads chat and file prompts from the top-level prompt matrix', async () => { + const promptPath = path.join(tempDir, 'prompt.md'); + const evalPath = path.join(tempDir, 'prompt-sources.eval.yaml'); + await writeFile(promptPath, 'Review {{ file_name }}.\n'); + await writeFile( + evalPath, + [ + 'name: prompt-sources-suite', + 'prompts:', + ' - id: chat', + ' messages:', + ' - role: system', + ' content: Be precise.', + ' - role: user', + ' content: "Inspect {{ file_name }}."', + ' - id: file', + ' file: prompt.md', + 'tests:', + ' - id: inspect', + ' vars:', + ' file_name: README.md', + ' criteria: useful', + '', + ].join('\n'), + ); + + const suite = await loadTestSuite(evalPath, tempDir); + + expect(suite.tests).toHaveLength(2); + expect(suite.tests[0]?.input).toEqual([ + { role: 'system', content: 'Be precise.' }, + { role: 'user', content: 'Inspect README.md.' }, + ]); + expect(suite.tests[1]?.question).toBe('Review README.md.'); + expect(suite.tests[1]?.prompt).toEqual({ + id: 'file', + label: 'prompt.md', + kind: 'file', + }); + }); + + it('rejects tests input when top-level prompts are authored', async () => { + const evalPath = path.join(tempDir, 'mixed-prompt-contract.eval.yaml'); + await writeFile( + evalPath, + [ + 'prompts:', + ' - hello', + 'tests:', + ' - id: one', + ' input: legacy', + ' criteria: ok', + '', + ].join('\n'), + ); + + await expect(loadTestSuite(evalPath, tempDir)).rejects.toThrow(/tests\[\]\.input/); + }); + it('parses evaluate_options.budget_usd and prefers it over legacy top-level budget_usd', async () => { const evalPath = path.join(tempDir, 'evaluate-options-budget.eval.yaml'); await writeFile( diff --git a/skills-data/agentv-bench/references/eval-yaml-spec.md b/skills-data/agentv-bench/references/eval-yaml-spec.md index b2285993a..05f92b9f6 100644 --- a/skills-data/agentv-bench/references/eval-yaml-spec.md +++ b/skills-data/agentv-bench/references/eval-yaml-spec.md @@ -9,15 +9,19 @@ The grader agent uses this to evaluate assertions without the CLI. - `name` (string, optional) — eval name - `description` (string, optional) — description -- `execution` (object, optional) — `target`, `model`, etc. +- `target` (string | object, optional) — single system under test +- `targets` (array, optional) — promptfoo-style target matrix. `id` is provider/backend locator identity; `label` is the display/comparison name. +- `repeat` (object, optional) — stochastic sample policy with `count`, `strategy`, and optional `early_exit` - `workspace` (object, optional) — workspace config (template, repos, hooks) -- `input` (string | object | Message | Message[], optional) — suite-level input prepended to each test. String/block shorthand expands to a user message. +- `prompts` (string | Message[] | array, optional) — preferred authored input surface. Prompts combine with `targets`, `tests`, and `repeat.count` into deterministic execution instances. +- `input` (string | object | Message | Message[], optional) — deprecated compatibility input. Prefer `prompts` plus per-test `vars`. - `tests` (array, required) — test cases ### Per-test fields - `id` (string, required) — unique test identifier -- `input` (string | object | Message | Message[], required) — task input. String shorthand expands to `[{role: user, content: "..."}]`; object shorthand preserves structured user content when the object has no top-level `role`. Top-level `role` is reserved for message objects. +- `vars` (object, optional) — per-test values interpolated into top-level `prompts`, `criteria`, `expected_output`, and conversation turns with `{{name}}` placeholders. +- `input` (string | object | Message | Message[], deprecated) — legacy task input. Do not use when top-level `prompts` is present. - `expected_output` (string | Message[], optional) — passive reference answer. String shorthand expands to `[{role: assistant, content: "..."}]`. It is available to declared graders, but does not add an implicit grader when `assertions` is present. - `criteria` (string, optional) — human-readable success criteria - `assertions` (array, optional) — grader assertions diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index d044864f7..48dff1ad8 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -193,6 +193,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -254,6 +267,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -970,6 +996,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1031,6 +1070,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1300,6 +1352,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1361,6 +1426,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1630,6 +1708,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1691,6 +1782,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1925,6 +2029,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -1986,6 +2103,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -2446,6 +2576,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -3815,6 +3958,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -5709,6 +5865,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -5770,6 +5939,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6039,6 +6221,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6100,6 +6295,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6369,6 +6577,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6430,6 +6651,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6664,6 +6898,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -6725,6 +6972,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -7185,6 +7445,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -8554,6 +8827,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -10424,6 +10710,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -10485,6 +10784,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -10754,6 +11066,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -10815,6 +11140,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11084,6 +11422,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11145,6 +11496,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11569,6 +11933,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11630,6 +12007,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11899,6 +12289,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -11960,6 +12363,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12229,6 +12645,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12290,6 +12719,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12524,6 +12966,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12585,6 +13040,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12875,6 +13343,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -12936,6 +13417,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13205,6 +13699,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13266,6 +13773,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13535,6 +14055,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13596,6 +14129,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13830,6 +14376,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -13891,6 +14450,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14116,6 +14688,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14177,6 +14762,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14446,6 +15044,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14507,6 +15118,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14776,6 +15400,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -14837,6 +15474,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -15071,6 +15721,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -15132,6 +15795,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -15592,6 +16268,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, @@ -16961,6 +17650,19 @@ "label": { "type": "string" }, + "prompt": { + "type": "string" + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, "raw": { "type": "string" }, From 9cefd816a099955d3726522a995a68ff40beef67 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 13:33:44 +0200 Subject: [PATCH 2/4] Sync prompt object schema --- .../evaluation/validation/eval-file.schema.ts | 3 + .../references/eval.schema.json | 930 +++++++++++++++--- 2 files changed, 819 insertions(+), 114 deletions(-) diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index a289a3f2a..56673e3c6 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -67,6 +67,9 @@ const PromptSchema = z.union([ .object({ id: z.string().optional(), label: z.string().optional(), + prompt: z.union([z.string(), z.array(JsonObjectSchema)]).optional(), + file: z.string().optional(), + messages: z.array(JsonObjectSchema).optional(), raw: z.string().optional(), path: z.string().optional(), prefix: z.string().optional(), diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index 48dff1ad8..2f57a4271 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -194,7 +194,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -203,7 +215,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -268,7 +281,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -277,7 +302,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -997,7 +1023,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1006,7 +1044,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -1071,7 +1110,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1080,7 +1131,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -1353,7 +1405,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1362,7 +1426,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -1427,7 +1492,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1436,7 +1513,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -1709,7 +1787,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1718,7 +1808,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -1783,7 +1874,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -1792,7 +1895,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -2030,7 +2134,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -2039,7 +2155,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -2104,7 +2221,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -2113,7 +2242,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -2577,7 +2707,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -2586,7 +2728,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -3959,7 +4102,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -3968,7 +4123,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -5866,7 +6022,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -5875,7 +6043,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -5940,7 +6109,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -5949,7 +6130,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6222,7 +6404,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6231,7 +6425,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6296,7 +6491,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6305,7 +6512,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6578,7 +6786,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6587,7 +6807,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6652,7 +6873,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6661,7 +6894,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6899,7 +7133,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6908,7 +7154,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -6973,7 +7220,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -6982,7 +7241,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -7446,7 +7706,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -7455,7 +7727,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -8828,7 +9101,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -8837,7 +9122,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -10711,7 +10997,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -10720,7 +11018,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -10785,7 +11084,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -10794,7 +11105,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -11067,7 +11379,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -11076,7 +11400,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -11141,7 +11466,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -11150,7 +11487,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -11423,7 +11761,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -11432,7 +11782,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -11497,7 +11848,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -11506,7 +11869,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -11934,7 +12298,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -11943,7 +12319,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12008,7 +12385,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -12017,7 +12406,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12290,7 +12680,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -12299,7 +12701,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12364,7 +12767,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -12373,7 +12788,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12646,7 +13062,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -12655,7 +13083,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12720,16 +13149,29 @@ "type": "string" }, "prompt": { - "type": "string" - }, - "file": { - "type": "string" - }, - "messages": { - "type": "array", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] + }, + "file": { + "type": "string" + }, + "messages": { + "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -12967,7 +13409,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -12976,7 +13430,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -13041,7 +13496,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -13050,7 +13517,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -13344,7 +13812,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -13353,7 +13833,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -13418,7 +13899,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -13427,7 +13920,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -13700,7 +14194,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -13709,7 +14215,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -13774,7 +14281,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -13783,7 +14302,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14056,7 +14576,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14065,7 +14597,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14130,7 +14663,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14139,7 +14684,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14377,7 +14923,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14386,7 +14944,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14451,7 +15010,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14460,7 +15031,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14689,7 +15261,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14698,7 +15282,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -14763,7 +15348,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -14772,7 +15369,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15045,7 +15643,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15054,7 +15664,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15119,7 +15730,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15128,7 +15751,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15401,7 +16025,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15410,7 +16046,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15475,7 +16112,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15484,7 +16133,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15722,7 +16372,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15731,7 +16393,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -15796,7 +16459,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -15805,7 +16480,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -16269,7 +16945,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -16278,7 +16966,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { @@ -17651,7 +18340,19 @@ "type": "string" }, "prompt": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": {}, + "additionalProperties": {} + } + } + ] }, "file": { "type": "string" @@ -17660,7 +18361,8 @@ "type": "array", "items": { "type": "object", - "additionalProperties": true + "properties": {}, + "additionalProperties": {} } }, "raw": { From 0b707fdf04dec1c31e4d8e84a7412665f72bd745 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 14:21:45 +0200 Subject: [PATCH 3/4] feat(eval): add rerun-failed runner pooling --- apps/cli/src/commands/eval/commands/run.ts | 5 +- apps/cli/src/commands/eval/run-cache.ts | 8 +- apps/cli/src/commands/eval/run-eval.ts | 311 +++++++++++------- apps/cli/src/commands/results/eval-runner.ts | 5 +- apps/cli/test/commands/results/serve.test.ts | 4 +- apps/cli/test/eval.integration.test.ts | 95 ++++++ apps/cli/test/fixtures/mock-run-evaluation.ts | 27 +- .../docs/docs/evaluation/running-evals.mdx | 13 +- .../docs/docs/guides/workspace-pool.mdx | 2 + packages/core/src/evaluation/orchestrator.ts | 33 +- packages/core/src/evaluation/run-artifacts.ts | 22 +- .../core/test/evaluation/orchestrator.test.ts | 135 ++++++++ 12 files changed, 496 insertions(+), 164 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index a078edc06..09b0f80ef 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -168,10 +168,11 @@ export const evalRunCommand = command({ description: 'Resume an interrupted run: skip already-completed tests and append new results to --output dir', }), - rerunFailed: flag({ + rerunFailed: option({ + type: optional(string), long: 'rerun-failed', description: - 'Rerun failed/errored tests while keeping passing results. Implies --resume semantics', + 'Run ID, run workspace, or index.jsonl to rerun failed/errored tests while keeping passing results', }), strict: flag({ long: 'strict', diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts index 342fa8429..d2e8c7b85 100644 --- a/apps/cli/src/commands/eval/run-cache.ts +++ b/apps/cli/src/commands/eval/run-cache.ts @@ -54,10 +54,10 @@ export async function loadRunCache(cwd: string): Promise { /** * Resolve the cached last-run directory for a cwd, if it still exists on disk. * Returns undefined when there is no cache, the cache lacks a `lastRunDir`, - * or the directory has since been deleted. Used by `--resume` / `--rerun-failed` - * to default `--output` to the most recent run when no explicit dir is given, - * matching the convention used by promptfoo (`--resume [evalId]`) and - * OpenCompass (`-r [timestamp]`). + * or the directory has since been deleted. Used by `--resume` to default + * `--output` to the most recent run when no explicit dir is given, matching + * the convention used by promptfoo (`--resume [evalId]`) and OpenCompass + * (`-r [timestamp]`). */ export async function resolveCachedRunDir(cwd: string): Promise { const cache = await loadRunCache(cwd); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index a16655015..6dd0afc36 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -3,6 +3,7 @@ import { access, readFile } from 'node:fs/promises'; import { createRequire as createNodeRequire } from 'node:module'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; +import pLimit from 'p-limit'; import { DEFAULT_THRESHOLD, @@ -54,6 +55,7 @@ import { aggregateRunDir, buildEvalTestTargetKey, buildEvaluationResultTargetKey, + buildTestTargetKey, deduplicateByTestIdTarget, parseJsonlResults, writeArtifactsFromResults, @@ -135,6 +137,7 @@ interface NormalizedOptions { readonly retryErrors?: string; readonly resume: boolean; readonly rerunFailed: boolean; + readonly rerunFailedSource?: string; readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly keepWorkspaces: boolean; @@ -609,8 +612,10 @@ function normalizeOptions( otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true, retryErrors: normalizeString(rawOptions.retryErrors), - resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed), - rerunFailed: normalizeBoolean(rawOptions.rerunFailed), + resume: + normalizeBoolean(rawOptions.resume) || normalizeString(rawOptions.rerunFailed) !== undefined, + rerunFailed: normalizeString(rawOptions.rerunFailed) !== undefined, + rerunFailedSource: normalizeString(rawOptions.rerunFailed), workspaceMode, workspacePath, // Precedence: CLI > YAML config > TS config @@ -1164,6 +1169,27 @@ async function readExistingResultsFromRunDir(runDir: string): Promise { + const trimmed = source.trim(); + if (!trimmed) { + throw new Error('--rerun-failed requires a run ID, run workspace, or index.jsonl path.'); + } + + const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed); + if (existsSync(candidate)) { + return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate; + } + + const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed); + if (existsSync(runIdCandidate)) { + return runIdCandidate; + } + + throw new Error( + `Run not found for --rerun-failed: ${source}. Expected a run ID under .agentv/results, a run workspace, or an index.jsonl path.`, + ); +} + async function prepareFileMetadata(params: { readonly testFilePath: string; readonly repoRoot: string; @@ -1825,17 +1851,16 @@ export async function runEvalCommand( } } - // --resume / --rerun-failed without an explicit --output: default to the + // --resume without an explicit --output: default to the // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" // convention. The cache pointer is written by saveRunCache after every eval. - if (options.resume && !options.retryErrors && !options.outputDir) { + if (options.resume && !options.rerunFailedSource && !options.retryErrors && !options.outputDir) { const cachedDir = await resolveCachedRunDir(cwd); if (cachedDir) { options = { ...options, outputDir: cachedDir }; - const flagLabel = options.rerunFailed ? 'rerun-failed' : 'resume'; const displayDir = path.relative(cwd, cachedDir) || cachedDir; - console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`); + console.log(`Auto-detected last run dir for --resume: ${displayDir}`); } } @@ -1844,22 +1869,35 @@ export async function runEvalCommand( let resumeSkipKeys: Set | undefined; let isResumeAppend = false; if (options.resume && !options.retryErrors) { - const explicitResumeDir = options.outputDir; - if (explicitResumeDir) { - const resumeDir = path.resolve(explicitResumeDir); - const resumeIndexPaths = discoverRunManifestPaths(resumeDir); + const sourceRunDir = options.rerunFailedSource + ? await resolveRerunFailedRunDir(cwd, options.rerunFailedSource) + : options.outputDir + ? path.resolve(options.outputDir) + : undefined; + + if (sourceRunDir) { + if (options.rerunFailedSource && !options.outputDir) { + options = { ...options, outputDir: sourceRunDir }; + } + + const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir); if (resumeIndexPaths.length > 0) { - const existingResults = await readExistingResultsFromRunDir(resumeDir); + const existingResults = await readExistingResultsFromRunDir(sourceRunDir); resumeSkipKeys = new Set(); + let completedResultCount = 0; for (const r of existingResults) { if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + completedResultCount += 1; resumeSkipKeys.add(buildEvaluationResultTargetKey(r)); + resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant)); } } - isResumeAppend = true; + isResumeAppend = + options.outputDir !== undefined && + path.resolve(options.outputDir) === path.resolve(sourceRunDir); const modeLabel = options.rerunFailed ? 'Rerun-failed' : 'Resume'; console.log( - `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`, + `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`, ); } else { // No existing bundle manifest — behave like a normal run. @@ -2116,7 +2154,8 @@ export async function runEvalCommand( const target = selection.targetName; const variant = targetVariantForSelection(selection); const key = buildEvalTestTargetKey(test, target, variant); - if (resumeSkipKeys?.has(key)) { + const fallbackKey = buildTestTargetKey(test.id, target, variant); + if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) { resumeSkippedCount++; } else { totalEvalCount++; @@ -2339,126 +2378,142 @@ export async function runEvalCommand( continue; } - // Run all targets concurrently (each target has its own worker limit) + const fileWorkerLimit = Math.max(1, fileOptions.workers ?? DEFAULT_WORKERS); + const targetConcurrency = + targetPrep.selections.length > 1 + ? Math.min(fileWorkerLimit, targetPrep.selections.length) + : 1; + const perTargetWorkers = + targetPrep.selections.length > 1 + ? Math.max(1, Math.floor(fileWorkerLimit / targetConcurrency)) + : fileWorkerLimit; + const limitTarget = pLimit(targetConcurrency); + + // Run target matrix selections through a bounded pool. Each active target + // receives a slice of the worker budget so total in-process case execution + // never multiplies past max_concurrency. const targetResults = await Promise.all( - targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Target selection is suite/experiment/CLI runtime policy; every selected - // target runs every filtered test case for this eval file. - const targetName = selection.targetName; - const applicableTestCases = targetPrep.testCases; - - // --resume / --rerun-failed: skip tests that are already completed - const filteredTestCases = resumeSkipKeys - ? applicableTestCases.filter( - (test) => - !resumeSkipKeys.has( - buildEvalTestTargetKey(test, targetName, targetVariantForSelection(selection)), - ), - ) - : applicableTestCases; - - if (filteredTestCases.length === 0) { - return []; - } + targetPrep.selections.map(({ selection, inlineTargetLabel }) => + limitTarget(async () => { + // Target selection is suite/experiment/CLI runtime policy; every selected + // target runs every filtered test case for this eval file. + const targetName = selection.targetName; + const applicableTestCases = targetPrep.testCases; + + // --resume / --rerun-failed: skip tests that are already completed + const filteredTestCases = resumeSkipKeys + ? applicableTestCases.filter((test) => { + const variant = targetVariantForSelection(selection); + return ( + !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) && + !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant)) + ); + }) + : applicableTestCases; + + if (filteredTestCases.length === 0) { + return []; + } - try { - const runGroups = groupTestsByRunPolicy({ - tests: filteredTestCases, - options: fileOptions, - defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig, - defaultThreshold: targetPrep.threshold ?? fileOptions.threshold, - defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds, - defaultBudgetUsd: targetPrep.budgetUsd, - }); - const groupResults: EvaluationResult[] = []; - for (const group of runGroups) { - hasScopedRunPolicies ||= group.policy.hasScopedOverride; - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, + try { + const runGroups = groupTestsByRunPolicy({ + tests: filteredTestCases, options: fileOptions, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: fileOptions.workers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: group.tests, - trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig, - agentTimeoutSeconds: group.policy.timeoutSeconds, - matrixMode: targetPrep.selections.length > 1, - budgetUsd: group.policy.budgetUsd, - runBudgetTracker: fileBudgetTracker, - failOnError: targetPrep.failOnError, - threshold: group.policy.threshold, - providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, - }); - groupResults.push(...result.results); - } - const evalFile = path.relative(cwd, testFilePath); - const existingSummary = remoteEvalSummaries.find( - (summary) => summary.evalFile === evalFile, - ); - if (existingSummary) { - existingSummary.results.push(...groupResults); - } else { - remoteEvalSummaries.push({ - evalFile, - results: [...groupResults], + defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig, + defaultThreshold: targetPrep.threshold ?? fileOptions.threshold, + defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds, + defaultBudgetUsd: targetPrep.budgetUsd, }); - } - - return groupResults; - } catch (fileError) { - // before_all or other setup failures should not abort the entire run. - // Mark all tests in this file as errors and continue with other files. - const message = fileError instanceof Error ? fileError.message : String(fileError); - console.error( - `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, - ); - const explicitVariant = targetVariantForSelection(selection); - const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => - withSourceMetadata( - { - timestamp: new Date().toISOString(), - testId: testCase.id, - score: 0, - assertions: [], - output: message, - trace: buildTraceFromMessages({ - input: testCase.input as EvaluationResult['input'], - output: [{ role: 'assistant' as const, content: message }], - finalOutput: message, - target: selection.targetName, + const groupResults: EvaluationResult[] = []; + for (const group of runGroups) { + hasScopedRunPolicies ||= group.policy.hasScopedOverride; + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options: fileOptions, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perTargetWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: group.tests, + trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig, + agentTimeoutSeconds: group.policy.timeoutSeconds, + matrixMode: targetPrep.selections.length > 1, + budgetUsd: group.policy.budgetUsd, + runBudgetTracker: fileBudgetTracker, + failOnError: targetPrep.failOnError, + threshold: group.policy.threshold, + providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, + }); + groupResults.push(...result.results); + } + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...groupResults); + } else { + remoteEvalSummaries.push({ + evalFile, + results: [...groupResults], + }); + } + + return groupResults; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error( + `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, + ); + const explicitVariant = targetVariantForSelection(selection); + const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => + withSourceMetadata( + { + timestamp: new Date().toISOString(), testId: testCase.id, - conversationId: testCase.conversation_id, + score: 0, + assertions: [], + output: message, + trace: buildTraceFromMessages({ + input: testCase.input as EvaluationResult['input'], + output: [{ role: 'assistant' as const, content: message }], + finalOutput: message, + target: selection.targetName, + testId: testCase.id, + conversationId: testCase.conversation_id, + error: message, + }), + scores: [], error: message, - }), - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0 }, - target: selection.targetName, - variant: explicitVariant, - }, - testFilePath, - fileOptions, - ), - ); - for (const errResult of errorResults) { - await outputWriter.append(errResult); + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0 }, + target: selection.targetName, + variant: explicitVariant, + }, + testFilePath, + fileOptions, + ), + ); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; } - return errorResults; - } - }), + }), + ), ); for (const results of targetResults) { allResults.push(...results); @@ -2646,7 +2701,7 @@ export async function runEvalCommand( const relativeRunDir = path.relative(cwd, runDir); console.log( `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` + - ` agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`, + ` agentv eval run ${evalFileArgs}${targetFlag} --rerun-failed ${relativeRunDir}`, ); } diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index 9a4617eb5..3fb397621 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -171,6 +171,9 @@ function validateResumeOptions(req: RunEvalRequest): string | undefined { if (modes.length > 1) { return `resume, rerun_failed, and retry_errors are mutually exclusive (got: ${modes.join(', ')})`; } + if (req.rerun_failed && !req.output?.trim()) { + return 'rerun_failed requires output to identify the prior run workspace'; + } return undefined; } @@ -230,7 +233,7 @@ function buildCliArgs(req: RunEvalRequest, experiment?: string): string[] { args.push('--resume'); } if (req.rerun_failed) { - args.push('--rerun-failed'); + args.push('--rerun-failed', req.output?.trim() ?? ''); } if (req.retry_errors?.trim()) { args.push('--retry-errors', req.retry_errors.trim()); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 98a08f30b..9d301496a 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -3956,7 +3956,7 @@ describe('serve app', () => { }); expect(res.status).toBe(202); const data = (await res.json()) as { command: string }; - expect(data.command).toContain('--rerun-failed'); + expect(data.command).toContain('--rerun-failed .agentv/results/r1'); expect(data.command).toContain('--output .agentv/results/r1'); }); @@ -4140,7 +4140,7 @@ describe('serve app', () => { }); expect(res.status).toBe(200); const data = (await res.json()) as { command: string }; - expect(data.command).toContain('--rerun-failed'); + expect(data.command).toContain('--rerun-failed .agentv/results/r1'); expect(data.command).not.toContain('--resume'); }); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 00d49a159..5fc435e85 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -931,6 +931,101 @@ describe('agentv eval CLI', () => { } }, 30_000); + it('reruns failed rows from a canonical run id', async () => { + const fixture = await createFixture(); + try { + const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run'); + const first = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--output', + priorRunDir, + '--threshold', + '0.8', + ]); + expect(first.exitCode).toBe(1); + const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + await writeFile( + priorIndexPath, + `${priorRows + .map((row) => + JSON.stringify({ + ...row, + execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok', + }), + ) + .join('\n')}\n`, + 'utf8', + ); + + const second = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--rerun-failed', + 'prior-run', + '--threshold', + '0.8', + ]); + expect(second.exitCode).toBe(1); + expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + expect(calls.at(-1)).toMatchObject({ + evalCaseIds: ['case-alpha'], + }); + + const rows = await readJsonLines(priorIndexPath); + expect(rows).toHaveLength(3); + expect((rows.at(-1) as Record).test_id).toBe('case-alpha'); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + + it('does not multiply max_concurrency across target matrix selections', async () => { + const fixture = await createFixture(); + try { + const evalPath = path.join(fixture.suiteDir, 'target-matrix.eval.yaml'); + await writeFile( + evalPath, + [ + 'name: target-matrix', + 'target: file-target', + 'tests:', + ' - id: first-case', + ' input: first', + ' criteria: ok', + ' - id: second-case', + ' input: second', + ' criteria: ok', + '', + ].join('\n'), + 'utf8', + ); + + const { exitCode } = await runCli(fixture, [ + 'eval', + evalPath, + '--workers', + '2', + '--target', + 'file-target', + '--target', + 'cli-target', + ]); + + expect(exitCode).toBe(0); + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + expect(calls).toHaveLength(2); + expect(calls.map((call) => call.maxConcurrency)).toEqual([1, 1]); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('records CLI-named experiment namespace separately from default runtime config', async () => { const fixture = await createFixture(); try { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 5c30221ff..4103e4eaf 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -64,6 +64,8 @@ interface EvaluationResultLike { readonly timestamp: string; } +let diagnosticsWriteQueue: Promise = Promise.resolve(); + function evalCaseIds(evalCases: ReadonlyArray | undefined): readonly string[] { if (!Array.isArray(evalCases) || evalCases.length === 0) { return ['case-alpha', 'case-beta']; @@ -210,17 +212,20 @@ async function maybeWriteDiagnostics( resultCount: results.length, } satisfies Record; - const priorCalls = await readFile(diagnosticsPath, 'utf8') - .then((raw) => { - const parsed = JSON.parse(raw) as { readonly calls?: unknown }; - return Array.isArray(parsed.calls) ? parsed.calls : [parsed]; - }) - .catch(() => []); - await writeFile( - diagnosticsPath, - JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2), - 'utf8', - ); + diagnosticsWriteQueue = diagnosticsWriteQueue.then(async () => { + const priorCalls = await readFile(diagnosticsPath, 'utf8') + .then((raw) => { + const parsed = JSON.parse(raw) as { readonly calls?: unknown }; + return Array.isArray(parsed.calls) ? parsed.calls : [parsed]; + }) + .catch(() => []); + await writeFile( + diagnosticsPath, + JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2), + 'utf8', + ); + }); + await diagnosticsWriteQueue; } async function maybeWritePromptDump( diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index dc881d48b..5d8a14633 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -296,7 +296,7 @@ agentv eval evals/my-eval.yaml --export-otel ### Parallelism -The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts. +The `--workers N` flag controls the in-process worker pool for a single eval file (default: 3). Eval files always run sequentially — one file completes before the next starts. In target-matrix runs, selected targets share that worker budget instead of each target creating its own full pool. ```bash agentv eval evals/my-eval.yaml --workers 4 @@ -304,6 +304,9 @@ agentv eval evals/my-eval.yaml --workers 4 agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 # Files run one at a time; within each file, up to 3 test cases run in parallel + +agentv eval evals/my-eval.yaml --target gpt --target claude --workers 4 +# The target matrix shares the same 4-worker budget ``` This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration. @@ -353,10 +356,10 @@ AgentV ships three flags for picking up a partial run. They differ only in **whi | Flag | What it skips | What it re-runs | Use when | |------|---------------|-----------------|----------| | `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish | -| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing | +| `--rerun-failed ` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing | | `--retry-errors ` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to | -`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output ` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`). +`--resume` appends to the existing `index.jsonl` in `--output `; when omitted it defaults to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. `--rerun-failed ` reads a specific canonical run bundle from `.agentv/results/` and, when `--output` is omitted, appends replacement rows to that same bundle. You can also pass a run workspace path or `index.jsonl` path instead of a bare run ID. `--retry-errors` takes the prior run's path directly and re-runs only execution errors or missing cases. ```bash # Resume the last run — no args needed; AgentV finds it from .agentv/cache.json @@ -365,8 +368,8 @@ agentv eval evals/my-eval.yaml --resume # Or target a specific run dir explicitly agentv eval evals/my-eval.yaml --output .agentv/results/ --resume -# Re-run errors AND failed cases against the last run dir -agentv eval evals/my-eval.yaml --rerun-failed +# Re-run errors AND failed cases from a specific canonical run +agentv eval evals/my-eval.yaml --rerun-failed # Re-run only execution errors from any prior run by path agentv eval evals/my-eval.yaml --retry-errors .agentv/results//index.jsonl diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 685a1f801..5eba56da6 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -132,6 +132,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. +Before a slot is reused for another case, AgentV resets it to the slot baseline. A pooled workspace is a performance cache, not shared mutable state between cases. + **Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above. ## Drift detection diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 26fbda0f6..29cdd0382 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1209,13 +1209,17 @@ export async function runEvaluation( // shared owner prepare without inheriting a child suite's workspace. const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup); const testPoolSlot = - usesSharedWorkspace && availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined; + usesSharedWorkspace && availablePoolSlots.length > 0 + ? availablePoolSlots.pop() + : usesSharedWorkspace + ? poolSlot + : undefined; const testWorkspacePath = usesSharedWorkspace ? (testPoolSlot?.path ?? sharedWorkspacePath) : undefined; const testBaselineCommit = usesSharedWorkspace ? testPoolSlot - ? poolSlotBaselines.get(testPoolSlot.path) + ? (poolSlotBaselines.get(testPoolSlot.path) ?? sharedBaselineCommit) : sharedBaselineCommit : undefined; @@ -1323,9 +1327,30 @@ export async function runEvaluation( } throw error; } finally { - // Return pool slot for reuse by next test + // Return pool slot for reuse by next test only after resetting it to + // the per-slot baseline. Pooling is a local performance optimization, + // not shared state between eval cases. if (testPoolSlot) { - availablePoolSlots.push(testPoolSlot); + const shouldReturnPoolSlot = testPoolSlot !== poolSlot; + const resetMode = workspaceClean === 'full' ? 'strict' : 'fast'; + let resetSucceeded = true; + try { + if (repoManager && suiteWorkspace?.repos?.length) { + await repoManager.reset(suiteWorkspace.repos, testPoolSlot.path, resetMode); + } + await resetWorkspaceRoot(testPoolSlot.path, resetMode, testBaselineCommit); + } catch (resetError) { + resetSucceeded = false; + if (verbose) { + const message = resetError instanceof Error ? resetError.message : String(resetError); + console.warn( + `Warning: failed to reset workspace pool slot ${testPoolSlot.index}; leaving it out of reuse: ${message}`, + ); + } + } + if (resetSucceeded && shouldReturnPoolSlot) { + availablePoolSlots.push(testPoolSlot); + } } } } diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 40dd3eb45..f12b5817a 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -1997,6 +1997,13 @@ function indexRecordReplacementKey(record: unknown): string | undefined { return projectionIdentityRecordKey(record) ?? indexRecordKey(record); } +function indexRecordReplacementKeys(record: unknown): readonly string[] { + const keys = [projectionIdentityRecordKey(record), indexRecordKey(record)].filter( + (key): key is string => typeof key === 'string' && key.length > 0, + ); + return Array.from(new Set(keys)); +} + function projectionIdentityRecordKey(record: unknown): string | undefined { if (!isRecord(record) || !isRecord(record.projection_identity)) { return undefined; @@ -2086,10 +2093,9 @@ async function rewriteExistingIndexRecords( } const replacementsByKey = new Map( - replacements.flatMap((record) => { - const key = indexRecordReplacementKey(record); - return key ? [[key, record] as const] : []; - }), + replacements.flatMap((record) => + indexRecordReplacementKeys(record).map((key) => [key, record] as const), + ), ); const seen = new Set(); const records: unknown[] = []; @@ -2103,7 +2109,9 @@ async function rewriteExistingIndexRecords( const replacement = key ? replacementsByKey.get(key) : undefined; if (key && replacement) { records.push(replacement); - seen.add(key); + for (const replacementKey of indexRecordReplacementKeys(replacement)) { + seen.add(replacementKey); + } } else { records.push(parsed); } @@ -2111,8 +2119,8 @@ async function rewriteExistingIndexRecords( } for (const replacement of replacements) { - const key = indexRecordReplacementKey(replacement); - if (!key || !seen.has(key)) { + const keys = indexRecordReplacementKeys(replacement); + if (keys.length === 0 || keys.every((key) => !seen.has(key))) { records.push(replacement); } } diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 73c544135..a67f8553b 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1,10 +1,12 @@ import { afterEach, describe, expect, it, mock } from 'bun:test'; +import { execSync } from 'node:child_process'; import { existsSync, mkdirSync, mkdtempSync, readFileSync, readdirSync, + rmSync, writeFileSync, } from 'node:fs'; import { tmpdir } from 'node:os'; @@ -151,6 +153,31 @@ const baseTarget: ResolvedTarget = { config: { response: '{}' }, }; +function cleanGitEnv(): Record { + const env: Record = {}; + for (const [key, value] of Object.entries(process.env)) { + if (value !== undefined && !(key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND')) { + env[key] = value; + } + } + return env; +} + +function createTestRepo(dir: string, files: Record): string { + mkdirSync(dir, { recursive: true }); + const opts = { cwd: dir, stdio: 'ignore' as const, env: cleanGitEnv() }; + execSync('git init', opts); + execSync('git config user.email "test@test.com"', opts); + execSync('git config user.name "Test"', opts); + for (const [name, content] of Object.entries(files)) { + const filePath = path.join(dir, name); + mkdirSync(path.dirname(filePath), { recursive: true }); + writeFileSync(filePath, content); + } + execSync('git add -A && git commit -m "initial"', opts); + return execSync('git rev-parse HEAD', { cwd: dir, env: cleanGitEnv() }).toString().trim(); +} + const evaluatorRegistry = { 'llm-grader': { kind: 'llm-grader', @@ -638,6 +665,114 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.score).toBeGreaterThan(0); }); + it('does not retry completed quality failures', async () => { + const provider = new SequenceProvider('mock', { + responses: [ + { + output: [{ role: 'assistant', content: 'Incomplete response.' }], + }, + ], + }); + const failingEvaluators = { + 'llm-grader': { + kind: 'llm-grader', + async evaluate() { + return { + score: 0.1, + verdict: 'fail' as const, + assertions: [{ text: 'quality miss', passed: false }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: failingEvaluators, + maxRetries: 3, + }); + + expect(provider.callIndex).toBe(1); + expect(result.executionStatus).toBe('quality_failure'); + expect(result.retryIndex).toBe(0); + }); + + it('resets a pooled workspace slot before reusing it for the next case', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-')); + const previousAgentvHome = process.env.AGENTV_HOME; + const previousAgentvDataDir = process.env.AGENTV_DATA_DIR; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data'); + + try { + const sourceRepo = path.join(tempDir, 'source-repo'); + const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' }); + const workspace = { + repos: [ + { + path: './repo-a', + repo: `file://${sourceRepo}`, + commit: cleanCommit, + }, + ], + }; + const seenStaleBeforeSecond: boolean[] = []; + let callCount = 0; + const provider: Provider = { + id: 'mock:pooled-reset', + kind: 'mock' as const, + targetName: 'pooled-reset', + async invoke(request: ProviderRequest): Promise { + callCount += 1; + if (!request.cwd) { + throw new Error('missing cwd'); + } + const repoDir = path.join(request.cwd, 'repo-a'); + if (callCount === 1) { + writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n'); + writeFileSync(path.join(repoDir, 'stale.txt'), 'stale\n'); + } else { + seenStaleBeforeSecond.push(existsSync(path.join(repoDir, 'stale.txt'))); + expect(readFileSync(path.join(repoDir, 'tracked.txt'), 'utf8')).toBe('clean\n'); + } + return { output: [{ role: 'assistant', content: `response ${callCount}` }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: path.join(tempDir, 'eval.yaml'), + repoRoot: tempDir, + target: { ...baseTarget, name: 'pooled-reset' }, + providerFactory: () => provider, + evaluators: evaluatorRegistry, + workspaceMode: 'pooled', + maxConcurrency: 1, + evalCases: [ + { ...baseTestCase, id: 'case-1', workspace }, + { ...baseTestCase, id: 'case-2', workspace }, + ], + }); + + expect(results).toHaveLength(2); + expect(seenStaleBeforeSecond).toEqual([false]); + } finally { + if (previousAgentvHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousAgentvHome; + } + if (previousAgentvDataDir === undefined) { + process.env.AGENTV_DATA_DIR = undefined; + } else { + process.env.AGENTV_DATA_DIR = previousAgentvDataDir; + } + rmSync(tempDir, { recursive: true, force: true }); + } + }, 30_000); + it('applies exponential backoff between retries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Transient failure')], From 7242c84725f7f4d556f78ba29be2aa04e9a7e2e0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 15:09:16 +0200 Subject: [PATCH 4/4] fix(eval): constrain rerun-failed identities --- apps/cli/src/commands/eval/run-eval.ts | 191 ++++++++++++++++-- apps/cli/test/eval.integration.test.ts | 164 +++++++++++++-- apps/cli/test/fixtures/mock-run-evaluation.ts | 1 + 3 files changed, 324 insertions(+), 32 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 6dd0afc36..e2f6cc765 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -105,6 +105,137 @@ function shouldSkipExistingResultForResume( return result.executionStatus !== 'execution_error'; } +interface ResumeIdentityEntry { + readonly kind: 'precise' | 'legacy'; + readonly key: string; + readonly result: EvaluationResult; +} + +interface ResumeIdentityMatcher { + readonly preciseKeys: Set; + readonly legacyKeys: Set; +} + +function hasNonEmptyString(value: unknown): value is string { + return typeof value === 'string' && value.trim().length > 0; +} + +function objectRecord(value: unknown): Record | undefined { + return typeof value === 'object' && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function resultProjectionDimensions(result: EvaluationResult): Record | undefined { + const projectionIdentity = objectRecord( + (result as unknown as Record).projectionIdentity, + ); + return objectRecord(projectionIdentity?.dimensions); +} + +function hasCanonicalResultIdentity(result: EvaluationResult): boolean { + const source = result.source; + const dimensions = resultProjectionDimensions(result); + const resultRecord = result as unknown as Record; + return ( + hasNonEmptyString(dimensions?.evalPath) || + hasNonEmptyString(dimensions?.suite) || + hasNonEmptyString(dimensions?.promptId) || + hasNonEmptyString(resultRecord.evalPath) || + hasNonEmptyString(source?.evalFileRepoPath) || + hasNonEmptyString(source?.evalFilePath) || + hasNonEmptyString(source?.evalFileAbsolutePath) || + hasNonEmptyString(result.suite) || + hasNonEmptyString(result.prompt?.id) + ); +} + +function resultResumeIdentityEntry(result: EvaluationResult): ResumeIdentityEntry { + if (hasCanonicalResultIdentity(result)) { + return { + kind: 'precise', + key: buildEvaluationResultTargetKey(result), + result, + }; + } + return { + kind: 'legacy', + key: buildTestTargetKey(result.testId, result.target, result.variant), + result, + }; +} + +function latestResumeIdentityEntries( + results: readonly EvaluationResult[], +): readonly ResumeIdentityEntry[] { + const latestByIdentity = new Map(); + for (const result of results) { + const entry = resultResumeIdentityEntry(result); + latestByIdentity.set(`${entry.kind}:${entry.key}`, entry); + } + return Array.from(latestByIdentity.values()); +} + +function createResumeIdentityMatcher(): ResumeIdentityMatcher { + return { preciseKeys: new Set(), legacyKeys: new Set() }; +} + +function addResumeIdentityEntry(matcher: ResumeIdentityMatcher, entry: ResumeIdentityEntry): void { + if (entry.kind === 'legacy') { + matcher.legacyKeys.add(entry.key); + return; + } + matcher.preciseKeys.add(entry.key); +} + +function uniqueStrings(values: readonly (string | undefined)[]): string[] { + return Array.from(new Set(values.filter(hasNonEmptyString))); +} + +function buildPlannedResumeIdentityKeys( + test: EvalTest, + target: string, + variant: string | undefined, +): readonly string[] { + const keys = new Set([buildEvalTestTargetKey(test, target, variant)]); + const evalPaths = uniqueStrings([ + test.source?.evalFileRepoPath, + test.source?.evalFilePath, + test.source?.evalFileAbsolutePath, + ]); + const suites = Array.from(new Set([test.suite ?? null, null])); + + for (const evalPath of evalPaths) { + for (const suite of suites) { + keys.add( + JSON.stringify({ + eval_path: evalPath, + suite, + test_id: test.id ?? 'unknown', + prompt_id: test.prompt?.id ?? null, + target: target ?? 'unknown', + variant: variant ?? null, + }), + ); + } + } + + return Array.from(keys); +} + +function resumeIdentityMatches( + matcher: ResumeIdentityMatcher, + test: EvalTest, + target: string, + variant: string | undefined, +): boolean { + return ( + buildPlannedResumeIdentityKeys(test, target, variant).some((key) => + matcher.preciseKeys.has(key), + ) || matcher.legacyKeys.has(buildTestTargetKey(test.id, target, variant)) + ); +} + interface RunEvalCommandInput { readonly testFiles: readonly string[]; readonly rawOptions: Record; @@ -1864,9 +1995,10 @@ export async function runEvalCommand( } } - // --resume / --rerun-failed: skip already-completed tests and append to existing output. + // --resume skips completed rows; --rerun-failed includes only latest failed/error rows. // IMPORTANT: JSONL must be loaded before the output writer is created (same file). - let resumeSkipKeys: Set | undefined; + let resumeSkipKeys: ResumeIdentityMatcher | undefined; + let rerunIncludeKeys: ResumeIdentityMatcher | undefined; let isResumeAppend = false; if (options.resume && !options.retryErrors) { const sourceRunDir = options.rerunFailedSource @@ -1883,13 +2015,15 @@ export async function runEvalCommand( const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir); if (resumeIndexPaths.length > 0) { const existingResults = await readExistingResultsFromRunDir(sourceRunDir); - resumeSkipKeys = new Set(); + resumeSkipKeys = createResumeIdentityMatcher(); + rerunIncludeKeys = options.rerunFailed ? createResumeIdentityMatcher() : undefined; let completedResultCount = 0; - for (const r of existingResults) { - if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + for (const entry of latestResumeIdentityEntries(existingResults)) { + if (shouldSkipExistingResultForResume(entry.result, options.rerunFailed)) { completedResultCount += 1; - resumeSkipKeys.add(buildEvaluationResultTargetKey(r)); - resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant)); + addResumeIdentityEntry(resumeSkipKeys, entry); + } else if (rerunIncludeKeys) { + addResumeIdentityEntry(rerunIncludeKeys, entry); } } isResumeAppend = @@ -1899,6 +2033,9 @@ export async function runEvalCommand( console.log( `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`, ); + } else if (options.rerunFailed) { + rerunIncludeKeys = createResumeIdentityMatcher(); + console.log('Rerun-failed: no existing bundle run manifest found. Nothing to rerun.'); } else { // No existing bundle manifest — behave like a normal run. console.log('Resume: no existing bundle run manifest found, starting fresh run.'); @@ -2153,9 +2290,13 @@ export async function runEvalCommand( for (const { selection } of meta.selections) { const target = selection.targetName; const variant = targetVariantForSelection(selection); - const key = buildEvalTestTargetKey(test, target, variant); - const fallbackKey = buildTestTargetKey(test.id, target, variant); - if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) { + if (rerunIncludeKeys) { + if (resumeIdentityMatches(rerunIncludeKeys, test, target, variant)) { + totalEvalCount++; + } else { + resumeSkippedCount++; + } + } else if (resumeSkipKeys && resumeIdentityMatches(resumeSkipKeys, test, target, variant)) { resumeSkippedCount++; } else { totalEvalCount++; @@ -2170,6 +2311,10 @@ export async function runEvalCommand( console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); return; } + if (rerunIncludeKeys) { + console.log('Nothing to rerun — no failed or errored test(s) matched the current suite.'); + return; + } // When using --resume, all tests being completed means nothing to resume if (resumeSkipKeys && resumeSkippedCount > 0) { console.log(`Nothing to resume — all ${resumeSkippedCount} test(s) already completed.`); @@ -2400,16 +2545,22 @@ export async function runEvalCommand( const targetName = selection.targetName; const applicableTestCases = targetPrep.testCases; - // --resume / --rerun-failed: skip tests that are already completed - const filteredTestCases = resumeSkipKeys - ? applicableTestCases.filter((test) => { - const variant = targetVariantForSelection(selection); - return ( - !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) && - !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant)) - ); - }) - : applicableTestCases; + // --resume skips completed tests; --rerun-failed only includes prior failed/error tests. + const filteredTestCases = rerunIncludeKeys + ? applicableTestCases.filter((test) => + resumeIdentityMatches( + rerunIncludeKeys, + test, + targetName, + targetVariantForSelection(selection), + ), + ) + : resumeSkipKeys + ? applicableTestCases.filter((test) => { + const variant = targetVariantForSelection(selection); + return !resumeIdentityMatches(resumeSkipKeys, test, targetName, variant); + }) + : applicableTestCases; if (filteredTestCases.length === 0) { return []; diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 5fc435e85..fa6320b3b 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -931,7 +931,7 @@ describe('agentv eval CLI', () => { } }, 30_000); - it('reruns failed rows from a canonical run id', async () => { + it('reruns only latest failed rows from a canonical run id', async () => { const fixture = await createFixture(); try { const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run'); @@ -946,18 +946,61 @@ describe('agentv eval CLI', () => { expect(first.exitCode).toBe(1); const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + const alphaRow = priorRows.find((row) => row.test_id === 'case-alpha'); + const betaRow = priorRows.find((row) => row.test_id === 'case-beta'); + if (!alphaRow || !betaRow) { + throw new Error('Expected prior rows for case-alpha and case-beta'); + } await writeFile( priorIndexPath, - `${priorRows - .map((row) => - JSON.stringify({ - ...row, - execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok', - }), - ) + `${[ + ...priorRows.map((row) => ({ + ...row, + execution_status: row.test_id === 'case-alpha' ? 'ok' : 'quality_failure', + })), + { ...alphaRow, execution_status: 'quality_failure' }, + { ...betaRow, execution_status: 'ok' }, + ] + .map((row) => JSON.stringify(row)) .join('\n')}\n`, 'utf8', ); + await writeFile( + fixture.testFilePath, + `description: CLI integration test +target: file-target + +tests: + - id: case-alpha + criteria: System responds with alpha + input: + - role: user + content: | + Please respond with alpha + expected_output: + - role: assistant + content: "Alpha" + - id: case-beta + criteria: System responds with beta + input: + - role: user + content: | + Please respond with beta + expected_output: + - role: assistant + content: "Beta" + - id: case-gamma + criteria: System responds with gamma + input: + - role: user + content: | + Please respond with gamma + expected_output: + - role: assistant + content: "Gamma" +`, + 'utf8', + ); const second = await runCli(fixture, [ 'eval', @@ -965,10 +1008,10 @@ describe('agentv eval CLI', () => { '--rerun-failed', 'prior-run', '--threshold', - '0.8', + '0.5', ]); - expect(second.exitCode).toBe(1); - expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + expect(second.exitCode).toBe(0); + expect(second.stdout).toContain('Rerun-failed: found 4 existing result(s), skipping 1'); const diagnostics = await readDiagnostics(fixture); const calls = diagnostics.calls as Array>; @@ -977,13 +1020,110 @@ describe('agentv eval CLI', () => { }); const rows = await readJsonLines(priorIndexPath); - expect(rows).toHaveLength(3); + expect(rows).toHaveLength(5); expect((rows.at(-1) as Record).test_id).toBe('case-alpha'); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } }, 30_000); + it('does not use coarse fallback keys for precise rerun-failed identities', async () => { + const fixture = await createFixture(); + try { + const firstEvalPath = path.join(fixture.suiteDir, 'collision-a.eval.yaml'); + const secondEvalPath = path.join(fixture.suiteDir, 'collision-b.eval.yaml'); + const evalContent = (name: string) => `description: ${name} +target: file-target + +tests: + - id: shared-case + criteria: System responds + input: + - role: user + content: | + Please respond for ${name} + expected_output: + - role: assistant + content: "Shared" +`; + await writeFile(firstEvalPath, evalContent('collision a'), 'utf8'); + await writeFile(secondEvalPath, evalContent('collision b'), 'utf8'); + + const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-collision'); + const first = await runCli(fixture, [ + 'eval', + firstEvalPath, + '--output', + priorRunDir, + '--threshold', + '0.8', + ]); + expect(first.exitCode).toBe(0); + + const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + expect(priorRows).toHaveLength(1); + const baseRow = priorRows[0]; + if (!baseRow) { + throw new Error('Expected one prior collision row'); + } + const baseProjection = baseRow.projection_identity as Record; + const baseDimensions = baseProjection.dimensions as Record; + const secondProjection = { + ...baseProjection, + dimensions: { + ...baseDimensions, + eval_path: secondEvalPath, + }, + }; + await writeFile( + priorIndexPath, + `${[ + { ...baseRow, execution_status: 'ok' }, + { + ...baseRow, + projection_identity: secondProjection, + execution_status: 'quality_failure', + }, + ] + .map((row) => JSON.stringify(row)) + .join('\n')}\n`, + 'utf8', + ); + + const second = await runCli(fixture, [ + 'eval', + firstEvalPath, + secondEvalPath, + '--rerun-failed', + 'prior-collision', + '--threshold', + '0.8', + ]); + expect(second.exitCode).toBe(0); + expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + const rerunCalls = calls.slice(1); + expect(rerunCalls).toHaveLength(1); + const rerunCall = rerunCalls[0]; + if (!rerunCall) { + throw new Error('Expected one rerun diagnostics call'); + } + expect(path.basename(rerunCall.testFilePath as string)).toBe('collision-b.eval.yaml'); + expect(rerunCall).toMatchObject({ + evalCaseIds: ['shared-case'], + }); + + const rows = await readJsonLines(priorIndexPath); + expect(rows).toHaveLength(3); + expect((rows.at(-1) as Record).test_id).toBe('shared-case'); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('does not multiply max_concurrency across target matrix selections', async () => { const fixture = await createFixture(); try { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 4103e4eaf..c97edf54b 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -173,6 +173,7 @@ async function maybeWriteDiagnostics( } const payload = { + testFilePath: options.testFilePath, target: options.target?.name, targetKind: options.target?.kind, targetModel: