From 6589dc37163595af5e371565442da81a62fb1df4 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 14:21:45 +0200 Subject: [PATCH 1/3] feat(eval): add rerun-failed runner pooling --- apps/cli/src/commands/eval/commands/run.ts | 5 +- apps/cli/src/commands/eval/run-cache.ts | 8 +- apps/cli/src/commands/eval/run-eval.ts | 313 ++++++++++-------- apps/cli/src/commands/results/eval-runner.ts | 5 +- apps/cli/test/commands/results/serve.test.ts | 4 +- apps/cli/test/eval.integration.test.ts | 95 ++++++ apps/cli/test/fixtures/mock-run-evaluation.ts | 27 +- .../docs/docs/evaluation/running-evals.mdx | 13 +- .../docs/docs/guides/workspace-pool.mdx | 2 + packages/core/src/evaluation/orchestrator.ts | 33 +- packages/core/src/evaluation/run-artifacts.ts | 22 +- .../core/test/evaluation/orchestrator.test.ts | 135 ++++++++ 12 files changed, 497 insertions(+), 165 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index a078edc06..09b0f80ef 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -168,10 +168,11 @@ export const evalRunCommand = command({ description: 'Resume an interrupted run: skip already-completed tests and append new results to --output dir', }), - rerunFailed: flag({ + rerunFailed: option({ + type: optional(string), long: 'rerun-failed', description: - 'Rerun failed/errored tests while keeping passing results. Implies --resume semantics', + 'Run ID, run workspace, or index.jsonl to rerun failed/errored tests while keeping passing results', }), strict: flag({ long: 'strict', diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts index 342fa8429..d2e8c7b85 100644 --- a/apps/cli/src/commands/eval/run-cache.ts +++ b/apps/cli/src/commands/eval/run-cache.ts @@ -54,10 +54,10 @@ export async function loadRunCache(cwd: string): Promise { /** * Resolve the cached last-run directory for a cwd, if it still exists on disk. * Returns undefined when there is no cache, the cache lacks a `lastRunDir`, - * or the directory has since been deleted. Used by `--resume` / `--rerun-failed` - * to default `--output` to the most recent run when no explicit dir is given, - * matching the convention used by promptfoo (`--resume [evalId]`) and - * OpenCompass (`-r [timestamp]`). + * or the directory has since been deleted. Used by `--resume` to default + * `--output` to the most recent run when no explicit dir is given, matching + * the convention used by promptfoo (`--resume [evalId]`) and OpenCompass + * (`-r [timestamp]`). */ export async function resolveCachedRunDir(cwd: string): Promise { const cache = await loadRunCache(cwd); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 7fb3bbc0d..31ac0cab4 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -3,6 +3,7 @@ import { access, readFile } from 'node:fs/promises'; import { createRequire as createNodeRequire } from 'node:module'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; +import pLimit from 'p-limit'; import { DEFAULT_THRESHOLD, @@ -54,6 +55,7 @@ import { aggregateRunDir, buildEvalTestTargetKey, buildEvaluationResultTargetKey, + buildTestTargetKey, deduplicateByTestIdTarget, parseJsonlResults, writeArtifactsFromResults, @@ -135,6 +137,7 @@ interface NormalizedOptions { readonly retryErrors?: string; readonly resume: boolean; readonly rerunFailed: boolean; + readonly rerunFailedSource?: string; readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly keepWorkspaces: boolean; @@ -609,8 +612,10 @@ function normalizeOptions( otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true, retryErrors: normalizeString(rawOptions.retryErrors), - resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed), - rerunFailed: normalizeBoolean(rawOptions.rerunFailed), + resume: + normalizeBoolean(rawOptions.resume) || normalizeString(rawOptions.rerunFailed) !== undefined, + rerunFailed: normalizeString(rawOptions.rerunFailed) !== undefined, + rerunFailedSource: normalizeString(rawOptions.rerunFailed), workspaceMode, workspacePath, // Precedence: CLI > YAML config > TS config @@ -1164,6 +1169,27 @@ async function readExistingResultsFromRunDir(runDir: string): Promise { + const trimmed = source.trim(); + if (!trimmed) { + throw new Error('--rerun-failed requires a run ID, run workspace, or index.jsonl path.'); + } + + const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed); + if (existsSync(candidate)) { + return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate; + } + + const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed); + if (existsSync(runIdCandidate)) { + return runIdCandidate; + } + + throw new Error( + `Run not found for --rerun-failed: ${source}. Expected a run ID under .agentv/results, a run workspace, or an index.jsonl path.`, + ); +} + async function prepareFileMetadata(params: { readonly testFilePath: string; readonly repoRoot: string; @@ -1830,17 +1856,16 @@ export async function runEvalCommand( } } - // --resume / --rerun-failed without an explicit --output: default to the + // --resume without an explicit --output: default to the // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" // convention. The cache pointer is written by saveRunCache after every eval. - if (options.resume && !options.retryErrors && !options.outputDir) { + if (options.resume && !options.rerunFailedSource && !options.retryErrors && !options.outputDir) { const cachedDir = await resolveCachedRunDir(cwd); if (cachedDir) { options = { ...options, outputDir: cachedDir }; - const flagLabel = options.rerunFailed ? 'rerun-failed' : 'resume'; const displayDir = path.relative(cwd, cachedDir) || cachedDir; - console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`); + console.log(`Auto-detected last run dir for --resume: ${displayDir}`); } } @@ -1849,22 +1874,35 @@ export async function runEvalCommand( let resumeSkipKeys: Set | undefined; let isResumeAppend = false; if (options.resume && !options.retryErrors) { - const explicitResumeDir = options.outputDir; - if (explicitResumeDir) { - const resumeDir = path.resolve(explicitResumeDir); - const resumeIndexPaths = discoverRunManifestPaths(resumeDir); + const sourceRunDir = options.rerunFailedSource + ? await resolveRerunFailedRunDir(cwd, options.rerunFailedSource) + : options.outputDir + ? path.resolve(options.outputDir) + : undefined; + + if (sourceRunDir) { + if (options.rerunFailedSource && !options.outputDir) { + options = { ...options, outputDir: sourceRunDir }; + } + + const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir); if (resumeIndexPaths.length > 0) { - const existingResults = await readExistingResultsFromRunDir(resumeDir); + const existingResults = await readExistingResultsFromRunDir(sourceRunDir); resumeSkipKeys = new Set(); + let completedResultCount = 0; for (const r of existingResults) { if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + completedResultCount += 1; resumeSkipKeys.add(buildEvaluationResultTargetKey(r)); + resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant)); } } - isResumeAppend = true; + isResumeAppend = + options.outputDir !== undefined && + path.resolve(options.outputDir) === path.resolve(sourceRunDir); const modeLabel = options.rerunFailed ? 'Rerun-failed' : 'Resume'; console.log( - `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`, + `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`, ); } else { // No existing bundle manifest — behave like a normal run. @@ -2121,7 +2159,8 @@ export async function runEvalCommand( const target = selection.targetName; const variant = targetVariantForSelection(selection); const key = buildEvalTestTargetKey(test, target, variant); - if (resumeSkipKeys?.has(key)) { + const fallbackKey = buildTestTargetKey(test.id, target, variant); + if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) { resumeSkippedCount++; } else { totalEvalCount++; @@ -2345,127 +2384,143 @@ export async function runEvalCommand( continue; } - // Run all targets concurrently (each target has its own worker limit) + const fileWorkerLimit = Math.max(1, fileOptions.workers ?? DEFAULT_WORKERS); + const targetConcurrency = + targetPrep.selections.length > 1 + ? Math.min(fileWorkerLimit, targetPrep.selections.length) + : 1; + const perTargetWorkers = + targetPrep.selections.length > 1 + ? Math.max(1, Math.floor(fileWorkerLimit / targetConcurrency)) + : fileWorkerLimit; + const limitTarget = pLimit(targetConcurrency); + + // Run target matrix selections through a bounded pool. Each active target + // receives a slice of the worker budget so total in-process case execution + // never multiplies past max_concurrency. const targetResults = await Promise.all( - targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => { - // Target selection is suite/experiment/CLI runtime policy; every selected - // target runs every filtered test case for this eval file. - const targetName = selection.targetName; - const applicableTestCases = targetPrep.testCases; - - // --resume / --rerun-failed: skip tests that are already completed - const filteredTestCases = resumeSkipKeys - ? applicableTestCases.filter( - (test) => - !resumeSkipKeys.has( - buildEvalTestTargetKey(test, targetName, targetVariantForSelection(selection)), - ), - ) - : applicableTestCases; - - if (filteredTestCases.length === 0) { - return []; - } + targetPrep.selections.map(({ selection, inlineTargetLabel }) => + limitTarget(async () => { + // Target selection is suite/experiment/CLI runtime policy; every selected + // target runs every filtered test case for this eval file. + const targetName = selection.targetName; + const applicableTestCases = targetPrep.testCases; + + // --resume / --rerun-failed: skip tests that are already completed + const filteredTestCases = resumeSkipKeys + ? applicableTestCases.filter((test) => { + const variant = targetVariantForSelection(selection); + return ( + !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) && + !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant)) + ); + }) + : applicableTestCases; + + if (filteredTestCases.length === 0) { + return []; + } - try { - const runGroups = groupTestsByRunPolicy({ - tests: filteredTestCases, - options: fileOptions, - defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig, - defaultThreshold: targetPrep.threshold ?? fileOptions.threshold, - defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds, - defaultBudgetUsd: targetPrep.budgetUsd, - }); - const groupResults: EvaluationResult[] = []; - for (const group of runGroups) { - hasScopedRunPolicies ||= group.policy.hasScopedOverride; - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, + try { + const runGroups = groupTestsByRunPolicy({ + tests: filteredTestCases, options: fileOptions, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: fileOptions.workers, - progressReporter, - seenTestCases, - displayIdTracker, - selection, - inlineTargetLabel, - testCases: group.tests, - trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig, - agentTimeoutSeconds: group.policy.timeoutSeconds, - matrixMode: targetPrep.selections.length > 1, - budgetUsd: group.policy.budgetUsd, - runBudgetTracker: fileBudgetTracker, - failOnError: targetPrep.failOnError, - threshold: group.policy.threshold, - providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, - }); - groupResults.push(...result.results); - } - const evalFile = path.relative(cwd, testFilePath); - const existingSummary = remoteEvalSummaries.find( - (summary) => summary.evalFile === evalFile, - ); - if (existingSummary) { - existingSummary.results.push(...groupResults); - } else { - remoteEvalSummaries.push({ - evalFile, - results: [...groupResults], + defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig, + defaultThreshold: targetPrep.threshold ?? fileOptions.threshold, + defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds, + defaultBudgetUsd: targetPrep.budgetUsd, }); - } - - return groupResults; - } catch (fileError) { - // before_all or other setup failures should not abort the entire run. - // Mark all tests in this file as errors and continue with other files. - const message = fileError instanceof Error ? fileError.message : String(fileError); - console.error( - `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, - ); - const explicitVariant = targetVariantForSelection(selection); - const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => - withSourceMetadata( - { - timestamp: new Date().toISOString(), - testId: testCase.testId ?? testCase.id, - prompt: testCase.prompt, - score: 0, - assertions: [], - output: message, - trace: buildTraceFromMessages({ - input: testCase.input as EvaluationResult['input'], - output: [{ role: 'assistant' as const, content: message }], - finalOutput: message, - target: selection.targetName, + const groupResults: EvaluationResult[] = []; + for (const group of runGroups) { + hasScopedRunPolicies ||= group.policy.hasScopedOverride; + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options: fileOptions, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perTargetWorkers, + progressReporter, + seenTestCases, + displayIdTracker, + selection, + inlineTargetLabel, + testCases: group.tests, + trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig, + agentTimeoutSeconds: group.policy.timeoutSeconds, + matrixMode: targetPrep.selections.length > 1, + budgetUsd: group.policy.budgetUsd, + runBudgetTracker: fileBudgetTracker, + failOnError: targetPrep.failOnError, + threshold: group.policy.threshold, + providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory, + }); + groupResults.push(...result.results); + } + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...groupResults); + } else { + remoteEvalSummaries.push({ + evalFile, + results: [...groupResults], + }); + } + + return groupResults; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error( + `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, + ); + const explicitVariant = targetVariantForSelection(selection); + const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => + withSourceMetadata( + { + timestamp: new Date().toISOString(), testId: testCase.testId ?? testCase.id, - conversationId: testCase.conversation_id, + prompt: testCase.prompt, + score: 0, + assertions: [], + output: message, + trace: buildTraceFromMessages({ + input: testCase.input as EvaluationResult['input'], + output: [{ role: 'assistant' as const, content: message }], + finalOutput: message, + target: selection.targetName, + testId: testCase.testId ?? testCase.id, + conversationId: testCase.conversation_id, + error: message, + }), + scores: [], error: message, - }), - scores: [], - error: message, - executionStatus: 'execution_error' as const, - failureStage: 'setup' as const, - failureReasonCode: 'setup_error' as const, - durationMs: 0, - tokenUsage: { input: 0, output: 0 }, - target: selection.targetName, - variant: explicitVariant, - }, - testFilePath, - fileOptions, - ), - ); - for (const errResult of errorResults) { - await outputWriter.append(errResult); + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0 }, + target: selection.targetName, + variant: explicitVariant, + }, + testFilePath, + fileOptions, + ), + ); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; } - return errorResults; - } - }), + }), + ), ); for (const results of targetResults) { allResults.push(...results); @@ -2653,7 +2708,7 @@ export async function runEvalCommand( const relativeRunDir = path.relative(cwd, runDir); console.log( `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` + - ` agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`, + ` agentv eval run ${evalFileArgs}${targetFlag} --rerun-failed ${relativeRunDir}`, ); } diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index 9a4617eb5..3fb397621 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -171,6 +171,9 @@ function validateResumeOptions(req: RunEvalRequest): string | undefined { if (modes.length > 1) { return `resume, rerun_failed, and retry_errors are mutually exclusive (got: ${modes.join(', ')})`; } + if (req.rerun_failed && !req.output?.trim()) { + return 'rerun_failed requires output to identify the prior run workspace'; + } return undefined; } @@ -230,7 +233,7 @@ function buildCliArgs(req: RunEvalRequest, experiment?: string): string[] { args.push('--resume'); } if (req.rerun_failed) { - args.push('--rerun-failed'); + args.push('--rerun-failed', req.output?.trim() ?? ''); } if (req.retry_errors?.trim()) { args.push('--retry-errors', req.retry_errors.trim()); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 77228476d..2d27c4c4b 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -4031,7 +4031,7 @@ describe('serve app', () => { }); expect(res.status).toBe(202); const data = (await res.json()) as { command: string }; - expect(data.command).toContain('--rerun-failed'); + expect(data.command).toContain('--rerun-failed .agentv/results/r1'); expect(data.command).toContain('--output .agentv/results/r1'); }); @@ -4215,7 +4215,7 @@ describe('serve app', () => { }); expect(res.status).toBe(200); const data = (await res.json()) as { command: string }; - expect(data.command).toContain('--rerun-failed'); + expect(data.command).toContain('--rerun-failed .agentv/results/r1'); expect(data.command).not.toContain('--resume'); }); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 419d06d23..a88638f06 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -933,6 +933,101 @@ describe('agentv eval CLI', () => { } }, 30_000); + it('reruns failed rows from a canonical run id', async () => { + const fixture = await createFixture(); + try { + const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run'); + const first = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--output', + priorRunDir, + '--threshold', + '0.8', + ]); + expect(first.exitCode).toBe(1); + const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + await writeFile( + priorIndexPath, + `${priorRows + .map((row) => + JSON.stringify({ + ...row, + execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok', + }), + ) + .join('\n')}\n`, + 'utf8', + ); + + const second = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--rerun-failed', + 'prior-run', + '--threshold', + '0.8', + ]); + expect(second.exitCode).toBe(1); + expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + expect(calls.at(-1)).toMatchObject({ + evalCaseIds: ['case-alpha'], + }); + + const rows = await readJsonLines(priorIndexPath); + expect(rows).toHaveLength(3); + expect((rows.at(-1) as Record).test_id).toBe('case-alpha'); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + + it('does not multiply max_concurrency across target matrix selections', async () => { + const fixture = await createFixture(); + try { + const evalPath = path.join(fixture.suiteDir, 'target-matrix.eval.yaml'); + await writeFile( + evalPath, + [ + 'name: target-matrix', + 'target: file-target', + 'tests:', + ' - id: first-case', + ' input: first', + ' criteria: ok', + ' - id: second-case', + ' input: second', + ' criteria: ok', + '', + ].join('\n'), + 'utf8', + ); + + const { exitCode } = await runCli(fixture, [ + 'eval', + evalPath, + '--workers', + '2', + '--target', + 'file-target', + '--target', + 'cli-target', + ]); + + expect(exitCode).toBe(0); + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + expect(calls).toHaveLength(2); + expect(calls.map((call) => call.maxConcurrency)).toEqual([1, 1]); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('records CLI-named experiment namespace separately from default runtime config', async () => { const fixture = await createFixture(); try { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 5c30221ff..4103e4eaf 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -64,6 +64,8 @@ interface EvaluationResultLike { readonly timestamp: string; } +let diagnosticsWriteQueue: Promise = Promise.resolve(); + function evalCaseIds(evalCases: ReadonlyArray | undefined): readonly string[] { if (!Array.isArray(evalCases) || evalCases.length === 0) { return ['case-alpha', 'case-beta']; @@ -210,17 +212,20 @@ async function maybeWriteDiagnostics( resultCount: results.length, } satisfies Record; - const priorCalls = await readFile(diagnosticsPath, 'utf8') - .then((raw) => { - const parsed = JSON.parse(raw) as { readonly calls?: unknown }; - return Array.isArray(parsed.calls) ? parsed.calls : [parsed]; - }) - .catch(() => []); - await writeFile( - diagnosticsPath, - JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2), - 'utf8', - ); + diagnosticsWriteQueue = diagnosticsWriteQueue.then(async () => { + const priorCalls = await readFile(diagnosticsPath, 'utf8') + .then((raw) => { + const parsed = JSON.parse(raw) as { readonly calls?: unknown }; + return Array.isArray(parsed.calls) ? parsed.calls : [parsed]; + }) + .catch(() => []); + await writeFile( + diagnosticsPath, + JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2), + 'utf8', + ); + }); + await diagnosticsWriteQueue; } async function maybeWritePromptDump( diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 59c0d8366..2639427a5 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -296,7 +296,7 @@ agentv eval evals/my-eval.yaml --export-otel ### Parallelism -The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts. +The `--workers N` flag controls the in-process worker pool for a single eval file (default: 3). Eval files always run sequentially — one file completes before the next starts. In target-matrix runs, selected targets share that worker budget instead of each target creating its own full pool. ```bash agentv eval evals/my-eval.yaml --workers 4 @@ -304,6 +304,9 @@ agentv eval evals/my-eval.yaml --workers 4 agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3 # Files run one at a time; within each file, up to 3 test cases run in parallel + +agentv eval evals/my-eval.yaml --target gpt --target claude --workers 4 +# The target matrix shares the same 4-worker budget ``` This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration. @@ -355,10 +358,10 @@ AgentV ships three flags for picking up a partial run. They differ only in **whi | Flag | What it skips | What it re-runs | Use when | |------|---------------|-----------------|----------| | `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish | -| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing | +| `--rerun-failed ` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing | | `--retry-errors ` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to | -`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output ` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`). +`--resume` appends to the existing `index.jsonl` in `--output `; when omitted it defaults to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. `--rerun-failed ` reads a specific canonical run bundle from `.agentv/results/` and, when `--output` is omitted, appends replacement rows to that same bundle. You can also pass a run workspace path or `index.jsonl` path instead of a bare run ID. `--retry-errors` takes the prior run's path directly and re-runs only execution errors or missing cases. ```bash # Resume the last run — no args needed; AgentV finds it from .agentv/cache.json @@ -367,8 +370,8 @@ agentv eval evals/my-eval.yaml --resume # Or target a specific run dir explicitly agentv eval evals/my-eval.yaml --output .agentv/results/ --resume -# Re-run errors AND failed cases against the last run dir -agentv eval evals/my-eval.yaml --rerun-failed +# Re-run errors AND failed cases from a specific canonical run +agentv eval evals/my-eval.yaml --rerun-failed # Re-run only execution errors from any prior run by path agentv eval evals/my-eval.yaml --retry-errors .agentv/results//index.jsonl diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index f907aeb81..06e9694d4 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -132,6 +132,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10. +Before a slot is reused for another case, AgentV resets it to the slot baseline. A pooled workspace is a performance cache, not shared mutable state between cases. + **Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above. ## Drift detection diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 0e91ad79b..a393bb06e 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1235,13 +1235,17 @@ export async function runEvaluation( // shared owner prepare without inheriting a child suite's workspace. const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup); const testPoolSlot = - usesSharedWorkspace && availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined; + usesSharedWorkspace && availablePoolSlots.length > 0 + ? availablePoolSlots.pop() + : usesSharedWorkspace + ? poolSlot + : undefined; const testWorkspacePath = usesSharedWorkspace ? (testPoolSlot?.path ?? sharedWorkspacePath) : undefined; const testBaselineCommit = usesSharedWorkspace ? testPoolSlot - ? poolSlotBaselines.get(testPoolSlot.path) + ? (poolSlotBaselines.get(testPoolSlot.path) ?? sharedBaselineCommit) : sharedBaselineCommit : undefined; const testExtensionState = usesSharedWorkspace @@ -1355,9 +1359,30 @@ export async function runEvaluation( } throw error; } finally { - // Return pool slot for reuse by next test + // Return pool slot for reuse by next test only after resetting it to + // the per-slot baseline. Pooling is a local performance optimization, + // not shared state between eval cases. if (testPoolSlot) { - availablePoolSlots.push(testPoolSlot); + const shouldReturnPoolSlot = testPoolSlot !== poolSlot; + const resetMode = workspaceClean === 'full' ? 'strict' : 'fast'; + let resetSucceeded = true; + try { + if (repoManager && suiteWorkspace?.repos?.length) { + await repoManager.reset(suiteWorkspace.repos, testPoolSlot.path, resetMode); + } + await resetWorkspaceRoot(testPoolSlot.path, resetMode, testBaselineCommit); + } catch (resetError) { + resetSucceeded = false; + if (verbose) { + const message = resetError instanceof Error ? resetError.message : String(resetError); + console.warn( + `Warning: failed to reset workspace pool slot ${testPoolSlot.index}; leaving it out of reuse: ${message}`, + ); + } + } + if (resetSucceeded && shouldReturnPoolSlot) { + availablePoolSlots.push(testPoolSlot); + } } } } diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 311a566c1..e687a0cb5 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -2052,6 +2052,13 @@ function indexRecordReplacementKey(record: unknown): string | undefined { return projectionIdentityRecordKey(record) ?? indexRecordKey(record); } +function indexRecordReplacementKeys(record: unknown): readonly string[] { + const keys = [projectionIdentityRecordKey(record), indexRecordKey(record)].filter( + (key): key is string => typeof key === 'string' && key.length > 0, + ); + return Array.from(new Set(keys)); +} + function projectionIdentityRecordKey(record: unknown): string | undefined { if (!isRecord(record) || !isRecord(record.projection_identity)) { return undefined; @@ -2141,10 +2148,9 @@ async function rewriteExistingIndexRecords( } const replacementsByKey = new Map( - replacements.flatMap((record) => { - const key = indexRecordReplacementKey(record); - return key ? [[key, record] as const] : []; - }), + replacements.flatMap((record) => + indexRecordReplacementKeys(record).map((key) => [key, record] as const), + ), ); const seen = new Set(); const records: unknown[] = []; @@ -2158,7 +2164,9 @@ async function rewriteExistingIndexRecords( const replacement = key ? replacementsByKey.get(key) : undefined; if (key && replacement) { records.push(replacement); - seen.add(key); + for (const replacementKey of indexRecordReplacementKeys(replacement)) { + seen.add(replacementKey); + } } else { records.push(parsed); } @@ -2166,8 +2174,8 @@ async function rewriteExistingIndexRecords( } for (const replacement of replacements) { - const key = indexRecordReplacementKey(replacement); - if (!key || !seen.has(key)) { + const keys = indexRecordReplacementKeys(replacement); + if (keys.length === 0 || keys.every((key) => !seen.has(key))) { records.push(replacement); } } diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index daccce706..1473a502a 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1,10 +1,12 @@ import { afterEach, describe, expect, it, mock, spyOn } from 'bun:test'; +import { execSync } from 'node:child_process'; import { existsSync, mkdirSync, mkdtempSync, readFileSync, readdirSync, + rmSync, writeFileSync, } from 'node:fs'; import { tmpdir } from 'node:os'; @@ -151,6 +153,31 @@ const baseTarget: ResolvedTarget = { config: { response: '{}' }, }; +function cleanGitEnv(): Record { + const env: Record = {}; + for (const [key, value] of Object.entries(process.env)) { + if (value !== undefined && !(key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND')) { + env[key] = value; + } + } + return env; +} + +function createTestRepo(dir: string, files: Record): string { + mkdirSync(dir, { recursive: true }); + const opts = { cwd: dir, stdio: 'ignore' as const, env: cleanGitEnv() }; + execSync('git init', opts); + execSync('git config user.email "test@test.com"', opts); + execSync('git config user.name "Test"', opts); + for (const [name, content] of Object.entries(files)) { + const filePath = path.join(dir, name); + mkdirSync(path.dirname(filePath), { recursive: true }); + writeFileSync(filePath, content); + } + execSync('git add -A && git commit -m "initial"', opts); + return execSync('git rev-parse HEAD', { cwd: dir, env: cleanGitEnv() }).toString().trim(); +} + const evaluatorRegistry = { 'llm-grader': { kind: 'llm-grader', @@ -638,6 +665,114 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.score).toBeGreaterThan(0); }); + it('does not retry completed quality failures', async () => { + const provider = new SequenceProvider('mock', { + responses: [ + { + output: [{ role: 'assistant', content: 'Incomplete response.' }], + }, + ], + }); + const failingEvaluators = { + 'llm-grader': { + kind: 'llm-grader', + async evaluate() { + return { + score: 0.1, + verdict: 'fail' as const, + assertions: [{ text: 'quality miss', passed: false }], + expectedAspectCount: 1, + }; + }, + }, + }; + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: failingEvaluators, + maxRetries: 3, + }); + + expect(provider.callIndex).toBe(1); + expect(result.executionStatus).toBe('quality_failure'); + expect(result.retryIndex).toBe(0); + }); + + it('resets a pooled workspace slot before reusing it for the next case', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-')); + const previousAgentvHome = process.env.AGENTV_HOME; + const previousAgentvDataDir = process.env.AGENTV_DATA_DIR; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data'); + + try { + const sourceRepo = path.join(tempDir, 'source-repo'); + const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' }); + const workspace = { + repos: [ + { + path: './repo-a', + repo: `file://${sourceRepo}`, + commit: cleanCommit, + }, + ], + }; + const seenStaleBeforeSecond: boolean[] = []; + let callCount = 0; + const provider: Provider = { + id: 'mock:pooled-reset', + kind: 'mock' as const, + targetName: 'pooled-reset', + async invoke(request: ProviderRequest): Promise { + callCount += 1; + if (!request.cwd) { + throw new Error('missing cwd'); + } + const repoDir = path.join(request.cwd, 'repo-a'); + if (callCount === 1) { + writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n'); + writeFileSync(path.join(repoDir, 'stale.txt'), 'stale\n'); + } else { + seenStaleBeforeSecond.push(existsSync(path.join(repoDir, 'stale.txt'))); + expect(readFileSync(path.join(repoDir, 'tracked.txt'), 'utf8')).toBe('clean\n'); + } + return { output: [{ role: 'assistant', content: `response ${callCount}` }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: path.join(tempDir, 'eval.yaml'), + repoRoot: tempDir, + target: { ...baseTarget, name: 'pooled-reset' }, + providerFactory: () => provider, + evaluators: evaluatorRegistry, + workspaceMode: 'pooled', + maxConcurrency: 1, + evalCases: [ + { ...baseTestCase, id: 'case-1', workspace }, + { ...baseTestCase, id: 'case-2', workspace }, + ], + }); + + expect(results).toHaveLength(2); + expect(seenStaleBeforeSecond).toEqual([false]); + } finally { + if (previousAgentvHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousAgentvHome; + } + if (previousAgentvDataDir === undefined) { + process.env.AGENTV_DATA_DIR = undefined; + } else { + process.env.AGENTV_DATA_DIR = previousAgentvDataDir; + } + rmSync(tempDir, { recursive: true, force: true }); + } + }, 30_000); + it('applies exponential backoff between retries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Transient failure')], From 71c15301bcee745cee0a07b816d7e5802ec5a439 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 15:09:16 +0200 Subject: [PATCH 2/3] fix(eval): constrain rerun-failed identities --- apps/cli/src/commands/eval/run-eval.ts | 191 ++++++++++++++++-- apps/cli/test/eval.integration.test.ts | 164 +++++++++++++-- apps/cli/test/fixtures/mock-run-evaluation.ts | 1 + packages/core/src/evaluation/orchestrator.ts | 37 +++- packages/core/src/evaluation/types.ts | 4 + 5 files changed, 363 insertions(+), 34 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 31ac0cab4..e8c831c87 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -105,6 +105,137 @@ function shouldSkipExistingResultForResume( return result.executionStatus !== 'execution_error'; } +interface ResumeIdentityEntry { + readonly kind: 'precise' | 'legacy'; + readonly key: string; + readonly result: EvaluationResult; +} + +interface ResumeIdentityMatcher { + readonly preciseKeys: Set; + readonly legacyKeys: Set; +} + +function hasNonEmptyString(value: unknown): value is string { + return typeof value === 'string' && value.trim().length > 0; +} + +function objectRecord(value: unknown): Record | undefined { + return typeof value === 'object' && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function resultProjectionDimensions(result: EvaluationResult): Record | undefined { + const projectionIdentity = objectRecord( + (result as unknown as Record).projectionIdentity, + ); + return objectRecord(projectionIdentity?.dimensions); +} + +function hasCanonicalResultIdentity(result: EvaluationResult): boolean { + const source = result.source; + const dimensions = resultProjectionDimensions(result); + const resultRecord = result as unknown as Record; + return ( + hasNonEmptyString(dimensions?.evalPath) || + hasNonEmptyString(dimensions?.suite) || + hasNonEmptyString(dimensions?.promptId) || + hasNonEmptyString(resultRecord.evalPath) || + hasNonEmptyString(source?.evalFileRepoPath) || + hasNonEmptyString(source?.evalFilePath) || + hasNonEmptyString(source?.evalFileAbsolutePath) || + hasNonEmptyString(result.suite) || + hasNonEmptyString(result.prompt?.id) + ); +} + +function resultResumeIdentityEntry(result: EvaluationResult): ResumeIdentityEntry { + if (hasCanonicalResultIdentity(result)) { + return { + kind: 'precise', + key: buildEvaluationResultTargetKey(result), + result, + }; + } + return { + kind: 'legacy', + key: buildTestTargetKey(result.testId, result.target, result.variant), + result, + }; +} + +function latestResumeIdentityEntries( + results: readonly EvaluationResult[], +): readonly ResumeIdentityEntry[] { + const latestByIdentity = new Map(); + for (const result of results) { + const entry = resultResumeIdentityEntry(result); + latestByIdentity.set(`${entry.kind}:${entry.key}`, entry); + } + return Array.from(latestByIdentity.values()); +} + +function createResumeIdentityMatcher(): ResumeIdentityMatcher { + return { preciseKeys: new Set(), legacyKeys: new Set() }; +} + +function addResumeIdentityEntry(matcher: ResumeIdentityMatcher, entry: ResumeIdentityEntry): void { + if (entry.kind === 'legacy') { + matcher.legacyKeys.add(entry.key); + return; + } + matcher.preciseKeys.add(entry.key); +} + +function uniqueStrings(values: readonly (string | undefined)[]): string[] { + return Array.from(new Set(values.filter(hasNonEmptyString))); +} + +function buildPlannedResumeIdentityKeys( + test: EvalTest, + target: string, + variant: string | undefined, +): readonly string[] { + const keys = new Set([buildEvalTestTargetKey(test, target, variant)]); + const evalPaths = uniqueStrings([ + test.source?.evalFileRepoPath, + test.source?.evalFilePath, + test.source?.evalFileAbsolutePath, + ]); + const suites = Array.from(new Set([test.suite ?? null, null])); + + for (const evalPath of evalPaths) { + for (const suite of suites) { + keys.add( + JSON.stringify({ + eval_path: evalPath, + suite, + test_id: test.id ?? 'unknown', + prompt_id: test.prompt?.id ?? null, + target: target ?? 'unknown', + variant: variant ?? null, + }), + ); + } + } + + return Array.from(keys); +} + +function resumeIdentityMatches( + matcher: ResumeIdentityMatcher, + test: EvalTest, + target: string, + variant: string | undefined, +): boolean { + return ( + buildPlannedResumeIdentityKeys(test, target, variant).some((key) => + matcher.preciseKeys.has(key), + ) || matcher.legacyKeys.has(buildTestTargetKey(test.id, target, variant)) + ); +} + interface RunEvalCommandInput { readonly testFiles: readonly string[]; readonly rawOptions: Record; @@ -1869,9 +2000,10 @@ export async function runEvalCommand( } } - // --resume / --rerun-failed: skip already-completed tests and append to existing output. + // --resume skips completed rows; --rerun-failed includes only latest failed/error rows. // IMPORTANT: JSONL must be loaded before the output writer is created (same file). - let resumeSkipKeys: Set | undefined; + let resumeSkipKeys: ResumeIdentityMatcher | undefined; + let rerunIncludeKeys: ResumeIdentityMatcher | undefined; let isResumeAppend = false; if (options.resume && !options.retryErrors) { const sourceRunDir = options.rerunFailedSource @@ -1888,13 +2020,15 @@ export async function runEvalCommand( const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir); if (resumeIndexPaths.length > 0) { const existingResults = await readExistingResultsFromRunDir(sourceRunDir); - resumeSkipKeys = new Set(); + resumeSkipKeys = createResumeIdentityMatcher(); + rerunIncludeKeys = options.rerunFailed ? createResumeIdentityMatcher() : undefined; let completedResultCount = 0; - for (const r of existingResults) { - if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + for (const entry of latestResumeIdentityEntries(existingResults)) { + if (shouldSkipExistingResultForResume(entry.result, options.rerunFailed)) { completedResultCount += 1; - resumeSkipKeys.add(buildEvaluationResultTargetKey(r)); - resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant)); + addResumeIdentityEntry(resumeSkipKeys, entry); + } else if (rerunIncludeKeys) { + addResumeIdentityEntry(rerunIncludeKeys, entry); } } isResumeAppend = @@ -1904,6 +2038,9 @@ export async function runEvalCommand( console.log( `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`, ); + } else if (options.rerunFailed) { + rerunIncludeKeys = createResumeIdentityMatcher(); + console.log('Rerun-failed: no existing bundle run manifest found. Nothing to rerun.'); } else { // No existing bundle manifest — behave like a normal run. console.log('Resume: no existing bundle run manifest found, starting fresh run.'); @@ -2158,9 +2295,13 @@ export async function runEvalCommand( for (const { selection } of meta.selections) { const target = selection.targetName; const variant = targetVariantForSelection(selection); - const key = buildEvalTestTargetKey(test, target, variant); - const fallbackKey = buildTestTargetKey(test.id, target, variant); - if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) { + if (rerunIncludeKeys) { + if (resumeIdentityMatches(rerunIncludeKeys, test, target, variant)) { + totalEvalCount++; + } else { + resumeSkippedCount++; + } + } else if (resumeSkipKeys && resumeIdentityMatches(resumeSkipKeys, test, target, variant)) { resumeSkippedCount++; } else { totalEvalCount++; @@ -2175,6 +2316,10 @@ export async function runEvalCommand( console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); return; } + if (rerunIncludeKeys) { + console.log('Nothing to rerun — no failed or errored test(s) matched the current suite.'); + return; + } // When using --resume, all tests being completed means nothing to resume if (resumeSkipKeys && resumeSkippedCount > 0) { console.log(`Nothing to resume — all ${resumeSkippedCount} test(s) already completed.`); @@ -2406,16 +2551,22 @@ export async function runEvalCommand( const targetName = selection.targetName; const applicableTestCases = targetPrep.testCases; - // --resume / --rerun-failed: skip tests that are already completed - const filteredTestCases = resumeSkipKeys - ? applicableTestCases.filter((test) => { - const variant = targetVariantForSelection(selection); - return ( - !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) && - !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant)) - ); - }) - : applicableTestCases; + // --resume skips completed tests; --rerun-failed only includes prior failed/error tests. + const filteredTestCases = rerunIncludeKeys + ? applicableTestCases.filter((test) => + resumeIdentityMatches( + rerunIncludeKeys, + test, + targetName, + targetVariantForSelection(selection), + ), + ) + : resumeSkipKeys + ? applicableTestCases.filter((test) => { + const variant = targetVariantForSelection(selection); + return !resumeIdentityMatches(resumeSkipKeys, test, targetName, variant); + }) + : applicableTestCases; if (filteredTestCases.length === 0) { return []; diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index a88638f06..31dbce6d7 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -933,7 +933,7 @@ describe('agentv eval CLI', () => { } }, 30_000); - it('reruns failed rows from a canonical run id', async () => { + it('reruns only latest failed rows from a canonical run id', async () => { const fixture = await createFixture(); try { const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run'); @@ -948,18 +948,61 @@ describe('agentv eval CLI', () => { expect(first.exitCode).toBe(1); const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + const alphaRow = priorRows.find((row) => row.test_id === 'case-alpha'); + const betaRow = priorRows.find((row) => row.test_id === 'case-beta'); + if (!alphaRow || !betaRow) { + throw new Error('Expected prior rows for case-alpha and case-beta'); + } await writeFile( priorIndexPath, - `${priorRows - .map((row) => - JSON.stringify({ - ...row, - execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok', - }), - ) + `${[ + ...priorRows.map((row) => ({ + ...row, + execution_status: row.test_id === 'case-alpha' ? 'ok' : 'quality_failure', + })), + { ...alphaRow, execution_status: 'quality_failure' }, + { ...betaRow, execution_status: 'ok' }, + ] + .map((row) => JSON.stringify(row)) .join('\n')}\n`, 'utf8', ); + await writeFile( + fixture.testFilePath, + `description: CLI integration test +target: file-target + +tests: + - id: case-alpha + criteria: System responds with alpha + input: + - role: user + content: | + Please respond with alpha + expected_output: + - role: assistant + content: "Alpha" + - id: case-beta + criteria: System responds with beta + input: + - role: user + content: | + Please respond with beta + expected_output: + - role: assistant + content: "Beta" + - id: case-gamma + criteria: System responds with gamma + input: + - role: user + content: | + Please respond with gamma + expected_output: + - role: assistant + content: "Gamma" +`, + 'utf8', + ); const second = await runCli(fixture, [ 'eval', @@ -967,10 +1010,10 @@ describe('agentv eval CLI', () => { '--rerun-failed', 'prior-run', '--threshold', - '0.8', + '0.5', ]); - expect(second.exitCode).toBe(1); - expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + expect(second.exitCode).toBe(0); + expect(second.stdout).toContain('Rerun-failed: found 4 existing result(s), skipping 1'); const diagnostics = await readDiagnostics(fixture); const calls = diagnostics.calls as Array>; @@ -979,13 +1022,110 @@ describe('agentv eval CLI', () => { }); const rows = await readJsonLines(priorIndexPath); - expect(rows).toHaveLength(3); + expect(rows).toHaveLength(5); expect((rows.at(-1) as Record).test_id).toBe('case-alpha'); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } }, 30_000); + it('does not use coarse fallback keys for precise rerun-failed identities', async () => { + const fixture = await createFixture(); + try { + const firstEvalPath = path.join(fixture.suiteDir, 'collision-a.eval.yaml'); + const secondEvalPath = path.join(fixture.suiteDir, 'collision-b.eval.yaml'); + const evalContent = (name: string) => `description: ${name} +target: file-target + +tests: + - id: shared-case + criteria: System responds + input: + - role: user + content: | + Please respond for ${name} + expected_output: + - role: assistant + content: "Shared" +`; + await writeFile(firstEvalPath, evalContent('collision a'), 'utf8'); + await writeFile(secondEvalPath, evalContent('collision b'), 'utf8'); + + const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-collision'); + const first = await runCli(fixture, [ + 'eval', + firstEvalPath, + '--output', + priorRunDir, + '--threshold', + '0.8', + ]); + expect(first.exitCode).toBe(0); + + const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorRows = (await readJsonLines(priorIndexPath)) as Array>; + expect(priorRows).toHaveLength(1); + const baseRow = priorRows[0]; + if (!baseRow) { + throw new Error('Expected one prior collision row'); + } + const baseProjection = baseRow.projection_identity as Record; + const baseDimensions = baseProjection.dimensions as Record; + const secondProjection = { + ...baseProjection, + dimensions: { + ...baseDimensions, + eval_path: secondEvalPath, + }, + }; + await writeFile( + priorIndexPath, + `${[ + { ...baseRow, execution_status: 'ok' }, + { + ...baseRow, + projection_identity: secondProjection, + execution_status: 'quality_failure', + }, + ] + .map((row) => JSON.stringify(row)) + .join('\n')}\n`, + 'utf8', + ); + + const second = await runCli(fixture, [ + 'eval', + firstEvalPath, + secondEvalPath, + '--rerun-failed', + 'prior-collision', + '--threshold', + '0.8', + ]); + expect(second.exitCode).toBe(0); + expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1'); + + const diagnostics = await readDiagnostics(fixture); + const calls = diagnostics.calls as Array>; + const rerunCalls = calls.slice(1); + expect(rerunCalls).toHaveLength(1); + const rerunCall = rerunCalls[0]; + if (!rerunCall) { + throw new Error('Expected one rerun diagnostics call'); + } + expect(path.basename(rerunCall.testFilePath as string)).toBe('collision-b.eval.yaml'); + expect(rerunCall).toMatchObject({ + evalCaseIds: ['shared-case'], + }); + + const rows = await readJsonLines(priorIndexPath); + expect(rows).toHaveLength(3); + expect((rows.at(-1) as Record).test_id).toBe('shared-case'); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('does not multiply max_concurrency across target matrix selections', async () => { const fixture = await createFixture(); try { diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 4103e4eaf..c97edf54b 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -173,6 +173,7 @@ async function maybeWriteDiagnostics( } const payload = { + testFilePath: options.testFilePath, target: options.target?.name, targetKind: options.target?.kind, targetModel: diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index a393bb06e..59635c657 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -457,6 +457,8 @@ export interface RunEvalCaseOptions { readonly evalFilePath?: string; /** Repo root used to serialize replay fixture eval_path as a stable relative path. */ readonly repoRoot?: string; + /** Zero-based sample index produced by repeat.count. */ + readonly sampleIndex?: number; } export interface ProgressEvent { @@ -1741,6 +1743,7 @@ async function runBatchEvaluation(options: { promptInputs, nowFn, attempt: 0, + sampleIndex: 0, graderProvider: await resolveGraderProvider(target), agentTimeoutMs, output, @@ -1782,6 +1785,7 @@ async function runBatchEvaluation(options: { 'evaluator', 'evaluator_error', verbose, + { sampleIndex: 0, retryIndex: 0 }, ); results.push(errorResult); if (onResult) { @@ -1892,6 +1896,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Date; readonly attempt: number; + readonly sampleIndex: number; readonly graderProvider?: Provider; readonly agentTimeoutMs?: number; readonly output?: readonly Message[]; @@ -2596,6 +2612,7 @@ async function evaluateCandidate(options: { promptInputs, nowFn, attempt, + sampleIndex, graderProvider, agentTimeoutMs, output, @@ -2699,7 +2716,10 @@ async function evaluateCandidate(options: { : undefined; return { timestamp: completedAt.toISOString(), - testId: evalCase.id, + testId: authoredResultTestId(evalCase), + prompt: evalCase.prompt, + sampleIndex, + retryIndex: attempt, source: evalCase.source, suite: evalCase.suite, category: evalCase.category, @@ -3184,6 +3204,7 @@ async function runConversationMode(options: { readonly availableTargets?: readonly string[]; readonly evalFilePath?: string; readonly metadata?: JsonObject; + readonly sampleIndex?: number; }): Promise { const { evalCase, @@ -3205,6 +3226,7 @@ async function runConversationMode(options: { availableTargets, evalFilePath, metadata, + sampleIndex = 0, } = options; // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate) @@ -3334,6 +3356,7 @@ async function runConversationMode(options: { }, nowFn, attempt: 0, + sampleIndex, graderProvider, agentTimeoutMs, output: response.output, @@ -3396,6 +3419,7 @@ async function runConversationMode(options: { }, nowFn, attempt: 0, + sampleIndex, graderProvider, agentTimeoutMs, verbose, @@ -3447,7 +3471,10 @@ async function runConversationMode(options: { return { timestamp: nowFn().toISOString(), - testId: evalCase.id, + testId: authoredResultTestId(evalCase), + prompt: evalCase.prompt, + sampleIndex, + retryIndex: 0, suite: evalCase.suite, category: evalCase.category, score: finalScore, @@ -3633,6 +3660,10 @@ function buildErrorResult( failureStage: FailureStage, failureReasonCode: string, verbose?: boolean, + identity?: { + readonly sampleIndex?: number; + readonly retryIndex?: number; + }, ): EvaluationResult { const message = extractErrorMessage(error); @@ -3681,6 +3712,8 @@ function buildErrorResult( timestamp: timestamp.toISOString(), testId: authoredResultTestId(evalCase), prompt: evalCase.prompt, + sampleIndex: identity?.sampleIndex, + retryIndex: identity?.retryIndex, suite: evalCase.suite, category: evalCase.category, conversationId: evalCase.conversation_id, diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 6be8c46f1..97de93fdf 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1180,6 +1180,10 @@ export interface TrialsConfig { */ export interface TrialResult { readonly attempt: number; + /** Zero-based sample index produced from repeat.count. */ + readonly sampleIndex?: number; + /** Provider retry index for the attempt that produced this trial result. */ + readonly retryIndex?: number; readonly score: number; readonly verdict: EvaluationVerdict; readonly scores?: readonly GraderResult[]; From a25016e466e668cb5ac886abab815d7216dc862a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 18:18:20 +0200 Subject: [PATCH 3/3] fix(eval): quarantine failed workspace pool slots --- packages/core/src/evaluation/orchestrator.ts | 123 +++++++++++++-- .../core/test/evaluation/orchestrator.test.ts | 145 ++++++++++++++++++ 2 files changed, 258 insertions(+), 10 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 59635c657..d24066cfb 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -79,6 +79,7 @@ import type { TrialsConfig, } from './types.js'; import { cleanupEvalWorkspaces, cleanupWorkspace } from './workspace/manager.js'; +import type { PoolSlot } from './workspace/pool-manager.js'; import type { RepoManager } from './workspace/repo-manager.js'; import { type ScriptExecutionContext, @@ -996,6 +997,11 @@ export async function runEvaluation( let nextWorkerId = 1; const workerIdByEvalId = new Map(); let beforeAllOutputAttached = false; + const unavailablePoolSlotPaths = new Set(); + const poolSlotWaiters: Array<(slot: PoolSlot | undefined) => void> = []; + if (poolSlot && availablePoolSlots.length === 0) { + availablePoolSlots.push(poolSlot); + } // Suite-level budget tracking let cumulativeBudgetCost = 0; @@ -1059,6 +1065,97 @@ export async function runEvaluation( return { ok: allPassed, depResults }; } + function acquirePoolSlot(): Promise { + while (availablePoolSlots.length > 0) { + const candidate = availablePoolSlots.pop(); + if (candidate && !unavailablePoolSlotPaths.has(candidate.path)) { + return Promise.resolve(candidate); + } + } + + if (unavailablePoolSlotPaths.size >= poolSlots.length) { + return Promise.resolve(undefined); + } + + return new Promise((resolve) => { + poolSlotWaiters.push(resolve); + }); + } + + function returnPoolSlot(slot: PoolSlot): void { + if (unavailablePoolSlotPaths.has(slot.path)) { + return; + } + const waiter = poolSlotWaiters.shift(); + if (waiter) { + waiter(slot); + } else { + availablePoolSlots.push(slot); + } + } + + function markPoolSlotUnavailable(slot: PoolSlot): void { + unavailablePoolSlotPaths.add(slot.path); + if (unavailablePoolSlotPaths.size >= poolSlots.length) { + while (poolSlotWaiters.length > 0) { + poolSlotWaiters.shift()?.(undefined); + } + } + } + + async function emitSetupErrorResult( + evalCase: EvalTest, + workerId: number, + message: string, + failureReasonCode: string, + ): Promise { + if (failOnError === true) { + failOnErrorTriggered = true; + } + const output = `Error occurred: ${message}`; + const result: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: authoredResultTestId(evalCase), + suite: evalCase.suite, + category: evalCase.category, + prompt: evalCase.prompt, + score: 0, + assertions: [{ text: `Error: ${message}`, passed: false }], + output, + trace: buildTraceFromMessages({ + input: evalCase.input as readonly Message[], + output: [{ role: 'assistant' as const, content: output }], + finalOutput: output, + target: target.name, + testId: authoredResultTestId(evalCase), + conversationId: evalCase.conversation_id, + error: message, + }), + target: target.name, + error: message, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode, + executionError: { message, stage: 'setup' }, + }; + + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: result.error, + score: result.score, + executionStatus: result.executionStatus, + }); + } + if (onResult) { + await onResult(result); + } + return result; + } + function extractEvaluationCostUsd(result: EvaluationResult): number | undefined { if (result.trials && result.trials.length > 0) { const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0); @@ -1236,14 +1333,20 @@ export async function runEvaluation( // Per-case isolated cases and raw/no-workspace cases outside the selected // shared owner prepare without inheriting a child suite's workspace. const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup); - const testPoolSlot = - usesSharedWorkspace && availablePoolSlots.length > 0 - ? availablePoolSlots.pop() - : usesSharedWorkspace - ? poolSlot - : undefined; + const pooledWorkspaceRequired = usesSharedWorkspace && poolSlots.length > 0; + const testPoolSlot = pooledWorkspaceRequired ? await acquirePoolSlot() : undefined; + if (pooledWorkspaceRequired && !testPoolSlot) { + return emitSetupErrorResult( + evalCase, + workerId, + 'No clean pooled workspace slot is available; prior slot reset failed.', + 'workspace_pool_unavailable', + ); + } const testWorkspacePath = usesSharedWorkspace - ? (testPoolSlot?.path ?? sharedWorkspacePath) + ? pooledWorkspaceRequired + ? testPoolSlot?.path + : sharedWorkspacePath : undefined; const testBaselineCommit = usesSharedWorkspace ? testPoolSlot @@ -1365,7 +1468,6 @@ export async function runEvaluation( // the per-slot baseline. Pooling is a local performance optimization, // not shared state between eval cases. if (testPoolSlot) { - const shouldReturnPoolSlot = testPoolSlot !== poolSlot; const resetMode = workspaceClean === 'full' ? 'strict' : 'fast'; let resetSucceeded = true; try { @@ -1375,6 +1477,7 @@ export async function runEvaluation( await resetWorkspaceRoot(testPoolSlot.path, resetMode, testBaselineCommit); } catch (resetError) { resetSucceeded = false; + markPoolSlotUnavailable(testPoolSlot); if (verbose) { const message = resetError instanceof Error ? resetError.message : String(resetError); console.warn( @@ -1382,8 +1485,8 @@ export async function runEvaluation( ); } } - if (resetSucceeded && shouldReturnPoolSlot) { - availablePoolSlots.push(testPoolSlot); + if (resetSucceeded) { + returnPoolSlot(testPoolSlot); } } } diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 1473a502a..27383c179 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -773,6 +773,151 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, } }, 30_000); + it('does not reuse a single pooled workspace slot after reset failure', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-reset-fail-')); + const previousAgentvHome = process.env.AGENTV_HOME; + const previousAgentvDataDir = process.env.AGENTV_DATA_DIR; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data'); + + try { + const sourceRepo = path.join(tempDir, 'source-repo'); + const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' }); + const workspace = { + repos: [ + { + path: './repo-a', + repo: `file://${sourceRepo}`, + commit: cleanCommit, + }, + ], + }; + let providerCalls = 0; + const provider: Provider = { + id: 'mock:single-slot-reset-failure', + kind: 'mock' as const, + targetName: 'single-slot-reset-failure', + async invoke(request: ProviderRequest): Promise { + providerCalls += 1; + if (!request.cwd) { + throw new Error('missing cwd'); + } + const repoDir = path.join(request.cwd, 'repo-a'); + writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n'); + rmSync(path.join(repoDir, '.git'), { recursive: true, force: true }); + return { output: [{ role: 'assistant', content: `response ${providerCalls}` }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: path.join(tempDir, 'eval.yaml'), + repoRoot: tempDir, + target: { ...baseTarget, name: 'single-slot-reset-failure' }, + providerFactory: () => provider, + evaluators: evaluatorRegistry, + workspaceMode: 'pooled', + maxConcurrency: 1, + evalCases: [ + { ...baseTestCase, id: 'case-1', workspace }, + { ...baseTestCase, id: 'case-2', workspace }, + ], + }); + + expect(providerCalls).toBe(1); + expect(results).toHaveLength(2); + expect(results[0].executionStatus).toBe('ok'); + expect(results[1].executionStatus).toBe('execution_error'); + expect(results[1].failureReasonCode).toBe('workspace_pool_unavailable'); + expect(results[1].error).toContain('No clean pooled workspace slot is available'); + } finally { + if (previousAgentvHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousAgentvHome; + } + if (previousAgentvDataDir === undefined) { + process.env.AGENTV_DATA_DIR = undefined; + } else { + process.env.AGENTV_DATA_DIR = previousAgentvDataDir; + } + rmSync(tempDir, { recursive: true, force: true }); + } + }, 30_000); + + it('fails later pooled workspace cases after all multi-slot resets fail', async () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-exhausted-')); + const previousAgentvHome = process.env.AGENTV_HOME; + const previousAgentvDataDir = process.env.AGENTV_DATA_DIR; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data'); + + try { + const sourceRepo = path.join(tempDir, 'source-repo'); + const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' }); + const workspace = { + repos: [ + { + path: './repo-a', + repo: `file://${sourceRepo}`, + commit: cleanCommit, + }, + ], + }; + let providerCalls = 0; + const provider: Provider = { + id: 'mock:multi-slot-reset-failure', + kind: 'mock' as const, + targetName: 'multi-slot-reset-failure', + async invoke(request: ProviderRequest): Promise { + providerCalls += 1; + if (!request.cwd) { + throw new Error('missing cwd'); + } + const repoDir = path.join(request.cwd, 'repo-a'); + writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n'); + rmSync(path.join(repoDir, '.git'), { recursive: true, force: true }); + return { output: [{ role: 'assistant', content: `response ${providerCalls}` }] }; + }, + }; + + const results = await runEvaluation({ + testFilePath: path.join(tempDir, 'eval.yaml'), + repoRoot: tempDir, + target: { ...baseTarget, name: 'multi-slot-reset-failure' }, + providerFactory: () => provider, + evaluators: evaluatorRegistry, + workspaceMode: 'pooled', + maxConcurrency: 2, + configPoolMaxSlots: 2, + evalCases: [ + { ...baseTestCase, id: 'case-1', workspace }, + { ...baseTestCase, id: 'case-2', workspace }, + { ...baseTestCase, id: 'case-3', workspace }, + ], + }); + + expect(providerCalls).toBe(2); + expect(results).toHaveLength(3); + expect(results[0].executionStatus).toBe('ok'); + expect(results[1].executionStatus).toBe('ok'); + expect(results[2].executionStatus).toBe('execution_error'); + expect(results[2].failureReasonCode).toBe('workspace_pool_unavailable'); + expect(results[2].error).toContain('No clean pooled workspace slot is available'); + } finally { + if (previousAgentvHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousAgentvHome; + } + if (previousAgentvDataDir === undefined) { + process.env.AGENTV_DATA_DIR = undefined; + } else { + process.env.AGENTV_DATA_DIR = previousAgentvDataDir; + } + rmSync(tempDir, { recursive: true, force: true }); + } + }, 30_000); + it('applies exponential backoff between retries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Transient failure')],