From 6589dc37163595af5e371565442da81a62fb1df4 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 2 Jul 2026 14:21:45 +0200
Subject: [PATCH 1/3] feat(eval): add rerun-failed runner pooling

---
 apps/cli/src/commands/eval/commands/run.ts    |   5 +-
 apps/cli/src/commands/eval/run-cache.ts       |   8 +-
 apps/cli/src/commands/eval/run-eval.ts        | 313 ++++++++++--------
 apps/cli/src/commands/results/eval-runner.ts  |   5 +-
 apps/cli/test/commands/results/serve.test.ts  |   4 +-
 apps/cli/test/eval.integration.test.ts        |  95 ++++++
 apps/cli/test/fixtures/mock-run-evaluation.ts |  27 +-
 .../docs/docs/evaluation/running-evals.mdx    |  13 +-
 .../docs/docs/guides/workspace-pool.mdx       |   2 +
 packages/core/src/evaluation/orchestrator.ts  |  33 +-
 packages/core/src/evaluation/run-artifacts.ts |  22 +-
 .../core/test/evaluation/orchestrator.test.ts | 135 ++++++++
 12 files changed, 497 insertions(+), 165 deletions(-)
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index a078edc06..09b0f80ef 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -168,10 +168,11 @@ export const evalRunCommand = command({
       description:
         'Resume an interrupted run: skip already-completed tests and append new results to --output dir',
     }),
-    rerunFailed: flag({
+    rerunFailed: option({
+      type: optional(string),
       long: 'rerun-failed',
       description:
-        'Rerun failed/errored tests while keeping passing results. Implies --resume semantics',
+        'Run ID, run workspace, or index.jsonl to rerun failed/errored tests while keeping passing results',
     }),
     strict: flag({
       long: 'strict',
diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts
index 342fa8429..d2e8c7b85 100644
--- a/apps/cli/src/commands/eval/run-cache.ts
+++ b/apps/cli/src/commands/eval/run-cache.ts
@@ -54,10 +54,10 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
 /**
  * Resolve the cached last-run directory for a cwd, if it still exists on disk.
  * Returns undefined when there is no cache, the cache lacks a `lastRunDir`,
- * or the directory has since been deleted. Used by `--resume` / `--rerun-failed`
- * to default `--output` to the most recent run when no explicit dir is given,
- * matching the convention used by promptfoo (`--resume [evalId]`) and
- * OpenCompass (`-r [timestamp]`).
+ * or the directory has since been deleted. Used by `--resume` to default
+ * `--output` to the most recent run when no explicit dir is given, matching
+ * the convention used by promptfoo (`--resume [evalId]`) and OpenCompass
+ * (`-r [timestamp]`).
  */
 export async function resolveCachedRunDir(cwd: string): Promise<string | undefined> {
   const cache = await loadRunCache(cwd);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 7fb3bbc0d..31ac0cab4 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -3,6 +3,7 @@ import { access, readFile } from 'node:fs/promises';
 import { createRequire as createNodeRequire } from 'node:module';
 import path from 'node:path';
 import { pathToFileURL } from 'node:url';
+import pLimit from 'p-limit';
 
 import {
   DEFAULT_THRESHOLD,
@@ -54,6 +55,7 @@ import {
   aggregateRunDir,
   buildEvalTestTargetKey,
   buildEvaluationResultTargetKey,
+  buildTestTargetKey,
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifactsFromResults,
@@ -135,6 +137,7 @@ interface NormalizedOptions {
   readonly retryErrors?: string;
   readonly resume: boolean;
   readonly rerunFailed: boolean;
+  readonly rerunFailedSource?: string;
   readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
   readonly keepWorkspaces: boolean;
@@ -609,8 +612,10 @@ function normalizeOptions(
     otelGroupTurns:
       normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
     retryErrors: normalizeString(rawOptions.retryErrors),
-    resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed),
-    rerunFailed: normalizeBoolean(rawOptions.rerunFailed),
+    resume:
+      normalizeBoolean(rawOptions.resume) || normalizeString(rawOptions.rerunFailed) !== undefined,
+    rerunFailed: normalizeString(rawOptions.rerunFailed) !== undefined,
+    rerunFailedSource: normalizeString(rawOptions.rerunFailed),
     workspaceMode,
     workspacePath,
     // Precedence: CLI > YAML config > TS config
@@ -1164,6 +1169,27 @@ async function readExistingResultsFromRunDir(runDir: string): Promise<Evaluation
   return results;
 }
 
+async function resolveRerunFailedRunDir(cwd: string, source: string): Promise<string> {
+  const trimmed = source.trim();
+  if (!trimmed) {
+    throw new Error('--rerun-failed requires a run ID, run workspace, or index.jsonl path.');
+  }
+
+  const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
+  if (existsSync(candidate)) {
+    return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate;
+  }
+
+  const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed);
+  if (existsSync(runIdCandidate)) {
+    return runIdCandidate;
+  }
+
+  throw new Error(
+    `Run not found for --rerun-failed: ${source}. Expected a run ID under .agentv/results, a run workspace, or an index.jsonl path.`,
+  );
+}
+
 async function prepareFileMetadata(params: {
   readonly testFilePath: string;
   readonly repoRoot: string;
@@ -1830,17 +1856,16 @@ export async function runEvalCommand(
     }
   }
 
-  // --resume / --rerun-failed without an explicit --output: default to the
+  // --resume without an explicit --output: default to the
   // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
   // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
   // convention. The cache pointer is written by saveRunCache after every eval.
-  if (options.resume && !options.retryErrors && !options.outputDir) {
+  if (options.resume && !options.rerunFailedSource && !options.retryErrors && !options.outputDir) {
     const cachedDir = await resolveCachedRunDir(cwd);
     if (cachedDir) {
       options = { ...options, outputDir: cachedDir };
-      const flagLabel = options.rerunFailed ? 'rerun-failed' : 'resume';
       const displayDir = path.relative(cwd, cachedDir) || cachedDir;
-      console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`);
+      console.log(`Auto-detected last run dir for --resume: ${displayDir}`);
     }
   }
 
@@ -1849,22 +1874,35 @@ export async function runEvalCommand(
   let resumeSkipKeys: Set<string> | undefined;
   let isResumeAppend = false;
   if (options.resume && !options.retryErrors) {
-    const explicitResumeDir = options.outputDir;
-    if (explicitResumeDir) {
-      const resumeDir = path.resolve(explicitResumeDir);
-      const resumeIndexPaths = discoverRunManifestPaths(resumeDir);
+    const sourceRunDir = options.rerunFailedSource
+      ? await resolveRerunFailedRunDir(cwd, options.rerunFailedSource)
+      : options.outputDir
+        ? path.resolve(options.outputDir)
+        : undefined;
+
+    if (sourceRunDir) {
+      if (options.rerunFailedSource && !options.outputDir) {
+        options = { ...options, outputDir: sourceRunDir };
+      }
+
+      const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir);
       if (resumeIndexPaths.length > 0) {
-        const existingResults = await readExistingResultsFromRunDir(resumeDir);
+        const existingResults = await readExistingResultsFromRunDir(sourceRunDir);
         resumeSkipKeys = new Set<string>();
+        let completedResultCount = 0;
         for (const r of existingResults) {
           if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
+            completedResultCount += 1;
             resumeSkipKeys.add(buildEvaluationResultTargetKey(r));
+            resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant));
           }
         }
-        isResumeAppend = true;
+        isResumeAppend =
+          options.outputDir !== undefined &&
+          path.resolve(options.outputDir) === path.resolve(sourceRunDir);
         const modeLabel = options.rerunFailed ? 'Rerun-failed' : 'Resume';
         console.log(
-          `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`,
+          `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`,
         );
       } else {
         // No existing bundle manifest — behave like a normal run.
@@ -2121,7 +2159,8 @@ export async function runEvalCommand(
         const target = selection.targetName;
         const variant = targetVariantForSelection(selection);
         const key = buildEvalTestTargetKey(test, target, variant);
-        if (resumeSkipKeys?.has(key)) {
+        const fallbackKey = buildTestTargetKey(test.id, target, variant);
+        if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) {
           resumeSkippedCount++;
         } else {
           totalEvalCount++;
@@ -2345,127 +2384,143 @@ export async function runEvalCommand(
         continue;
       }
 
-      // Run all targets concurrently (each target has its own worker limit)
+      const fileWorkerLimit = Math.max(1, fileOptions.workers ?? DEFAULT_WORKERS);
+      const targetConcurrency =
+        targetPrep.selections.length > 1
+          ? Math.min(fileWorkerLimit, targetPrep.selections.length)
+          : 1;
+      const perTargetWorkers =
+        targetPrep.selections.length > 1
+          ? Math.max(1, Math.floor(fileWorkerLimit / targetConcurrency))
+          : fileWorkerLimit;
+      const limitTarget = pLimit(targetConcurrency);
+
+      // Run target matrix selections through a bounded pool. Each active target
+      // receives a slice of the worker budget so total in-process case execution
+      // never multiplies past max_concurrency.
       const targetResults = await Promise.all(
-        targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
-          // Target selection is suite/experiment/CLI runtime policy; every selected
-          // target runs every filtered test case for this eval file.
-          const targetName = selection.targetName;
-          const applicableTestCases = targetPrep.testCases;
-
-          // --resume / --rerun-failed: skip tests that are already completed
-          const filteredTestCases = resumeSkipKeys
-            ? applicableTestCases.filter(
-                (test) =>
-                  !resumeSkipKeys.has(
-                    buildEvalTestTargetKey(test, targetName, targetVariantForSelection(selection)),
-                  ),
-              )
-            : applicableTestCases;
-
-          if (filteredTestCases.length === 0) {
-            return [];
-          }
+        targetPrep.selections.map(({ selection, inlineTargetLabel }) =>
+          limitTarget(async () => {
+            // Target selection is suite/experiment/CLI runtime policy; every selected
+            // target runs every filtered test case for this eval file.
+            const targetName = selection.targetName;
+            const applicableTestCases = targetPrep.testCases;
+
+            // --resume / --rerun-failed: skip tests that are already completed
+            const filteredTestCases = resumeSkipKeys
+              ? applicableTestCases.filter((test) => {
+                  const variant = targetVariantForSelection(selection);
+                  return (
+                    !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) &&
+                    !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant))
+                  );
+                })
+              : applicableTestCases;
+
+            if (filteredTestCases.length === 0) {
+              return [];
+            }
 
-          try {
-            const runGroups = groupTestsByRunPolicy({
-              tests: filteredTestCases,
-              options: fileOptions,
-              defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig,
-              defaultThreshold: targetPrep.threshold ?? fileOptions.threshold,
-              defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds,
-              defaultBudgetUsd: targetPrep.budgetUsd,
-            });
-            const groupResults: EvaluationResult[] = [];
-            for (const group of runGroups) {
-              hasScopedRunPolicies ||= group.policy.hasScopedOverride;
-              const result = await runSingleEvalFile({
-                testFilePath,
-                cwd,
-                repoRoot,
+            try {
+              const runGroups = groupTestsByRunPolicy({
+                tests: filteredTestCases,
                 options: fileOptions,
-                outputWriter,
-                otelExporter,
-                cache,
-                evaluationRunner,
-                workersOverride: fileOptions.workers,
-                progressReporter,
-                seenTestCases,
-                displayIdTracker,
-                selection,
-                inlineTargetLabel,
-                testCases: group.tests,
-                trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig,
-                agentTimeoutSeconds: group.policy.timeoutSeconds,
-                matrixMode: targetPrep.selections.length > 1,
-                budgetUsd: group.policy.budgetUsd,
-                runBudgetTracker: fileBudgetTracker,
-                failOnError: targetPrep.failOnError,
-                threshold: group.policy.threshold,
-                providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
-              });
-              groupResults.push(...result.results);
-            }
-            const evalFile = path.relative(cwd, testFilePath);
-            const existingSummary = remoteEvalSummaries.find(
-              (summary) => summary.evalFile === evalFile,
-            );
-            if (existingSummary) {
-              existingSummary.results.push(...groupResults);
-            } else {
-              remoteEvalSummaries.push({
-                evalFile,
-                results: [...groupResults],
+                defaultTrialsConfig: fileOptions.transcript ? undefined : targetPrep.trialsConfig,
+                defaultThreshold: targetPrep.threshold ?? fileOptions.threshold,
+                defaultTimeoutSeconds: fileOptions.agentTimeoutSeconds,
+                defaultBudgetUsd: targetPrep.budgetUsd,
               });
-            }
-
-            return groupResults;
-          } catch (fileError) {
-            // before_all or other setup failures should not abort the entire run.
-            // Mark all tests in this file as errors and continue with other files.
-            const message = fileError instanceof Error ? fileError.message : String(fileError);
-            console.error(
-              `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
-            );
-            const explicitVariant = targetVariantForSelection(selection);
-            const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) =>
-              withSourceMetadata(
-                {
-                  timestamp: new Date().toISOString(),
-                  testId: testCase.testId ?? testCase.id,
-                  prompt: testCase.prompt,
-                  score: 0,
-                  assertions: [],
-                  output: message,
-                  trace: buildTraceFromMessages({
-                    input: testCase.input as EvaluationResult['input'],
-                    output: [{ role: 'assistant' as const, content: message }],
-                    finalOutput: message,
-                    target: selection.targetName,
+              const groupResults: EvaluationResult[] = [];
+              for (const group of runGroups) {
+                hasScopedRunPolicies ||= group.policy.hasScopedOverride;
+                const result = await runSingleEvalFile({
+                  testFilePath,
+                  cwd,
+                  repoRoot,
+                  options: fileOptions,
+                  outputWriter,
+                  otelExporter,
+                  cache,
+                  evaluationRunner,
+                  workersOverride: perTargetWorkers,
+                  progressReporter,
+                  seenTestCases,
+                  displayIdTracker,
+                  selection,
+                  inlineTargetLabel,
+                  testCases: group.tests,
+                  trialsConfig: fileOptions.transcript ? undefined : group.policy.trialsConfig,
+                  agentTimeoutSeconds: group.policy.timeoutSeconds,
+                  matrixMode: targetPrep.selections.length > 1,
+                  budgetUsd: group.policy.budgetUsd,
+                  runBudgetTracker: fileBudgetTracker,
+                  failOnError: targetPrep.failOnError,
+                  threshold: group.policy.threshold,
+                  providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
+                });
+                groupResults.push(...result.results);
+              }
+              const evalFile = path.relative(cwd, testFilePath);
+              const existingSummary = remoteEvalSummaries.find(
+                (summary) => summary.evalFile === evalFile,
+              );
+              if (existingSummary) {
+                existingSummary.results.push(...groupResults);
+              } else {
+                remoteEvalSummaries.push({
+                  evalFile,
+                  results: [...groupResults],
+                });
+              }
+
+              return groupResults;
+            } catch (fileError) {
+              // before_all or other setup failures should not abort the entire run.
+              // Mark all tests in this file as errors and continue with other files.
+              const message = fileError instanceof Error ? fileError.message : String(fileError);
+              console.error(
+                `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
+              );
+              const explicitVariant = targetVariantForSelection(selection);
+              const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) =>
+                withSourceMetadata(
+                  {
+                    timestamp: new Date().toISOString(),
                     testId: testCase.testId ?? testCase.id,
-                    conversationId: testCase.conversation_id,
+                    prompt: testCase.prompt,
+                    score: 0,
+                    assertions: [],
+                    output: message,
+                    trace: buildTraceFromMessages({
+                      input: testCase.input as EvaluationResult['input'],
+                      output: [{ role: 'assistant' as const, content: message }],
+                      finalOutput: message,
+                      target: selection.targetName,
+                      testId: testCase.testId ?? testCase.id,
+                      conversationId: testCase.conversation_id,
+                      error: message,
+                    }),
+                    scores: [],
                     error: message,
-                  }),
-                  scores: [],
-                  error: message,
-                  executionStatus: 'execution_error' as const,
-                  failureStage: 'setup' as const,
-                  failureReasonCode: 'setup_error' as const,
-                  durationMs: 0,
-                  tokenUsage: { input: 0, output: 0 },
-                  target: selection.targetName,
-                  variant: explicitVariant,
-                },
-                testFilePath,
-                fileOptions,
-              ),
-            );
-            for (const errResult of errorResults) {
-              await outputWriter.append(errResult);
+                    executionStatus: 'execution_error' as const,
+                    failureStage: 'setup' as const,
+                    failureReasonCode: 'setup_error' as const,
+                    durationMs: 0,
+                    tokenUsage: { input: 0, output: 0 },
+                    target: selection.targetName,
+                    variant: explicitVariant,
+                  },
+                  testFilePath,
+                  fileOptions,
+                ),
+              );
+              for (const errResult of errorResults) {
+                await outputWriter.append(errResult);
+              }
+              return errorResults;
             }
-            return errorResults;
-          }
-        }),
+          }),
+        ),
       );
       for (const results of targetResults) {
         allResults.push(...results);
@@ -2653,7 +2708,7 @@ export async function runEvalCommand(
       const relativeRunDir = path.relative(cwd, runDir);
       console.log(
         `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
-          `  agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`,
+          `  agentv eval run ${evalFileArgs}${targetFlag} --rerun-failed ${relativeRunDir}`,
       );
     }
 
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
index 9a4617eb5..3fb397621 100644
--- a/apps/cli/src/commands/results/eval-runner.ts
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -171,6 +171,9 @@ function validateResumeOptions(req: RunEvalRequest): string | undefined {
   if (modes.length > 1) {
     return `resume, rerun_failed, and retry_errors are mutually exclusive (got: ${modes.join(', ')})`;
   }
+  if (req.rerun_failed && !req.output?.trim()) {
+    return 'rerun_failed requires output to identify the prior run workspace';
+  }
   return undefined;
 }
 
@@ -230,7 +233,7 @@ function buildCliArgs(req: RunEvalRequest, experiment?: string): string[] {
     args.push('--resume');
   }
   if (req.rerun_failed) {
-    args.push('--rerun-failed');
+    args.push('--rerun-failed', req.output?.trim() ?? '');
   }
   if (req.retry_errors?.trim()) {
     args.push('--retry-errors', req.retry_errors.trim());
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 77228476d..2d27c4c4b 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -4031,7 +4031,7 @@ describe('serve app', () => {
       });
       expect(res.status).toBe(202);
       const data = (await res.json()) as { command: string };
-      expect(data.command).toContain('--rerun-failed');
+      expect(data.command).toContain('--rerun-failed .agentv/results/r1');
       expect(data.command).toContain('--output .agentv/results/r1');
     });
 
@@ -4215,7 +4215,7 @@ describe('serve app', () => {
       });
       expect(res.status).toBe(200);
       const data = (await res.json()) as { command: string };
-      expect(data.command).toContain('--rerun-failed');
+      expect(data.command).toContain('--rerun-failed .agentv/results/r1');
       expect(data.command).not.toContain('--resume');
     });
 
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 419d06d23..a88638f06 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -933,6 +933,101 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
+  it('reruns failed rows from a canonical run id', async () => {
+    const fixture = await createFixture();
+    try {
+      const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run');
+      const first = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--output',
+        priorRunDir,
+        '--threshold',
+        '0.8',
+      ]);
+      expect(first.exitCode).toBe(1);
+      const priorIndexPath = path.join(priorRunDir, 'index.jsonl');
+      const priorRows = (await readJsonLines(priorIndexPath)) as Array<Record<string, unknown>>;
+      await writeFile(
+        priorIndexPath,
+        `${priorRows
+          .map((row) =>
+            JSON.stringify({
+              ...row,
+              execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok',
+            }),
+          )
+          .join('\n')}\n`,
+        'utf8',
+      );
+
+      const second = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--rerun-failed',
+        'prior-run',
+        '--threshold',
+        '0.8',
+      ]);
+      expect(second.exitCode).toBe(1);
+      expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1');
+
+      const diagnostics = await readDiagnostics(fixture);
+      const calls = diagnostics.calls as Array<Record<string, unknown>>;
+      expect(calls.at(-1)).toMatchObject({
+        evalCaseIds: ['case-alpha'],
+      });
+
+      const rows = await readJsonLines(priorIndexPath);
+      expect(rows).toHaveLength(3);
+      expect((rows.at(-1) as Record<string, unknown>).test_id).toBe('case-alpha');
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
+  it('does not multiply max_concurrency across target matrix selections', async () => {
+    const fixture = await createFixture();
+    try {
+      const evalPath = path.join(fixture.suiteDir, 'target-matrix.eval.yaml');
+      await writeFile(
+        evalPath,
+        [
+          'name: target-matrix',
+          'target: file-target',
+          'tests:',
+          '  - id: first-case',
+          '    input: first',
+          '    criteria: ok',
+          '  - id: second-case',
+          '    input: second',
+          '    criteria: ok',
+          '',
+        ].join('\n'),
+        'utf8',
+      );
+
+      const { exitCode } = await runCli(fixture, [
+        'eval',
+        evalPath,
+        '--workers',
+        '2',
+        '--target',
+        'file-target',
+        '--target',
+        'cli-target',
+      ]);
+
+      expect(exitCode).toBe(0);
+      const diagnostics = await readDiagnostics(fixture);
+      const calls = diagnostics.calls as Array<Record<string, unknown>>;
+      expect(calls).toHaveLength(2);
+      expect(calls.map((call) => call.maxConcurrency)).toEqual([1, 1]);
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
   it('records CLI-named experiment namespace separately from default runtime config', async () => {
     const fixture = await createFixture();
     try {
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
index 5c30221ff..4103e4eaf 100644
--- a/apps/cli/test/fixtures/mock-run-evaluation.ts
+++ b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -64,6 +64,8 @@ interface EvaluationResultLike {
   readonly timestamp: string;
 }
 
+let diagnosticsWriteQueue: Promise<void> = Promise.resolve();
+
 function evalCaseIds(evalCases: ReadonlyArray<unknown> | undefined): readonly string[] {
   if (!Array.isArray(evalCases) || evalCases.length === 0) {
     return ['case-alpha', 'case-beta'];
@@ -210,17 +212,20 @@ async function maybeWriteDiagnostics(
     resultCount: results.length,
   } satisfies Record<string, unknown>;
 
-  const priorCalls = await readFile(diagnosticsPath, 'utf8')
-    .then((raw) => {
-      const parsed = JSON.parse(raw) as { readonly calls?: unknown };
-      return Array.isArray(parsed.calls) ? parsed.calls : [parsed];
-    })
-    .catch(() => []);
-  await writeFile(
-    diagnosticsPath,
-    JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2),
-    'utf8',
-  );
+  diagnosticsWriteQueue = diagnosticsWriteQueue.then(async () => {
+    const priorCalls = await readFile(diagnosticsPath, 'utf8')
+      .then((raw) => {
+        const parsed = JSON.parse(raw) as { readonly calls?: unknown };
+        return Array.isArray(parsed.calls) ? parsed.calls : [parsed];
+      })
+      .catch(() => []);
+    await writeFile(
+      diagnosticsPath,
+      JSON.stringify({ ...payload, calls: [...priorCalls, payload] }, null, 2),
+      'utf8',
+    );
+  });
+  await diagnosticsWriteQueue;
 }
 
 async function maybeWritePromptDump(
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 59c0d8366..2639427a5 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -296,7 +296,7 @@ agentv eval evals/my-eval.yaml --export-otel
 
 ### Parallelism
 
-The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts.
+The `--workers N` flag controls the in-process worker pool for a single eval file (default: 3). Eval files always run sequentially — one file completes before the next starts. In target-matrix runs, selected targets share that worker budget instead of each target creating its own full pool.
 
 ```bash
 agentv eval evals/my-eval.yaml --workers 4
@@ -304,6 +304,9 @@ agentv eval evals/my-eval.yaml --workers 4
 
 agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
 # Files run one at a time; within each file, up to 3 test cases run in parallel
+
+agentv eval evals/my-eval.yaml --target gpt --target claude --workers 4
+# The target matrix shares the same 4-worker budget
 ```
 
 This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration.
@@ -355,10 +358,10 @@ AgentV ships three flags for picking up a partial run. They differ only in **whi
 | Flag | What it skips | What it re-runs | Use when |
 |------|---------------|-----------------|----------|
 | `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish |
-| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing |
+| `--rerun-failed <run_id>` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing |
 | `--retry-errors <path>` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to |
 
-`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output <dir>` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`).
+`--resume` appends to the existing `index.jsonl` in `--output <dir>`; when omitted it defaults to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. `--rerun-failed <run_id>` reads a specific canonical run bundle from `.agentv/results/<run_id>` and, when `--output` is omitted, appends replacement rows to that same bundle. You can also pass a run workspace path or `index.jsonl` path instead of a bare run ID. `--retry-errors` takes the prior run's path directly and re-runs only execution errors or missing cases.
 
 ```bash
 # Resume the last run — no args needed; AgentV finds it from .agentv/cache.json
@@ -367,8 +370,8 @@ agentv eval evals/my-eval.yaml --resume
 # Or target a specific run dir explicitly
 agentv eval evals/my-eval.yaml --output .agentv/results/<run_id> --resume
 
-# Re-run errors AND failed cases against the last run dir
-agentv eval evals/my-eval.yaml --rerun-failed
+# Re-run errors AND failed cases from a specific canonical run
+agentv eval evals/my-eval.yaml --rerun-failed <run_id>
 
 # Re-run only execution errors from any prior run by path
 agentv eval evals/my-eval.yaml --retry-errors .agentv/results/<run_id>/index.jsonl
diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
index f907aeb81..06e9694d4 100644
--- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
+++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -132,6 +132,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre
 
 The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.
 
+Before a slot is reused for another case, AgentV resets it to the slot baseline. A pooled workspace is a performance cache, not shared mutable state between cases.
+
 **Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above.
 
 ## Drift detection
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 0e91ad79b..a393bb06e 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -1235,13 +1235,17 @@ export async function runEvaluation(
       // shared owner prepare without inheriting a child suite's workspace.
       const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup);
       const testPoolSlot =
-        usesSharedWorkspace && availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined;
+        usesSharedWorkspace && availablePoolSlots.length > 0
+          ? availablePoolSlots.pop()
+          : usesSharedWorkspace
+            ? poolSlot
+            : undefined;
       const testWorkspacePath = usesSharedWorkspace
         ? (testPoolSlot?.path ?? sharedWorkspacePath)
         : undefined;
       const testBaselineCommit = usesSharedWorkspace
         ? testPoolSlot
-          ? poolSlotBaselines.get(testPoolSlot.path)
+          ? (poolSlotBaselines.get(testPoolSlot.path) ?? sharedBaselineCommit)
           : sharedBaselineCommit
         : undefined;
       const testExtensionState = usesSharedWorkspace
@@ -1355,9 +1359,30 @@ export async function runEvaluation(
         }
         throw error;
       } finally {
-        // Return pool slot for reuse by next test
+        // Return pool slot for reuse by next test only after resetting it to
+        // the per-slot baseline. Pooling is a local performance optimization,
+        // not shared state between eval cases.
         if (testPoolSlot) {
-          availablePoolSlots.push(testPoolSlot);
+          const shouldReturnPoolSlot = testPoolSlot !== poolSlot;
+          const resetMode = workspaceClean === 'full' ? 'strict' : 'fast';
+          let resetSucceeded = true;
+          try {
+            if (repoManager && suiteWorkspace?.repos?.length) {
+              await repoManager.reset(suiteWorkspace.repos, testPoolSlot.path, resetMode);
+            }
+            await resetWorkspaceRoot(testPoolSlot.path, resetMode, testBaselineCommit);
+          } catch (resetError) {
+            resetSucceeded = false;
+            if (verbose) {
+              const message = resetError instanceof Error ? resetError.message : String(resetError);
+              console.warn(
+                `Warning: failed to reset workspace pool slot ${testPoolSlot.index}; leaving it out of reuse: ${message}`,
+              );
+            }
+          }
+          if (resetSucceeded && shouldReturnPoolSlot) {
+            availablePoolSlots.push(testPoolSlot);
+          }
         }
       }
     }
diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts
index 311a566c1..e687a0cb5 100644
--- a/packages/core/src/evaluation/run-artifacts.ts
+++ b/packages/core/src/evaluation/run-artifacts.ts
@@ -2052,6 +2052,13 @@ function indexRecordReplacementKey(record: unknown): string | undefined {
   return projectionIdentityRecordKey(record) ?? indexRecordKey(record);
 }
 
+function indexRecordReplacementKeys(record: unknown): readonly string[] {
+  const keys = [projectionIdentityRecordKey(record), indexRecordKey(record)].filter(
+    (key): key is string => typeof key === 'string' && key.length > 0,
+  );
+  return Array.from(new Set(keys));
+}
+
 function projectionIdentityRecordKey(record: unknown): string | undefined {
   if (!isRecord(record) || !isRecord(record.projection_identity)) {
     return undefined;
@@ -2141,10 +2148,9 @@ async function rewriteExistingIndexRecords(
   }
 
   const replacementsByKey = new Map(
-    replacements.flatMap((record) => {
-      const key = indexRecordReplacementKey(record);
-      return key ? [[key, record] as const] : [];
-    }),
+    replacements.flatMap((record) =>
+      indexRecordReplacementKeys(record).map((key) => [key, record] as const),
+    ),
   );
   const seen = new Set<string>();
   const records: unknown[] = [];
@@ -2158,7 +2164,9 @@ async function rewriteExistingIndexRecords(
       const replacement = key ? replacementsByKey.get(key) : undefined;
       if (key && replacement) {
         records.push(replacement);
-        seen.add(key);
+        for (const replacementKey of indexRecordReplacementKeys(replacement)) {
+          seen.add(replacementKey);
+        }
       } else {
         records.push(parsed);
       }
@@ -2166,8 +2174,8 @@ async function rewriteExistingIndexRecords(
   }
 
   for (const replacement of replacements) {
-    const key = indexRecordReplacementKey(replacement);
-    if (!key || !seen.has(key)) {
+    const keys = indexRecordReplacementKeys(replacement);
+    if (keys.length === 0 || keys.every((key) => !seen.has(key))) {
       records.push(replacement);
     }
   }
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index daccce706..1473a502a 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -1,10 +1,12 @@
 import { afterEach, describe, expect, it, mock, spyOn } from 'bun:test';
+import { execSync } from 'node:child_process';
 import {
   existsSync,
   mkdirSync,
   mkdtempSync,
   readFileSync,
   readdirSync,
+  rmSync,
   writeFileSync,
 } from 'node:fs';
 import { tmpdir } from 'node:os';
@@ -151,6 +153,31 @@ const baseTarget: ResolvedTarget = {
   config: { response: '{}' },
 };
 
+function cleanGitEnv(): Record<string, string> {
+  const env: Record<string, string> = {};
+  for (const [key, value] of Object.entries(process.env)) {
+    if (value !== undefined && !(key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND')) {
+      env[key] = value;
+    }
+  }
+  return env;
+}
+
+function createTestRepo(dir: string, files: Record<string, string>): string {
+  mkdirSync(dir, { recursive: true });
+  const opts = { cwd: dir, stdio: 'ignore' as const, env: cleanGitEnv() };
+  execSync('git init', opts);
+  execSync('git config user.email "test@test.com"', opts);
+  execSync('git config user.name "Test"', opts);
+  for (const [name, content] of Object.entries(files)) {
+    const filePath = path.join(dir, name);
+    mkdirSync(path.dirname(filePath), { recursive: true });
+    writeFileSync(filePath, content);
+  }
+  execSync('git add -A && git commit -m "initial"', opts);
+  return execSync('git rev-parse HEAD', { cwd: dir, env: cleanGitEnv() }).toString().trim();
+}
+
 const evaluatorRegistry = {
   'llm-grader': {
     kind: 'llm-grader',
@@ -638,6 +665,114 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     expect(result.score).toBeGreaterThan(0);
   });
 
+  it('does not retry completed quality failures', async () => {
+    const provider = new SequenceProvider('mock', {
+      responses: [
+        {
+          output: [{ role: 'assistant', content: 'Incomplete response.' }],
+        },
+      ],
+    });
+    const failingEvaluators = {
+      'llm-grader': {
+        kind: 'llm-grader',
+        async evaluate() {
+          return {
+            score: 0.1,
+            verdict: 'fail' as const,
+            assertions: [{ text: 'quality miss', passed: false }],
+            expectedAspectCount: 1,
+          };
+        },
+      },
+    };
+
+    const result = await runEvalCase({
+      evalCase: baseTestCase,
+      provider,
+      target: baseTarget,
+      evaluators: failingEvaluators,
+      maxRetries: 3,
+    });
+
+    expect(provider.callIndex).toBe(1);
+    expect(result.executionStatus).toBe('quality_failure');
+    expect(result.retryIndex).toBe(0);
+  });
+
+  it('resets a pooled workspace slot before reusing it for the next case', async () => {
+    const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-'));
+    const previousAgentvHome = process.env.AGENTV_HOME;
+    const previousAgentvDataDir = process.env.AGENTV_DATA_DIR;
+    process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
+    process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data');
+
+    try {
+      const sourceRepo = path.join(tempDir, 'source-repo');
+      const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' });
+      const workspace = {
+        repos: [
+          {
+            path: './repo-a',
+            repo: `file://${sourceRepo}`,
+            commit: cleanCommit,
+          },
+        ],
+      };
+      const seenStaleBeforeSecond: boolean[] = [];
+      let callCount = 0;
+      const provider: Provider = {
+        id: 'mock:pooled-reset',
+        kind: 'mock' as const,
+        targetName: 'pooled-reset',
+        async invoke(request: ProviderRequest): Promise<ProviderResponse> {
+          callCount += 1;
+          if (!request.cwd) {
+            throw new Error('missing cwd');
+          }
+          const repoDir = path.join(request.cwd, 'repo-a');
+          if (callCount === 1) {
+            writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n');
+            writeFileSync(path.join(repoDir, 'stale.txt'), 'stale\n');
+          } else {
+            seenStaleBeforeSecond.push(existsSync(path.join(repoDir, 'stale.txt')));
+            expect(readFileSync(path.join(repoDir, 'tracked.txt'), 'utf8')).toBe('clean\n');
+          }
+          return { output: [{ role: 'assistant', content: `response ${callCount}` }] };
+        },
+      };
+
+      const results = await runEvaluation({
+        testFilePath: path.join(tempDir, 'eval.yaml'),
+        repoRoot: tempDir,
+        target: { ...baseTarget, name: 'pooled-reset' },
+        providerFactory: () => provider,
+        evaluators: evaluatorRegistry,
+        workspaceMode: 'pooled',
+        maxConcurrency: 1,
+        evalCases: [
+          { ...baseTestCase, id: 'case-1', workspace },
+          { ...baseTestCase, id: 'case-2', workspace },
+        ],
+      });
+
+      expect(results).toHaveLength(2);
+      expect(seenStaleBeforeSecond).toEqual([false]);
+    } finally {
+      if (previousAgentvHome === undefined) {
+        process.env.AGENTV_HOME = undefined;
+      } else {
+        process.env.AGENTV_HOME = previousAgentvHome;
+      }
+      if (previousAgentvDataDir === undefined) {
+        process.env.AGENTV_DATA_DIR = undefined;
+      } else {
+        process.env.AGENTV_DATA_DIR = previousAgentvDataDir;
+      }
+      rmSync(tempDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
   it('applies exponential backoff between retries', async () => {
     const provider = new SequenceProvider('mock', {
       errors: [new Error('Transient failure')],

From 71c15301bcee745cee0a07b816d7e5802ec5a439 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 2 Jul 2026 15:09:16 +0200
Subject: [PATCH 2/3] fix(eval): constrain rerun-failed identities

---
 apps/cli/src/commands/eval/run-eval.ts        | 191 ++++++++++++++++--
 apps/cli/test/eval.integration.test.ts        | 164 +++++++++++++--
 apps/cli/test/fixtures/mock-run-evaluation.ts |   1 +
 packages/core/src/evaluation/orchestrator.ts  |  37 +++-
 packages/core/src/evaluation/types.ts         |   4 +
 5 files changed, 363 insertions(+), 34 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 31ac0cab4..e8c831c87 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -105,6 +105,137 @@ function shouldSkipExistingResultForResume(
   return result.executionStatus !== 'execution_error';
 }
 
+interface ResumeIdentityEntry {
+  readonly kind: 'precise' | 'legacy';
+  readonly key: string;
+  readonly result: EvaluationResult;
+}
+
+interface ResumeIdentityMatcher {
+  readonly preciseKeys: Set<string>;
+  readonly legacyKeys: Set<string>;
+}
+
+function hasNonEmptyString(value: unknown): value is string {
+  return typeof value === 'string' && value.trim().length > 0;
+}
+
+function objectRecord(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function resultProjectionDimensions(result: EvaluationResult): Record<string, unknown> | undefined {
+  const projectionIdentity = objectRecord(
+    (result as unknown as Record<string, unknown>).projectionIdentity,
+  );
+  return objectRecord(projectionIdentity?.dimensions);
+}
+
+function hasCanonicalResultIdentity(result: EvaluationResult): boolean {
+  const source = result.source;
+  const dimensions = resultProjectionDimensions(result);
+  const resultRecord = result as unknown as Record<string, unknown>;
+  return (
+    hasNonEmptyString(dimensions?.evalPath) ||
+    hasNonEmptyString(dimensions?.suite) ||
+    hasNonEmptyString(dimensions?.promptId) ||
+    hasNonEmptyString(resultRecord.evalPath) ||
+    hasNonEmptyString(source?.evalFileRepoPath) ||
+    hasNonEmptyString(source?.evalFilePath) ||
+    hasNonEmptyString(source?.evalFileAbsolutePath) ||
+    hasNonEmptyString(result.suite) ||
+    hasNonEmptyString(result.prompt?.id)
+  );
+}
+
+function resultResumeIdentityEntry(result: EvaluationResult): ResumeIdentityEntry {
+  if (hasCanonicalResultIdentity(result)) {
+    return {
+      kind: 'precise',
+      key: buildEvaluationResultTargetKey(result),
+      result,
+    };
+  }
+  return {
+    kind: 'legacy',
+    key: buildTestTargetKey(result.testId, result.target, result.variant),
+    result,
+  };
+}
+
+function latestResumeIdentityEntries(
+  results: readonly EvaluationResult[],
+): readonly ResumeIdentityEntry[] {
+  const latestByIdentity = new Map<string, ResumeIdentityEntry>();
+  for (const result of results) {
+    const entry = resultResumeIdentityEntry(result);
+    latestByIdentity.set(`${entry.kind}:${entry.key}`, entry);
+  }
+  return Array.from(latestByIdentity.values());
+}
+
+function createResumeIdentityMatcher(): ResumeIdentityMatcher {
+  return { preciseKeys: new Set<string>(), legacyKeys: new Set<string>() };
+}
+
+function addResumeIdentityEntry(matcher: ResumeIdentityMatcher, entry: ResumeIdentityEntry): void {
+  if (entry.kind === 'legacy') {
+    matcher.legacyKeys.add(entry.key);
+    return;
+  }
+  matcher.preciseKeys.add(entry.key);
+}
+
+function uniqueStrings(values: readonly (string | undefined)[]): string[] {
+  return Array.from(new Set(values.filter(hasNonEmptyString)));
+}
+
+function buildPlannedResumeIdentityKeys(
+  test: EvalTest,
+  target: string,
+  variant: string | undefined,
+): readonly string[] {
+  const keys = new Set<string>([buildEvalTestTargetKey(test, target, variant)]);
+  const evalPaths = uniqueStrings([
+    test.source?.evalFileRepoPath,
+    test.source?.evalFilePath,
+    test.source?.evalFileAbsolutePath,
+  ]);
+  const suites = Array.from(new Set<string | null>([test.suite ?? null, null]));
+
+  for (const evalPath of evalPaths) {
+    for (const suite of suites) {
+      keys.add(
+        JSON.stringify({
+          eval_path: evalPath,
+          suite,
+          test_id: test.id ?? 'unknown',
+          prompt_id: test.prompt?.id ?? null,
+          target: target ?? 'unknown',
+          variant: variant ?? null,
+        }),
+      );
+    }
+  }
+
+  return Array.from(keys);
+}
+
+function resumeIdentityMatches(
+  matcher: ResumeIdentityMatcher,
+  test: EvalTest,
+  target: string,
+  variant: string | undefined,
+): boolean {
+  return (
+    buildPlannedResumeIdentityKeys(test, target, variant).some((key) =>
+      matcher.preciseKeys.has(key),
+    ) || matcher.legacyKeys.has(buildTestTargetKey(test.id, target, variant))
+  );
+}
+
 interface RunEvalCommandInput {
   readonly testFiles: readonly string[];
   readonly rawOptions: Record<string, unknown>;
@@ -1869,9 +2000,10 @@ export async function runEvalCommand(
     }
   }
 
-  // --resume / --rerun-failed: skip already-completed tests and append to existing output.
+  // --resume skips completed rows; --rerun-failed includes only latest failed/error rows.
   // IMPORTANT: JSONL must be loaded before the output writer is created (same file).
-  let resumeSkipKeys: Set<string> | undefined;
+  let resumeSkipKeys: ResumeIdentityMatcher | undefined;
+  let rerunIncludeKeys: ResumeIdentityMatcher | undefined;
   let isResumeAppend = false;
   if (options.resume && !options.retryErrors) {
     const sourceRunDir = options.rerunFailedSource
@@ -1888,13 +2020,15 @@ export async function runEvalCommand(
       const resumeIndexPaths = discoverRunManifestPaths(sourceRunDir);
       if (resumeIndexPaths.length > 0) {
         const existingResults = await readExistingResultsFromRunDir(sourceRunDir);
-        resumeSkipKeys = new Set<string>();
+        resumeSkipKeys = createResumeIdentityMatcher();
+        rerunIncludeKeys = options.rerunFailed ? createResumeIdentityMatcher() : undefined;
         let completedResultCount = 0;
-        for (const r of existingResults) {
-          if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
+        for (const entry of latestResumeIdentityEntries(existingResults)) {
+          if (shouldSkipExistingResultForResume(entry.result, options.rerunFailed)) {
             completedResultCount += 1;
-            resumeSkipKeys.add(buildEvaluationResultTargetKey(r));
-            resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target, r.variant));
+            addResumeIdentityEntry(resumeSkipKeys, entry);
+          } else if (rerunIncludeKeys) {
+            addResumeIdentityEntry(rerunIncludeKeys, entry);
           }
         }
         isResumeAppend =
@@ -1904,6 +2038,9 @@ export async function runEvalCommand(
         console.log(
           `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${completedResultCount} completed.`,
         );
+      } else if (options.rerunFailed) {
+        rerunIncludeKeys = createResumeIdentityMatcher();
+        console.log('Rerun-failed: no existing bundle run manifest found. Nothing to rerun.');
       } else {
         // No existing bundle manifest — behave like a normal run.
         console.log('Resume: no existing bundle run manifest found, starting fresh run.');
@@ -2158,9 +2295,13 @@ export async function runEvalCommand(
       for (const { selection } of meta.selections) {
         const target = selection.targetName;
         const variant = targetVariantForSelection(selection);
-        const key = buildEvalTestTargetKey(test, target, variant);
-        const fallbackKey = buildTestTargetKey(test.id, target, variant);
-        if (resumeSkipKeys?.has(key) || resumeSkipKeys?.has(fallbackKey)) {
+        if (rerunIncludeKeys) {
+          if (resumeIdentityMatches(rerunIncludeKeys, test, target, variant)) {
+            totalEvalCount++;
+          } else {
+            resumeSkippedCount++;
+          }
+        } else if (resumeSkipKeys && resumeIdentityMatches(resumeSkipKeys, test, target, variant)) {
           resumeSkippedCount++;
         } else {
           totalEvalCount++;
@@ -2175,6 +2316,10 @@ export async function runEvalCommand(
       console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
       return;
     }
+    if (rerunIncludeKeys) {
+      console.log('Nothing to rerun — no failed or errored test(s) matched the current suite.');
+      return;
+    }
     // When using --resume, all tests being completed means nothing to resume
     if (resumeSkipKeys && resumeSkippedCount > 0) {
       console.log(`Nothing to resume — all ${resumeSkippedCount} test(s) already completed.`);
@@ -2406,16 +2551,22 @@ export async function runEvalCommand(
             const targetName = selection.targetName;
             const applicableTestCases = targetPrep.testCases;
 
-            // --resume / --rerun-failed: skip tests that are already completed
-            const filteredTestCases = resumeSkipKeys
-              ? applicableTestCases.filter((test) => {
-                  const variant = targetVariantForSelection(selection);
-                  return (
-                    !resumeSkipKeys.has(buildEvalTestTargetKey(test, targetName, variant)) &&
-                    !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName, variant))
-                  );
-                })
-              : applicableTestCases;
+            // --resume skips completed tests; --rerun-failed only includes prior failed/error tests.
+            const filteredTestCases = rerunIncludeKeys
+              ? applicableTestCases.filter((test) =>
+                  resumeIdentityMatches(
+                    rerunIncludeKeys,
+                    test,
+                    targetName,
+                    targetVariantForSelection(selection),
+                  ),
+                )
+              : resumeSkipKeys
+                ? applicableTestCases.filter((test) => {
+                    const variant = targetVariantForSelection(selection);
+                    return !resumeIdentityMatches(resumeSkipKeys, test, targetName, variant);
+                  })
+                : applicableTestCases;
 
             if (filteredTestCases.length === 0) {
               return [];
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index a88638f06..31dbce6d7 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -933,7 +933,7 @@ describe('agentv eval CLI', () => {
     }
   }, 30_000);
 
-  it('reruns failed rows from a canonical run id', async () => {
+  it('reruns only latest failed rows from a canonical run id', async () => {
     const fixture = await createFixture();
     try {
       const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-run');
@@ -948,18 +948,61 @@ describe('agentv eval CLI', () => {
       expect(first.exitCode).toBe(1);
       const priorIndexPath = path.join(priorRunDir, 'index.jsonl');
       const priorRows = (await readJsonLines(priorIndexPath)) as Array<Record<string, unknown>>;
+      const alphaRow = priorRows.find((row) => row.test_id === 'case-alpha');
+      const betaRow = priorRows.find((row) => row.test_id === 'case-beta');
+      if (!alphaRow || !betaRow) {
+        throw new Error('Expected prior rows for case-alpha and case-beta');
+      }
       await writeFile(
         priorIndexPath,
-        `${priorRows
-          .map((row) =>
-            JSON.stringify({
-              ...row,
-              execution_status: row.test_id === 'case-alpha' ? 'quality_failure' : 'ok',
-            }),
-          )
+        `${[
+          ...priorRows.map((row) => ({
+            ...row,
+            execution_status: row.test_id === 'case-alpha' ? 'ok' : 'quality_failure',
+          })),
+          { ...alphaRow, execution_status: 'quality_failure' },
+          { ...betaRow, execution_status: 'ok' },
+        ]
+          .map((row) => JSON.stringify(row))
           .join('\n')}\n`,
         'utf8',
       );
+      await writeFile(
+        fixture.testFilePath,
+        `description: CLI integration test
+target: file-target
+
+tests:
+  - id: case-alpha
+    criteria: System responds with alpha
+    input:
+      - role: user
+        content: |
+          Please respond with alpha
+    expected_output:
+      - role: assistant
+        content: "Alpha"
+  - id: case-beta
+    criteria: System responds with beta
+    input:
+      - role: user
+        content: |
+          Please respond with beta
+    expected_output:
+      - role: assistant
+        content: "Beta"
+  - id: case-gamma
+    criteria: System responds with gamma
+    input:
+      - role: user
+        content: |
+          Please respond with gamma
+    expected_output:
+      - role: assistant
+        content: "Gamma"
+`,
+        'utf8',
+      );
 
       const second = await runCli(fixture, [
         'eval',
@@ -967,10 +1010,10 @@ describe('agentv eval CLI', () => {
         '--rerun-failed',
         'prior-run',
         '--threshold',
-        '0.8',
+        '0.5',
       ]);
-      expect(second.exitCode).toBe(1);
-      expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1');
+      expect(second.exitCode).toBe(0);
+      expect(second.stdout).toContain('Rerun-failed: found 4 existing result(s), skipping 1');
 
       const diagnostics = await readDiagnostics(fixture);
       const calls = diagnostics.calls as Array<Record<string, unknown>>;
@@ -979,13 +1022,110 @@ describe('agentv eval CLI', () => {
       });
 
       const rows = await readJsonLines(priorIndexPath);
-      expect(rows).toHaveLength(3);
+      expect(rows).toHaveLength(5);
       expect((rows.at(-1) as Record<string, unknown>).test_id).toBe('case-alpha');
     } finally {
       await rm(fixture.baseDir, { recursive: true, force: true });
     }
   }, 30_000);
 
+  it('does not use coarse fallback keys for precise rerun-failed identities', async () => {
+    const fixture = await createFixture();
+    try {
+      const firstEvalPath = path.join(fixture.suiteDir, 'collision-a.eval.yaml');
+      const secondEvalPath = path.join(fixture.suiteDir, 'collision-b.eval.yaml');
+      const evalContent = (name: string) => `description: ${name}
+target: file-target
+
+tests:
+  - id: shared-case
+    criteria: System responds
+    input:
+      - role: user
+        content: |
+          Please respond for ${name}
+    expected_output:
+      - role: assistant
+        content: "Shared"
+`;
+      await writeFile(firstEvalPath, evalContent('collision a'), 'utf8');
+      await writeFile(secondEvalPath, evalContent('collision b'), 'utf8');
+
+      const priorRunDir = path.join(fixture.suiteDir, '.agentv', 'results', 'prior-collision');
+      const first = await runCli(fixture, [
+        'eval',
+        firstEvalPath,
+        '--output',
+        priorRunDir,
+        '--threshold',
+        '0.8',
+      ]);
+      expect(first.exitCode).toBe(0);
+
+      const priorIndexPath = path.join(priorRunDir, 'index.jsonl');
+      const priorRows = (await readJsonLines(priorIndexPath)) as Array<Record<string, unknown>>;
+      expect(priorRows).toHaveLength(1);
+      const baseRow = priorRows[0];
+      if (!baseRow) {
+        throw new Error('Expected one prior collision row');
+      }
+      const baseProjection = baseRow.projection_identity as Record<string, unknown>;
+      const baseDimensions = baseProjection.dimensions as Record<string, unknown>;
+      const secondProjection = {
+        ...baseProjection,
+        dimensions: {
+          ...baseDimensions,
+          eval_path: secondEvalPath,
+        },
+      };
+      await writeFile(
+        priorIndexPath,
+        `${[
+          { ...baseRow, execution_status: 'ok' },
+          {
+            ...baseRow,
+            projection_identity: secondProjection,
+            execution_status: 'quality_failure',
+          },
+        ]
+          .map((row) => JSON.stringify(row))
+          .join('\n')}\n`,
+        'utf8',
+      );
+
+      const second = await runCli(fixture, [
+        'eval',
+        firstEvalPath,
+        secondEvalPath,
+        '--rerun-failed',
+        'prior-collision',
+        '--threshold',
+        '0.8',
+      ]);
+      expect(second.exitCode).toBe(0);
+      expect(second.stdout).toContain('Rerun-failed: found 2 existing result(s), skipping 1');
+
+      const diagnostics = await readDiagnostics(fixture);
+      const calls = diagnostics.calls as Array<Record<string, unknown>>;
+      const rerunCalls = calls.slice(1);
+      expect(rerunCalls).toHaveLength(1);
+      const rerunCall = rerunCalls[0];
+      if (!rerunCall) {
+        throw new Error('Expected one rerun diagnostics call');
+      }
+      expect(path.basename(rerunCall.testFilePath as string)).toBe('collision-b.eval.yaml');
+      expect(rerunCall).toMatchObject({
+        evalCaseIds: ['shared-case'],
+      });
+
+      const rows = await readJsonLines(priorIndexPath);
+      expect(rows).toHaveLength(3);
+      expect((rows.at(-1) as Record<string, unknown>).test_id).toBe('shared-case');
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
   it('does not multiply max_concurrency across target matrix selections', async () => {
     const fixture = await createFixture();
     try {
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
index 4103e4eaf..c97edf54b 100644
--- a/apps/cli/test/fixtures/mock-run-evaluation.ts
+++ b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -173,6 +173,7 @@ async function maybeWriteDiagnostics(
   }
 
   const payload = {
+    testFilePath: options.testFilePath,
     target: options.target?.name,
     targetKind: options.target?.kind,
     targetModel:
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index a393bb06e..59635c657 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -457,6 +457,8 @@ export interface RunEvalCaseOptions {
   readonly evalFilePath?: string;
   /** Repo root used to serialize replay fixture eval_path as a stable relative path. */
   readonly repoRoot?: string;
+  /** Zero-based sample index produced by repeat.count. */
+  readonly sampleIndex?: number;
 }
 
 export interface ProgressEvent {
@@ -1741,6 +1743,7 @@ async function runBatchEvaluation(options: {
         promptInputs,
         nowFn,
         attempt: 0,
+        sampleIndex: 0,
         graderProvider: await resolveGraderProvider(target),
         agentTimeoutMs,
         output,
@@ -1782,6 +1785,7 @@ async function runBatchEvaluation(options: {
         'evaluator',
         'evaluator_error',
         verbose,
+        { sampleIndex: 0, retryIndex: 0 },
       );
       results.push(errorResult);
       if (onResult) {
@@ -1892,6 +1896,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
     replayRecording,
     evalFilePath,
     repoRoot,
+    sampleIndex = 0,
   } = options;
   const setupDebug = process.env.AGENTV_SETUP_DEBUG === '1';
 
@@ -1937,6 +1942,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       setupError?.failureStage ?? 'setup',
       setupError?.failureReasonCode ?? 'script_error',
       verbose,
+      { sampleIndex },
     );
   }
 
@@ -2074,6 +2080,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       availableTargets,
       evalFilePath,
       metadata: providerMetadata,
+      sampleIndex,
     });
     await runAfterEachHooks();
     conversationResult = {
@@ -2176,6 +2183,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       'agent',
       'provider_error',
       verbose,
+      { sampleIndex, retryIndex: attempt },
     );
     // On error, keep workspace for debugging (unless forceCleanup is set)
     if (workspacePath) {
@@ -2294,6 +2302,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       verbose,
       threshold: evalCase.threshold ?? caseThreshold,
       dependencyResults,
+      sampleIndex,
     });
     await runAfterEachHooks();
 
@@ -2414,6 +2423,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       'evaluator',
       'evaluator_error',
       verbose,
+      { sampleIndex, retryIndex: attempt },
     );
     // On error, keep workspace for debugging (only for per-case workspaces)
     if (workspacePath && !isSharedWorkspace) {
@@ -2453,6 +2463,7 @@ async function runEvalCaseWithTrials(
       keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false,
       retainOnSuccess: isLastDeclaredTrial ? options.retainOnSuccess : 'cleanup',
       retainOnFailure: isLastDeclaredTrial ? options.retainOnFailure : 'cleanup',
+      sampleIndex: attempt,
     };
 
     const result = await runEvalCase(trialOptions);
@@ -2464,6 +2475,8 @@ async function runEvalCaseWithTrials(
     const trialVerdict = result.executionStatus === 'ok' ? 'pass' : 'fail';
     const trial: TrialResult = {
       attempt,
+      sampleIndex: result.sampleIndex ?? attempt,
+      retryIndex: result.retryIndex,
       score: result.score,
       verdict: trialVerdict,
       scores: result.scores,
@@ -2546,6 +2559,8 @@ async function runEvalCaseWithTrials(
   return {
     ...baseResult,
     score,
+    sampleIndex: undefined,
+    retryIndex: undefined,
     trials: trialResults,
     aggregation,
     costLimited: costLimited || undefined,
@@ -2566,6 +2581,7 @@ async function evaluateCandidate(options: {
   readonly promptInputs: PromptInputs;
   readonly nowFn: () => Date;
   readonly attempt: number;
+  readonly sampleIndex: number;
   readonly graderProvider?: Provider;
   readonly agentTimeoutMs?: number;
   readonly output?: readonly Message[];
@@ -2596,6 +2612,7 @@ async function evaluateCandidate(options: {
     promptInputs,
     nowFn,
     attempt,
+    sampleIndex,
     graderProvider,
     agentTimeoutMs,
     output,
@@ -2699,7 +2716,10 @@ async function evaluateCandidate(options: {
       : undefined;
   return {
     timestamp: completedAt.toISOString(),
-    testId: evalCase.id,
+    testId: authoredResultTestId(evalCase),
+    prompt: evalCase.prompt,
+    sampleIndex,
+    retryIndex: attempt,
     source: evalCase.source,
     suite: evalCase.suite,
     category: evalCase.category,
@@ -3184,6 +3204,7 @@ async function runConversationMode(options: {
   readonly availableTargets?: readonly string[];
   readonly evalFilePath?: string;
   readonly metadata?: JsonObject;
+  readonly sampleIndex?: number;
 }): Promise<EvaluationResult> {
   const {
     evalCase,
@@ -3205,6 +3226,7 @@ async function runConversationMode(options: {
     availableTargets,
     evalFilePath,
     metadata,
+    sampleIndex = 0,
   } = options;
 
   // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate)
@@ -3334,6 +3356,7 @@ async function runConversationMode(options: {
       },
       nowFn,
       attempt: 0,
+      sampleIndex,
       graderProvider,
       agentTimeoutMs,
       output: response.output,
@@ -3396,6 +3419,7 @@ async function runConversationMode(options: {
       },
       nowFn,
       attempt: 0,
+      sampleIndex,
       graderProvider,
       agentTimeoutMs,
       verbose,
@@ -3447,7 +3471,10 @@ async function runConversationMode(options: {
 
   return {
     timestamp: nowFn().toISOString(),
-    testId: evalCase.id,
+    testId: authoredResultTestId(evalCase),
+    prompt: evalCase.prompt,
+    sampleIndex,
+    retryIndex: 0,
     suite: evalCase.suite,
     category: evalCase.category,
     score: finalScore,
@@ -3633,6 +3660,10 @@ function buildErrorResult(
   failureStage: FailureStage,
   failureReasonCode: string,
   verbose?: boolean,
+  identity?: {
+    readonly sampleIndex?: number;
+    readonly retryIndex?: number;
+  },
 ): EvaluationResult {
   const message = extractErrorMessage(error);
 
@@ -3681,6 +3712,8 @@ function buildErrorResult(
     timestamp: timestamp.toISOString(),
     testId: authoredResultTestId(evalCase),
     prompt: evalCase.prompt,
+    sampleIndex: identity?.sampleIndex,
+    retryIndex: identity?.retryIndex,
     suite: evalCase.suite,
     category: evalCase.category,
     conversationId: evalCase.conversation_id,
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 6be8c46f1..97de93fdf 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1180,6 +1180,10 @@ export interface TrialsConfig {
  */
 export interface TrialResult {
   readonly attempt: number;
+  /** Zero-based sample index produced from repeat.count. */
+  readonly sampleIndex?: number;
+  /** Provider retry index for the attempt that produced this trial result. */
+  readonly retryIndex?: number;
   readonly score: number;
   readonly verdict: EvaluationVerdict;
   readonly scores?: readonly GraderResult[];

From a25016e466e668cb5ac886abab815d7216dc862a Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 2 Jul 2026 18:18:20 +0200
Subject: [PATCH 3/3] fix(eval): quarantine failed workspace pool slots

---
 packages/core/src/evaluation/orchestrator.ts  | 123 +++++++++++++--
 .../core/test/evaluation/orchestrator.test.ts | 145 ++++++++++++++++++
 2 files changed, 258 insertions(+), 10 deletions(-)

diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 59635c657..d24066cfb 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -79,6 +79,7 @@ import type {
   TrialsConfig,
 } from './types.js';
 import { cleanupEvalWorkspaces, cleanupWorkspace } from './workspace/manager.js';
+import type { PoolSlot } from './workspace/pool-manager.js';
 import type { RepoManager } from './workspace/repo-manager.js';
 import {
   type ScriptExecutionContext,
@@ -996,6 +997,11 @@ export async function runEvaluation(
     let nextWorkerId = 1;
     const workerIdByEvalId = new Map<string, number>();
     let beforeAllOutputAttached = false;
+    const unavailablePoolSlotPaths = new Set<string>();
+    const poolSlotWaiters: Array<(slot: PoolSlot | undefined) => void> = [];
+    if (poolSlot && availablePoolSlots.length === 0) {
+      availablePoolSlots.push(poolSlot);
+    }
 
     // Suite-level budget tracking
     let cumulativeBudgetCost = 0;
@@ -1059,6 +1065,97 @@ export async function runEvaluation(
       return { ok: allPassed, depResults };
     }
 
+    function acquirePoolSlot(): Promise<PoolSlot | undefined> {
+      while (availablePoolSlots.length > 0) {
+        const candidate = availablePoolSlots.pop();
+        if (candidate && !unavailablePoolSlotPaths.has(candidate.path)) {
+          return Promise.resolve(candidate);
+        }
+      }
+
+      if (unavailablePoolSlotPaths.size >= poolSlots.length) {
+        return Promise.resolve(undefined);
+      }
+
+      return new Promise((resolve) => {
+        poolSlotWaiters.push(resolve);
+      });
+    }
+
+    function returnPoolSlot(slot: PoolSlot): void {
+      if (unavailablePoolSlotPaths.has(slot.path)) {
+        return;
+      }
+      const waiter = poolSlotWaiters.shift();
+      if (waiter) {
+        waiter(slot);
+      } else {
+        availablePoolSlots.push(slot);
+      }
+    }
+
+    function markPoolSlotUnavailable(slot: PoolSlot): void {
+      unavailablePoolSlotPaths.add(slot.path);
+      if (unavailablePoolSlotPaths.size >= poolSlots.length) {
+        while (poolSlotWaiters.length > 0) {
+          poolSlotWaiters.shift()?.(undefined);
+        }
+      }
+    }
+
+    async function emitSetupErrorResult(
+      evalCase: EvalTest,
+      workerId: number,
+      message: string,
+      failureReasonCode: string,
+    ): Promise<EvaluationResult> {
+      if (failOnError === true) {
+        failOnErrorTriggered = true;
+      }
+      const output = `Error occurred: ${message}`;
+      const result: EvaluationResult = {
+        timestamp: (now ?? (() => new Date()))().toISOString(),
+        testId: authoredResultTestId(evalCase),
+        suite: evalCase.suite,
+        category: evalCase.category,
+        prompt: evalCase.prompt,
+        score: 0,
+        assertions: [{ text: `Error: ${message}`, passed: false }],
+        output,
+        trace: buildTraceFromMessages({
+          input: evalCase.input as readonly Message[],
+          output: [{ role: 'assistant' as const, content: output }],
+          finalOutput: output,
+          target: target.name,
+          testId: authoredResultTestId(evalCase),
+          conversationId: evalCase.conversation_id,
+          error: message,
+        }),
+        target: target.name,
+        error: message,
+        executionStatus: 'execution_error',
+        failureStage: 'setup',
+        failureReasonCode,
+        executionError: { message, stage: 'setup' },
+      };
+
+      if (onProgress) {
+        await onProgress({
+          workerId,
+          testId: evalCase.id,
+          status: 'failed',
+          completedAt: Date.now(),
+          error: result.error,
+          score: result.score,
+          executionStatus: result.executionStatus,
+        });
+      }
+      if (onResult) {
+        await onResult(result);
+      }
+      return result;
+    }
+
     function extractEvaluationCostUsd(result: EvaluationResult): number | undefined {
       if (result.trials && result.trials.length > 0) {
         const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
@@ -1236,14 +1333,20 @@ export async function runEvaluation(
       // Per-case isolated cases and raw/no-workspace cases outside the selected
       // shared owner prepare without inheriting a child suite's workspace.
       const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup);
-      const testPoolSlot =
-        usesSharedWorkspace && availablePoolSlots.length > 0
-          ? availablePoolSlots.pop()
-          : usesSharedWorkspace
-            ? poolSlot
-            : undefined;
+      const pooledWorkspaceRequired = usesSharedWorkspace && poolSlots.length > 0;
+      const testPoolSlot = pooledWorkspaceRequired ? await acquirePoolSlot() : undefined;
+      if (pooledWorkspaceRequired && !testPoolSlot) {
+        return emitSetupErrorResult(
+          evalCase,
+          workerId,
+          'No clean pooled workspace slot is available; prior slot reset failed.',
+          'workspace_pool_unavailable',
+        );
+      }
       const testWorkspacePath = usesSharedWorkspace
-        ? (testPoolSlot?.path ?? sharedWorkspacePath)
+        ? pooledWorkspaceRequired
+          ? testPoolSlot?.path
+          : sharedWorkspacePath
         : undefined;
       const testBaselineCommit = usesSharedWorkspace
         ? testPoolSlot
@@ -1365,7 +1468,6 @@ export async function runEvaluation(
         // the per-slot baseline. Pooling is a local performance optimization,
         // not shared state between eval cases.
         if (testPoolSlot) {
-          const shouldReturnPoolSlot = testPoolSlot !== poolSlot;
           const resetMode = workspaceClean === 'full' ? 'strict' : 'fast';
           let resetSucceeded = true;
           try {
@@ -1375,6 +1477,7 @@ export async function runEvaluation(
             await resetWorkspaceRoot(testPoolSlot.path, resetMode, testBaselineCommit);
           } catch (resetError) {
             resetSucceeded = false;
+            markPoolSlotUnavailable(testPoolSlot);
             if (verbose) {
               const message = resetError instanceof Error ? resetError.message : String(resetError);
               console.warn(
@@ -1382,8 +1485,8 @@ export async function runEvaluation(
               );
             }
           }
-          if (resetSucceeded && shouldReturnPoolSlot) {
-            availablePoolSlots.push(testPoolSlot);
+          if (resetSucceeded) {
+            returnPoolSlot(testPoolSlot);
           }
         }
       }
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index 1473a502a..27383c179 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -773,6 +773,151 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     }
   }, 30_000);
 
+  it('does not reuse a single pooled workspace slot after reset failure', async () => {
+    const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-reset-fail-'));
+    const previousAgentvHome = process.env.AGENTV_HOME;
+    const previousAgentvDataDir = process.env.AGENTV_DATA_DIR;
+    process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
+    process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data');
+
+    try {
+      const sourceRepo = path.join(tempDir, 'source-repo');
+      const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' });
+      const workspace = {
+        repos: [
+          {
+            path: './repo-a',
+            repo: `file://${sourceRepo}`,
+            commit: cleanCommit,
+          },
+        ],
+      };
+      let providerCalls = 0;
+      const provider: Provider = {
+        id: 'mock:single-slot-reset-failure',
+        kind: 'mock' as const,
+        targetName: 'single-slot-reset-failure',
+        async invoke(request: ProviderRequest): Promise<ProviderResponse> {
+          providerCalls += 1;
+          if (!request.cwd) {
+            throw new Error('missing cwd');
+          }
+          const repoDir = path.join(request.cwd, 'repo-a');
+          writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n');
+          rmSync(path.join(repoDir, '.git'), { recursive: true, force: true });
+          return { output: [{ role: 'assistant', content: `response ${providerCalls}` }] };
+        },
+      };
+
+      const results = await runEvaluation({
+        testFilePath: path.join(tempDir, 'eval.yaml'),
+        repoRoot: tempDir,
+        target: { ...baseTarget, name: 'single-slot-reset-failure' },
+        providerFactory: () => provider,
+        evaluators: evaluatorRegistry,
+        workspaceMode: 'pooled',
+        maxConcurrency: 1,
+        evalCases: [
+          { ...baseTestCase, id: 'case-1', workspace },
+          { ...baseTestCase, id: 'case-2', workspace },
+        ],
+      });
+
+      expect(providerCalls).toBe(1);
+      expect(results).toHaveLength(2);
+      expect(results[0].executionStatus).toBe('ok');
+      expect(results[1].executionStatus).toBe('execution_error');
+      expect(results[1].failureReasonCode).toBe('workspace_pool_unavailable');
+      expect(results[1].error).toContain('No clean pooled workspace slot is available');
+    } finally {
+      if (previousAgentvHome === undefined) {
+        process.env.AGENTV_HOME = undefined;
+      } else {
+        process.env.AGENTV_HOME = previousAgentvHome;
+      }
+      if (previousAgentvDataDir === undefined) {
+        process.env.AGENTV_DATA_DIR = undefined;
+      } else {
+        process.env.AGENTV_DATA_DIR = previousAgentvDataDir;
+      }
+      rmSync(tempDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
+  it('fails later pooled workspace cases after all multi-slot resets fail', async () => {
+    const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-pooled-runner-exhausted-'));
+    const previousAgentvHome = process.env.AGENTV_HOME;
+    const previousAgentvDataDir = process.env.AGENTV_DATA_DIR;
+    process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home');
+    process.env.AGENTV_DATA_DIR = path.join(tempDir, 'agentv-data');
+
+    try {
+      const sourceRepo = path.join(tempDir, 'source-repo');
+      const cleanCommit = createTestRepo(sourceRepo, { 'tracked.txt': 'clean\n' });
+      const workspace = {
+        repos: [
+          {
+            path: './repo-a',
+            repo: `file://${sourceRepo}`,
+            commit: cleanCommit,
+          },
+        ],
+      };
+      let providerCalls = 0;
+      const provider: Provider = {
+        id: 'mock:multi-slot-reset-failure',
+        kind: 'mock' as const,
+        targetName: 'multi-slot-reset-failure',
+        async invoke(request: ProviderRequest): Promise<ProviderResponse> {
+          providerCalls += 1;
+          if (!request.cwd) {
+            throw new Error('missing cwd');
+          }
+          const repoDir = path.join(request.cwd, 'repo-a');
+          writeFileSync(path.join(repoDir, 'tracked.txt'), 'dirty\n');
+          rmSync(path.join(repoDir, '.git'), { recursive: true, force: true });
+          return { output: [{ role: 'assistant', content: `response ${providerCalls}` }] };
+        },
+      };
+
+      const results = await runEvaluation({
+        testFilePath: path.join(tempDir, 'eval.yaml'),
+        repoRoot: tempDir,
+        target: { ...baseTarget, name: 'multi-slot-reset-failure' },
+        providerFactory: () => provider,
+        evaluators: evaluatorRegistry,
+        workspaceMode: 'pooled',
+        maxConcurrency: 2,
+        configPoolMaxSlots: 2,
+        evalCases: [
+          { ...baseTestCase, id: 'case-1', workspace },
+          { ...baseTestCase, id: 'case-2', workspace },
+          { ...baseTestCase, id: 'case-3', workspace },
+        ],
+      });
+
+      expect(providerCalls).toBe(2);
+      expect(results).toHaveLength(3);
+      expect(results[0].executionStatus).toBe('ok');
+      expect(results[1].executionStatus).toBe('ok');
+      expect(results[2].executionStatus).toBe('execution_error');
+      expect(results[2].failureReasonCode).toBe('workspace_pool_unavailable');
+      expect(results[2].error).toContain('No clean pooled workspace slot is available');
+    } finally {
+      if (previousAgentvHome === undefined) {
+        process.env.AGENTV_HOME = undefined;
+      } else {
+        process.env.AGENTV_HOME = previousAgentvHome;
+      }
+      if (previousAgentvDataDir === undefined) {
+        process.env.AGENTV_DATA_DIR = undefined;
+      } else {
+        process.env.AGENTV_DATA_DIR = previousAgentvDataDir;
+      }
+      rmSync(tempDir, { recursive: true, force: true });
+    }
+  }, 30_000);
+
   it('applies exponential backoff between retries', async () => {
     const provider = new SequenceProvider('mock', {
       errors: [new Error('Transient failure')],