EntityProcess · christso · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/.agents/verification.md b/.agents/verification.md
@@ -141,12 +141,12 @@ Use live dogfood before marking PRs ready when they affect eval execution, exper
 - Prefer the smallest realistic eval: one or two cases, bounded timeouts, and `workers: 1` for heavyweight agent providers.
 - For artifact/result contract changes, prefer letting AgentV choose the canonical run directory and capture the printed `Artifact workspace written to:` and `Results written to:` paths for evidence. Do not precompute `--output` unless the test specifically needs a fixed path.
 - For native experiment changes, run through `agentv eval run ... --experiment <experiment.yaml|ts>` so resolution, setup, scripts, target selection, run knobs, and artifact metadata are exercised together.
-- For repeat-run changes, use top-level `runs >= 2` when validating repeated attempts. Inspect root `index.jsonl`, root `benchmark.json`, and the repeated case folder. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields plus AgentV aggregate `grading.json`; attempt-specific outputs, transcripts, and metrics live under `run-N/`. Each `run-N/` folder should contain `result.json`, `grading.json`, `metrics.json`, `transcript.jsonl`, `transcript-raw.jsonl`, and `outputs/answer.md` when answer output is available. `result.json` should point at `./grading.json`, `./metrics.json`, `./transcript.jsonl`, and `./transcript-raw.jsonl` through the corresponding path fields.
+- For repeat-run changes, use `evaluate_options.repeat.count >= 2` when validating repeated attempts. Inspect root `index.jsonl`, root `summary.json`, and the repeated case folder. Use `repeat` for authored configuration and `attempts[]` for produced executions. The repeated case folder should carry aggregate `summary.json` with flattened snake_case timing fields; attempt-specific outputs, transcripts, and metrics live under `attempt-N/`. Each `attempt-N/` folder should contain `result.json`, `grading.json`, `metrics.json`, `transcript.json`, `transcript-raw.jsonl`, and `outputs/answer.md` when answer output is available. `result.json` should point at `./grading.json`, `./metrics.json`, `./transcript.json`, and `./transcript-raw.jsonl` through the corresponding path fields.
 - For local OpenAI-compatible grading through the OAuth proxy, use `endpoint: http://127.0.0.1:10531/v1`, but still route `api_key` and `model` through environment references such as `${{ LOCAL_OPENAI_PROXY_API_KEY }}` and `${{ LOCAL_OPENAI_PROXY_MODEL }}`. Literal secrets and literal model values are intentionally rejected by target validation unless a resolver explicitly allows them.
 - For `codex`/Codex SDK live dogfood through the same local proxy, configure the agent target with `provider: codex`, `base_url: ${{ LOCAL_OPENAI_PROXY_BASE_URL }}`, `api_key: ${{ LOCAL_OPENAI_PROXY_API_KEY }}`, `model: ${{ LOCAL_OPENAI_PROXY_MODEL }}`, `api_format: responses`, `grader_target: <local-openai-grader>`, `workers: 1`, and a bounded `timeout_seconds`. Configure the grader target as `provider: openai`, `api_format: chat`, and the same local proxy env references. A minimal run should use `bun apps/cli/src/cli.ts eval run <eval.yaml> --targets <targets.yaml> --target <codex-target> --workers 1`.
 - If the local proxy returns `401 token_expired`, the blocker is stale Codex OAuth, not AgentV target configuration. Refresh from a trusted local terminal with `codex logout`, `codex login --device-auth`, then restart `openai-oauth` and rerun the same eval command.
 - Preserve review evidence in `agentv-private` on an orphan `evidence/<bead-or-feature-slug>` branch. Include the run bundle, source eval/experiment/targets files, a short README, an artifact tree, contract checks, and screenshots when folder structure or UI behavior is under review.
-- If comparing against an external convention such as Vercel `agent-eval`, verify both semantic provenance and the physical `run-N` artifact layout for repeat runs.
+- If comparing against an external convention such as Vercel `agent-eval`, verify both semantic provenance and the physical `attempt-N` artifact layout for repeat runs.
 - For transcript/result artifact contract changes, try the same provider spread before merging: `pi-cli`, `codex-sdk`, and `copilot-sdk` through the local OpenAI-compatible endpoint when available. If a provider cannot run live, record the exact blocker, the run bundle or command output, and whether coverage moved to fixture/regression tests.
 - If dogfood or review changes the durable verification playbook, update this file or `AGENTS.md` in the same PR. Use `docs/solutions/` for longer reusable lessons rather than relying on PR comments or private evidence as the only source.
 

diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
@@ -8,9 +8,9 @@
  *     index.jsonl              — per-test manifest with artifact pointers
  *     <test-id>/
  *       summary.json           — per-case aggregate
- *       run-1/result.json      — per-run result
- *       run-1/grading.json     — per-run grading artifact (assertions, graders)
- *       run-1/metrics.json     — per-run metrics artifact
+ *       attempt-1/result.json  — per-attempt result
+ *       attempt-1/grading.json — per-attempt grading artifact (assertions, graders)
+ *       attempt-1/metrics.json — per-attempt metrics artifact
  *
  * This module delegates artifact building to the shared artifact-writer so
  * that summary/grading/timing schemas stay aligned with `agentv eval`.

diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
@@ -34,8 +34,17 @@ export interface ResultManifestRecord {
   readonly variant?: string;
   readonly score: number;
   readonly scores?: readonly Record<string, unknown>[];
+  readonly attempts?: readonly {
+    readonly attempt?: number;
+    readonly attempt_path?: string;
+    readonly run_path?: string;
+    readonly score?: number;
+    readonly verdict?: string;
+    readonly [key: string]: unknown;
+  }[];
   readonly trials?: readonly {
     readonly attempt?: number;
+    readonly attempt_path?: string;
     readonly run_path?: string;
     readonly score?: number;
     readonly verdict?: string;

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -805,8 +805,9 @@ function addTrialRunCatalogEntries(
     ? normalizeArtifactRelativePath(record.result_dir)
     : undefined;
   if (!resultDir) return;
-  for (const trial of record.trials ?? []) {
-    const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined;
+  for (const trial of record.attempts ?? record.trials ?? []) {
+    const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path;
+    const runPath = rawPath ? normalizeArtifactRelativePath(rawPath) : undefined;
     if (!runPath) continue;
     const runDir = path.posix.join(resultDir, runPath);
     addDirectArtifactCatalogEntry(
@@ -1145,13 +1146,15 @@ function buildRepeatTrialReadModels(
   baseDir: string,
   record: ResultManifestRecord,
 ): Array<Record<string, unknown>> | undefined {
-  if (!record.trials || record.trials.length === 0) return undefined;
+  const attempts = record.attempts ?? record.trials;
+  if (!attempts || attempts.length === 0) return undefined;
   const resultDir = record.result_dir
     ? normalizeArtifactRelativePath(record.result_dir)
     : undefined;
 
-  return record.trials.map((trial) => {
-    const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined;
+  return attempts.map((trial) => {
+    const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path;
+    const runPath = rawPath ? normalizeArtifactRelativePath(rawPath) : undefined;
     const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json');
     const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json');
     const gradingPath = caseTrialArtifactPath(resultDir, runPath, 'grading.json');
@@ -1202,7 +1205,7 @@ function attachRunDetailReadModelFields<T extends Record<string, unknown>>(
   return results.map((result, index) => {
     const record = records[index];
     if (!record) return result;
-    const trials = buildRepeatTrialReadModels(baseDir, record);
+    const attempts = buildRepeatTrialReadModels(baseDir, record);
     return {
       ...result,
       ...(record.aggregation && { aggregation: record.aggregation }),
@@ -1217,7 +1220,7 @@ function attachRunDetailReadModelFields<T extends Record<string, unknown>>(
       ...(record.transcript_raw_path && { transcript_raw_path: record.transcript_raw_path }),
       ...(record.output_path && { output_path: record.output_path }),
       ...(record.answer_path && { answer_path: record.answer_path }),
-      ...(trials && { trials }),
+      ...(attempts && { attempts }),
     };
   });
 }

diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts
@@ -40,7 +40,8 @@ interface IndexEntry {
   readonly grading_path?: string;
   readonly timing_path?: string;
   readonly result_dir?: string;
-  readonly trials?: readonly { readonly run_path?: string }[];
+  readonly attempts?: readonly { readonly attempt_path?: string; readonly run_path?: string }[];
+  readonly trials?: readonly { readonly attempt_path?: string; readonly run_path?: string }[];
   readonly [key: string]: unknown;
 }
 
@@ -242,23 +243,24 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
       }
     }
 
-    for (const trial of entry.trials ?? []) {
-      if (!entry.result_dir || !trial.run_path) {
+    for (const attempt of entry.attempts ?? entry.trials ?? []) {
+      const attemptPath = attempt.attempt_path ?? attempt.run_path;
+      if (!entry.result_dir || !attemptPath) {
         continue;
       }
-      const runDirPath = path.join(runDir, entry.result_dir, trial.run_path);
+      const runDirPath = path.join(runDir, entry.result_dir, attemptPath);
       const resultPath = path.join(runDirPath, 'result.json');
       const gradingPath = path.join(runDirPath, 'grading.json');
       if (!existsSync(resultPath)) {
         diagnostics.push({
           severity: 'error',
-          message: `${testId}: result.json not found at '${path.posix.join(entry.result_dir, trial.run_path, 'result.json')}'`,
+          message: `${testId}: result.json not found at '${path.posix.join(entry.result_dir, attemptPath, 'result.json')}'`,
         });
       }
       if (!existsSync(gradingPath)) {
         diagnostics.push({
           severity: 'error',
-          message: `${testId}: grading.json not found at '${path.posix.join(entry.result_dir, trial.run_path, 'grading.json')}'`,
+          message: `${testId}: grading.json not found at '${path.posix.join(entry.result_dir, attemptPath, 'grading.json')}'`,
         });
       }
     }

diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts
@@ -289,17 +289,17 @@ describe('writePerTestArtifacts', () => {
     await writePerTestArtifacts(results, tmpDir);
 
     const grading1 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'),
+      readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'grading.json'), 'utf8'),
     );
     expect(grading1.assertions).toHaveLength(1);
 
     const timing1 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'),
+      readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'timing.json'), 'utf8'),
     );
     expect(timing1.total_tokens).toBeGreaterThanOrEqual(0);
 
     const grading2 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'),
+      readFileSync(rowRunPath(tmpDir, 'test-2', 'attempt-1', 'grading.json'), 'utf8'),
     );
     expect(grading2.assertions).toHaveLength(1);
   });
@@ -310,7 +310,7 @@ describe('writePerTestArtifacts', () => {
     await writePerTestArtifacts(results, tmpDir);
 
     const answer = readFileSync(
-      rowRunPath(tmpDir, 'test-1', 'run-1', 'outputs', 'answer.md'),
+      rowRunPath(tmpDir, 'test-1', 'attempt-1', 'outputs', 'answer.md'),
       'utf8',
     );
     expect(answer).toContain('hello');