EntityProcess · christso · Jul 2, 2026 · Jul 2, 2026
diff --git a/apps/cli/src/commands/prepare/index.ts b/apps/cli/src/commands/prepare/index.ts
@@ -8,6 +8,7 @@ import path from 'node:path';
 
 import {
   type EvalTargetRef,
+  type JsonObject,
   type PreparedEvalWorkspace,
   type PreparedWorkspaceRepoPin,
   type ResolvedTarget,
@@ -51,6 +52,8 @@ interface PrepareResult {
   readonly manifestPath: string;
   readonly setupStatus: 'ok';
   readonly setupSteps: readonly SetupStep[];
+  readonly providerContext?: JsonObject;
+  readonly metadata?: Record<string, unknown>;
   readonly repoPins: readonly RepoPin[];
   readonly baseline: PreparedEvalWorkspace['baseline'];
   readonly createdAt: string;
@@ -65,6 +68,8 @@ interface PrepareManifestWire {
   readonly prompt_path: string;
   readonly setup_status: 'ok';
   readonly setup_steps: readonly SetupStepWire[];
+  readonly provider_context?: JsonObject;
+  readonly metadata?: Record<string, unknown>;
   readonly repo_pins: readonly RepoPinWire[];
   readonly baseline: BaselineWire;
   readonly created_at: string;
@@ -130,6 +135,37 @@ function toRepoPins(pins: readonly PreparedWorkspaceRepoPin[]): readonly RepoPin
   }));
 }
 
+/**
+ * Recursively rewrites absolute paths located inside `sourceWorkspacePath` so
+ * they point at the same relative location under `targetWorkspacePath`.
+ *
+ * Strings are rewritten only when they are absolute paths equal to, or nested
+ * under, the source workspace. Arrays and objects are rebuilt with every
+ * element/property value remapped; any other value is returned unchanged.
+ *
+ * @param value - String, array, object, or primitive to remap.
+ * @param sourceWorkspacePath - Workspace root the paths currently point into.
+ * @param targetWorkspacePath - Workspace root the paths should point into.
+ * @returns A remapped copy of `value`, or `value` itself when nothing applies.
+ */
+function remapWorkspacePaths<T>(
+  value: T,
+  sourceWorkspacePath: string,
+  targetWorkspacePath: string,
+): T {
+  if (typeof value === 'string') {
+    // path.relative() resolves non-absolute input against process.cwd(), so a
+    // plain string such as an id or label could be mistaken for a workspace
+    // path whenever cwd happens to sit inside the source workspace.
+    if (!path.isAbsolute(value)) {
+      return value;
+    }
+    const relativePath = path.relative(sourceWorkspacePath, value);
+    // Match a whole leading `..` segment so entries named like `..cache` still
+    // count as inside the workspace. An absolute result means no relative
+    // route from the source root exists.
+    const isOutsideSource =
+      relativePath === '..' ||
+      relativePath.startsWith(`..${path.sep}`) ||
+      path.isAbsolute(relativePath);
+    if (isOutsideSource) {
+      return value;
+    }
+    // An empty relative path means `value` is the source root itself.
+    return path.join(targetWorkspacePath, relativePath) as T;
+  }
+  if (Array.isArray(value)) {
+    return value.map((item) =>
+      remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath),
+    ) as T;
+  }
+  if (value && typeof value === 'object') {
+    return Object.fromEntries(
+      Object.entries(value).map(([key, item]) => [
+        key,
+        remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath),
+      ]),
+    ) as T;
+  }
+  return value;
+}
+
 async function moveDirectory(sourcePath: string, destinationPath: string): Promise<void> {
   try {
     await rename(sourcePath, destinationPath);
@@ -200,6 +236,8 @@ function toManifestWire(result: PrepareResult): PrepareManifestWire {
       status: step.status,
       ...(step.message !== undefined && { message: step.message }),
     })),
+    ...(result.providerContext !== undefined && { provider_context: result.providerContext }),
+    ...(result.metadata !== undefined && { metadata: result.metadata }),
     repo_pins: result.repoPins.map((pin) => ({
       ...(pin.path !== undefined && { path: pin.path }),
       ...(pin.repo !== undefined && { repo: pin.repo }),
@@ -322,6 +360,16 @@ async function prepareAttempt(options: {
     manifestPath,
     setupStatus: 'ok',
     setupSteps: setupStepsFromPrepared(prepared),
+    ...(prepared.providerContext !== undefined && {
+      providerContext: remapWorkspacePaths(
+        prepared.providerContext,
+        prepared.workspacePath,
+        workspacePath,
+      ),
+    }),
+    ...(prepared.metadata !== undefined && {
+      metadata: remapWorkspacePaths(prepared.metadata, prepared.workspacePath, workspacePath),
+    }),
     repoPins: toRepoPins(prepared.repoPins),
     baseline: prepared.baseline,
     createdAt: prepared.createdAt,

diff --git a/apps/cli/test/commands/prepare/prepare.test.ts b/apps/cli/test/commands/prepare/prepare.test.ts
@@ -238,4 +238,75 @@ describe('agentv prepare', () => {
     expect(typeof output.baseline.commit).toBe('string');
     expect(Object.keys(output)).not.toContain('workspacePath');
   });
+
+  // End-to-end check: paths recorded by the `agentv:agent-rules` extension must
+  // point into the `--out` workspace in both `provider_context` and `metadata`.
+  it('remaps prepared extension context paths into the output workspace', async () => {
+    const outDir = path.join(tempDir, 'prepared-extension-context');
+    const evalPath = path.join(tempDir, 'evals', 'suite.eval.yaml');
+    const targetsPath = path.join(tempDir, '.agentv', 'targets.yaml');
+
+    const targetsYaml = `
+targets:
+  - name: codex
+    provider: cli
+    command: bun ./scripts/target.ts
+`;
+    const evalYaml = `
+extensions:
+  - id: agentv:agent-rules
+    hook: beforeAll
+    rules: ../rules/AGENTS.md
+workspace:
+  template: ../template
+tests:
+  - id: case-1
+    input: "Fix the workspace file."
+    criteria: "Works"
+`;
+
+    // Create every fixture directory before writing any fixture file.
+    for (const dirName of ['evals', 'template', 'rules', 'scripts', '.agentv']) {
+      await mkdir(path.join(tempDir, dirName), { recursive: true });
+    }
+    await writeFile(path.join(tempDir, 'template', 'app.txt'), 'initial\n', 'utf8');
+    await writeFile(path.join(tempDir, 'rules', 'AGENTS.md'), '# Rules\n', 'utf8');
+    await writeFile(path.join(tempDir, 'scripts', 'target.ts'), '', 'utf8');
+    await writeFile(targetsPath, targetsYaml, 'utf8');
+    await writeFile(evalPath, evalYaml, 'utf8');
+
+    const cliArgs = [
+      '--no-env-file',
+      CLI_ENTRY,
+      'prepare',
+      evalPath,
+      '--test-id',
+      'case-1',
+      '--target',
+      'codex',
+      '--out',
+      outDir,
+    ];
+    const cliEnv = {
+      AGENTV_HOME: path.join(tempDir, '.agentv-home'),
+      AGENTV_NO_UPDATE_CHECK: '1',
+    };
+    await execa('bun', cliArgs, { cwd: tempDir, env: cliEnv });
+
+    const manifestPath = path.join(outDir, 'agentv_prepare.json');
+    const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
+    const workspacePath = path.join(outDir, 'workspace');
+    const rulesPath = manifest.provider_context.agent_rules_paths.rules[0];
+
+    expect(rulesPath).toStartWith(workspacePath);
+    expect(await exists(rulesPath)).toBe(true);
+    expect(manifest.metadata.agent_rules_paths.rules[0]).toBe(rulesPath);
+  });
 });
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -5,7 +5,7 @@ sidebar:
   order: 1
 ---
 
-Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; Docker/container binding belongs under `workspace.docker`. Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL.
+Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; repository provenance belongs under `workspace.repos`; Docker/container binding belongs under `workspace.docker`. Non-provisioning setup commands belong in top-level `extensions`; reset policy stays under `workspace.hooks.after_each.reset`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL.
 
 YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract.
 Eval files describe the task, target binding, and run controls. Use `evaluate_options.max_concurrency` for authored suite concurrency. Operators can still override concurrency with `--workers` or set defaults with `execution.workers` in `agentv.config.*` / `.agentv/config.yaml`; do not author legacy `workers` fields in eval YAML.
@@ -122,20 +122,58 @@ tests:
 | `evaluate_options` | Optional evaluation runtime options such as `budget_usd` and `max_concurrency` |
 | `threshold` | Optional suite quality threshold |
 | `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). |
+| `extensions` | Promptfoo-style lifecycle hooks. File hooks use `file://path/to/hooks.mjs:<hook>`, where `<hook>` is `beforeAll`, `beforeEach`, `afterEach`, or `afterAll`; `agentv:agent-rules` is the built-in extension. Hooks run after `workspace.repos` materializes. |
 | `imports` | Optional import groups. `imports.suites` imports full child eval suites with their task context. `imports.tests` imports raw test rows into this file's context. Import entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. |
 | `tests` | Inline raw tests or a string path to an external raw-case file or directory. Legacy `tests[].include` entries still load with a migration warning; prefer `imports.suites` or `imports.tests`. |
 | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test |
 | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test |
 
 `workspace` is what the agent can inspect or modify through tools, not prompt
-input. Put instructions in `input`; put repos, templates, and lifecycle setup in
-`workspace`.
+input. Put instructions in `input`; put repos, templates, Docker config, env
+checks, isolation, and repo provenance in `workspace`. Put lifecycle setup that
+does not acquire repos in `extensions`.
 
 For historical or repo-state evals, put the checkout under
 `workspace.repos[].commit` or `workspace.repos[].base_commit`. A commit SHA in
 the prompt or metadata is useful context, but it does not materialize a repo for
 the agent to inspect.
 
+### Lifecycle Extensions
+
+`extensions` uses Promptfoo-compatible lifecycle names. File hooks are local
+JavaScript or TypeScript modules resolved relative to the eval file:
+
+```yaml
+extensions:
+  - file://scripts/setup.mjs:beforeAll
+  - file://scripts/setup.mjs:beforeEach
+  - file://scripts/setup.mjs:afterEach
+  - file://scripts/setup.mjs:afterAll
+```
+
+Each exported function receives a context object with snake_case keys such as
+`workspace_path`, `test_id`, `eval_run_id`, `case_input`, and `case_metadata`.
+Setup hook failures (`beforeAll`, `beforeEach`) fail the affected run; teardown
+hook failures (`afterEach`, `afterAll`) are non-fatal.
+
+`agentv:agent-rules` is currently the only built-in extension. It runs after
+workspace materialization and exposes staged rule paths to providers and result
+metadata as `agent_rules_paths`:
+
+```yaml
+extensions:
+  - id: agentv:agent-rules
+    hook: beforeAll
+    skills: agent-rules/skills
+    hooks: agent-rules/hooks
+    agents: agent-rules/agents
+    rules: agent-rules/AGENTS.md
+```
+
+If `agentv:agent-rules` is authored as a string, it defaults to `beforeAll` and
+discovers conventional rule locations already present in the materialized
+workspace. It does not clone repositories or replace `workspace.repos`.
+
 ### Metadata Fields
 
 You can add structured metadata to your eval file using these optional top-level fields. Metadata is parsed when the `name` field is present:

diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -189,7 +189,7 @@ Scoped `run:` supports `threshold`, `repeat`, `timeout_seconds`, and legacy
 per-case `budget_usd` overrides. Parent suite budgets should use
 `evaluate_options.budget_usd` for public eval authoring. Use
 `evaluate_options.max_concurrency` for authored concurrency. Candidate-changing fields stay
-parent-level. Workspace mutation belongs in `workspace.hooks`, and
+parent-level. Executable workspace setup belongs in top-level lifecycle extensions, and
 provider-specific setup belongs in target configuration.
 
 ## Lifecycle Ownership
@@ -199,19 +199,18 @@ target-specific runner state.
 
 | Need | Put it in |
 | --- | --- |
-| Install dependencies, build the repo, seed files | `workspace.hooks.before_all` |
-| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` |
+| Install dependencies, build the repo, seed files | `extensions: ["file://scripts/setup.mjs:beforeAll"]` |
+| Apply per-case state | `extensions: ["file://scripts/setup.mjs:beforeEach"]` |
+| Reset file state after each case | `workspace.hooks.after_each.reset` |
 | Configure an agent runner or provider variant | `target` object or `targets.yaml` |
 | Choose the target | top-level `target` |
 | Override the target's default model | `target.model` |
 | Configure repeat policy, budget, concurrency, timeout, threshold | top-level `repeat`, `evaluate_options.budget_usd`, `evaluate_options.max_concurrency`, `timeout_seconds`, `threshold` |
 | Bind an existing local workspace directory | `--workspace-path` or `.agentv/config.local.yaml` |
 
 ```yaml
-workspace:
-  hooks:
-    before_all:
-      command: ["bash", "-lc", "bun install && bun run build"]
+extensions:
+  - file://scripts/build.mjs:beforeAll
 
 target:
   extends: codex-gpt5

diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -326,14 +326,16 @@ agentv eval evals/my-eval.yaml --workspace-clean full
 agentv eval evals/my-eval.yaml --retain-on-success cleanup --retain-on-failure keep
 ```
 
-Portable eval YAML keeps workspace intent under templates, repos, hooks, env,
-Docker, and folder isolation:
+Portable eval YAML keeps workspace intent under templates, repos, env, Docker,
+and folder isolation. Use top-level extensions for executable setup:
 
 ```yaml
+extensions:
+  - file://scripts/setup.mjs:beforeAll
+
 workspace:
   isolation: shared      # shared | per_case
   hooks:
-    enabled: true        # set false to skip all hooks
     after_each:
       reset: fast        # none | fast | strict
 ```
@@ -343,7 +345,7 @@ Notes:
 - Pooled mode is an explicit machine-local optimization.
 - `--workspace-path` uses an existing machine-local directory as-is and implies static runtime mode.
 - Runtime static mode is incompatible with `isolation: per_case`.
-- `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset).
+- `workspace.hooks.after_each.reset` resets file state after each case.
 - Pool slots are managed separately (`agentv workspace list|clean`).
 
 ### Resume an Interrupted Run

diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
@@ -27,24 +27,25 @@ Use this split when deciding where a benchmark key belongs:
 |------------|--------------|------------------|
 | `workspace.repos[]` | Yes | Declares repo identity and checkout refs; AgentV resolves acquisition and materializes the checkout. |
 | `workspace.template` | Yes | Copies a workspace template into the run workspace. |
-| `workspace.hooks` | Yes | Runs lifecycle commands with workspace and case context on stdin. |
+| `extensions` | Yes | Runs Promptfoo-style lifecycle setup after `workspace.template` and `workspace.repos` materialize. |
+| `workspace.hooks.after_each.reset` | Yes | Controls workspace reset policy after each case. |
 | `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. |
 | `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, and default grader behavior. Concurrency is an operator/run setting from `--workers` or project config. |
 | `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. |
 | `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. |
 | Top-level `name`, `version`, `tags`, `license`, `requires` | Informational | Identifies and categorizes the suite. |
-| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and hook stdin; in-process custom assertions can also read it. |
+| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and extension context; in-process custom assertions can also read it. |
 
-`metadata` can still become operational inside your own hook scripts. For
-example, a `before_each` hook can read `case_metadata.test_patch` and apply that
+`metadata` can still become operational inside your own lifecycle extensions. For
+example, a `beforeEach` extension can read `case_metadata.test_patch` and apply that
 patch before the agent starts. The distinction is that AgentV itself only passes
-the metadata along; the script owns the behavior.
+the metadata along; the extension owns the behavior.
 
-## Hook Payloads
+## Extension Context
 
-Lifecycle hooks receive JSON on stdin. Case-scoped hooks such as per-test
-`before_all`, `before_each`, and `after_each` receive the current test's
-metadata as `case_metadata`:
+File lifecycle extensions export functions named `beforeAll`, `beforeEach`,
+`afterEach`, or `afterAll`. AgentV calls each function with context including
+the current test's metadata as `case_metadata`:
 
 ```json
 {
@@ -59,9 +60,9 @@ metadata as `case_metadata`:
 }
 ```
 
-Suite-level `before_all` hooks run once for the workspace, before any one test is
-selected, so they should do suite setup only. Use `before_each` when setup depends
-on per-case metadata such as a patch path, source row, or selected test list.
+`beforeAll` runs once for the shared workspace after repo materialization, so it
+should do suite setup only. Use `beforeEach` when setup depends on per-case
+metadata such as a patch path, source row, or selected test list.
 
 ## Task Artifact Anatomy
 
@@ -71,7 +72,7 @@ Benchmark task packs map cleanly onto AgentV fields at authoring time:
 |---------------|----------------|
 | Prompt or instruction | `input`, usually with `type: file` blocks for long prompts |
 | Source checkout | `workspace.repos[].repo` and `workspace.repos[].commit` |
-| Per-case setup | `workspace.hooks.before_each` reading `case_metadata` |
+| Per-case setup | `extensions: ["file://scripts/setup.mjs:beforeEach"]` reading `case_metadata` |
 | Gold answer | `expected_output` when the answer is passive reference data |
 | Active verification | `assertions`, especially `code-grader` for commands or artifact checks |
 | Provenance | `tests[].metadata` with source pins, generator rows, and curation labels |
@@ -104,12 +105,12 @@ workspace:
       repo: https://github.com/example/widget.git
       commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123
   hooks:
-    before_each:
-      command: ["python", "./scripts/apply-test-patch.py"]
-      timeout_ms: 120000
     after_each:
       reset: strict
 
+extensions:
+  - file://scripts/apply-test-patch.mjs:beforeEach
+
 assertions:
   - name: focused-tests
     type: code-grader
@@ -133,7 +134,7 @@ tests:
 
 In this example, `workspace.repos[].commit` is the actual checkout. The
 matching `metadata.source_commit` is audit data that gets recorded with the case
-and is available to scripts. `apply-test-patch.py` can read
+and is available to extensions. `apply-test-patch.mjs` can read
 `case_metadata.test_patch` and `case_metadata.fail_to_pass_tests`, then apply
 the patch and write the selected test list into the workspace. The code grader
 can read that workspace file through its `workspace_path` payload. Repo
@@ -158,9 +159,9 @@ workspace:
     - path: ./repo
       repo: https://github.com/example/widget.git
       commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123
-  hooks:
-    before_each:
-      command: ["python", "./scripts/apply-case-fixtures.py"]
+
+extensions:
+  - file://scripts/apply-case-fixtures.mjs:beforeEach
 
 target: codex