Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions apps/cli/src/commands/prepare/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import path from 'node:path';

import {
type EvalTargetRef,
type JsonObject,
type PreparedEvalWorkspace,
type PreparedWorkspaceRepoPin,
type ResolvedTarget,
Expand Down Expand Up @@ -51,6 +52,8 @@ interface PrepareResult {
readonly manifestPath: string;
readonly setupStatus: 'ok';
readonly setupSteps: readonly SetupStep[];
readonly providerContext?: JsonObject;
readonly metadata?: Record<string, unknown>;
readonly repoPins: readonly RepoPin[];
readonly baseline: PreparedEvalWorkspace['baseline'];
readonly createdAt: string;
Expand All @@ -65,6 +68,8 @@ interface PrepareManifestWire {
readonly prompt_path: string;
readonly setup_status: 'ok';
readonly setup_steps: readonly SetupStepWire[];
readonly provider_context?: JsonObject;
readonly metadata?: Record<string, unknown>;
readonly repo_pins: readonly RepoPinWire[];
readonly baseline: BaselineWire;
readonly created_at: string;
Expand Down Expand Up @@ -130,6 +135,37 @@ function toRepoPins(pins: readonly PreparedWorkspaceRepoPin[]): readonly RepoPin
}));
}

/**
 * Recursively rewrites absolute paths that live inside `sourceWorkspacePath`
 * so they point at the same relative location under `targetWorkspacePath`.
 *
 * - Strings are remapped only when they are absolute paths equal to, or nested
 *   under, the source workspace; all other strings are returned unchanged.
 * - Arrays and objects are walked recursively and rebuilt; object keys are
 *   never rewritten.
 * - Every other value (numbers, booleans, `null`, `undefined`) is returned as-is.
 *
 * @param value - JSON-like value to walk.
 * @param sourceWorkspacePath - Workspace directory the paths currently reference.
 * @param targetWorkspacePath - Workspace directory the paths should reference.
 * @returns A copy of `value` with in-workspace paths rewritten.
 */
function remapWorkspacePaths<T>(
  value: T,
  sourceWorkspacePath: string,
  targetWorkspacePath: string,
): T {
  if (typeof value === 'string') {
    // path.relative() resolves non-absolute input against process.cwd(), so a
    // plain string such as "beforeAll" would be mistaken for a workspace path
    // whenever the process runs from inside the source workspace.
    if (!path.isAbsolute(value)) {
      return value;
    }

    const relativePath = path.relative(sourceWorkspacePath, value);
    // Only a leading ".." *segment* leaves the workspace; a name that merely
    // begins with two dots (e.g. "..cache") is still inside it. An absolute
    // result means the two paths share no root (different Windows drives).
    const escapesWorkspace =
      relativePath === '..' ||
      relativePath.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relativePath);
    if (escapesWorkspace) {
      return value;
    }

    // An empty relative path means `value` is the workspace root itself.
    return path.join(targetWorkspacePath, relativePath) as T;
  }

  if (Array.isArray(value)) {
    return value.map((item) =>
      remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath),
    ) as T;
  }

  if (value && typeof value === 'object') {
    return Object.fromEntries(
      Object.entries(value).map(([key, item]) => [
        key,
        remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath),
      ]),
    ) as T;
  }

  return value;
}

async function moveDirectory(sourcePath: string, destinationPath: string): Promise<void> {
try {
await rename(sourcePath, destinationPath);
Expand Down Expand Up @@ -200,6 +236,8 @@ function toManifestWire(result: PrepareResult): PrepareManifestWire {
status: step.status,
...(step.message !== undefined && { message: step.message }),
})),
...(result.providerContext !== undefined && { provider_context: result.providerContext }),
...(result.metadata !== undefined && { metadata: result.metadata }),
repo_pins: result.repoPins.map((pin) => ({
...(pin.path !== undefined && { path: pin.path }),
...(pin.repo !== undefined && { repo: pin.repo }),
Expand Down Expand Up @@ -322,6 +360,16 @@ async function prepareAttempt(options: {
manifestPath,
setupStatus: 'ok',
setupSteps: setupStepsFromPrepared(prepared),
...(prepared.providerContext !== undefined && {
providerContext: remapWorkspacePaths(
prepared.providerContext,
prepared.workspacePath,
workspacePath,
),
}),
...(prepared.metadata !== undefined && {
metadata: remapWorkspacePaths(prepared.metadata, prepared.workspacePath, workspacePath),
}),
repoPins: toRepoPins(prepared.repoPins),
baseline: prepared.baseline,
createdAt: prepared.createdAt,
Expand Down
71 changes: 71 additions & 0 deletions apps/cli/test/commands/prepare/prepare.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,4 +238,75 @@ describe('agentv prepare', () => {
expect(typeof output.baseline.commit).toBe('string');
expect(Object.keys(output)).not.toContain('workspacePath');
});

// End-to-end check: runs the CLI `prepare` command against an eval that uses the
// `agentv:agent-rules` extension, then verifies that the rule paths recorded in
// the written manifest point into the `--out` workspace rather than elsewhere.
it('remaps prepared extension context paths into the output workspace', async () => {
const evalPath = path.join(tempDir, 'evals', 'suite.eval.yaml');
const outDir = path.join(tempDir, 'prepared-extension-context');

// Fixture layout under tempDir: eval file, workspace template, rules file,
// an (empty) target script, and a targets.yaml declaring the `codex` target.
await mkdir(path.join(tempDir, 'evals'), { recursive: true });
await mkdir(path.join(tempDir, 'template'), { recursive: true });
await mkdir(path.join(tempDir, 'rules'), { recursive: true });
await mkdir(path.join(tempDir, 'scripts'), { recursive: true });
await mkdir(path.join(tempDir, '.agentv'), { recursive: true });
await writeFile(path.join(tempDir, 'template', 'app.txt'), 'initial\n', 'utf8');
await writeFile(path.join(tempDir, 'rules', 'AGENTS.md'), '# Rules\n', 'utf8');
await writeFile(path.join(tempDir, 'scripts', 'target.ts'), '', 'utf8');
await writeFile(
path.join(tempDir, '.agentv', 'targets.yaml'),
`
targets:
- name: codex
provider: cli
command: bun ./scripts/target.ts
`,
'utf8',
);
// The eval references the rules file and the template relative to the eval file.
await writeFile(
evalPath,
`
extensions:
- id: agentv:agent-rules
hook: beforeAll
rules: ../rules/AGENTS.md
workspace:
template: ../template
tests:
- id: case-1
input: "Fix the workspace file."
criteria: "Works"
`,
'utf8',
);

// Invoke the CLI entry point through bun with an isolated AGENTV_HOME and the
// update check disabled; execa rejects if the command exits non-zero.
await execa(
'bun',
[
'--no-env-file',
CLI_ENTRY,
'prepare',
evalPath,
'--test-id',
'case-1',
'--target',
'codex',
'--out',
outDir,
],
{
cwd: tempDir,
env: {
AGENTV_HOME: path.join(tempDir, '.agentv-home'),
AGENTV_NO_UPDATE_CHECK: '1',
},
},
);

// Read the manifest written into the output directory and take the first rules path.
const workspacePath = path.join(outDir, 'workspace');
const manifest = JSON.parse(await readFile(path.join(outDir, 'agentv_prepare.json'), 'utf8'));
const rulesPath = manifest.provider_context.agent_rules_paths.rules[0];

// The path must sit under the output workspace, must pass the `exists` helper
// (defined outside this view), and must match the copy recorded in `metadata`.
expect(rulesPath).toStartWith(workspacePath);
expect(await exists(rulesPath)).toBe(true);
expect(manifest.metadata.agent_rules_paths.rules[0]).toBe(rulesPath);
});
});
44 changes: 41 additions & 3 deletions apps/web/src/content/docs/docs/evaluation/eval-files.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ sidebar:
order: 1
---

Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; Docker/container binding belongs under `workspace.docker`. Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL.
Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; repository provenance belongs under `workspace.repos`; Docker/container binding belongs under `workspace.docker`. Non-provisioning setup commands belong in top-level `extensions`; reset policy stays under `workspace.hooks.after_each.reset`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL.

YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract.
Eval files describe the task, target binding, and run controls. Use `evaluate_options.max_concurrency` for authored suite concurrency. Operators can still override concurrency with `--workers` or set defaults with `execution.workers` in `agentv.config.*` / `.agentv/config.yaml`; do not author legacy `workers` fields in eval YAML.
Expand Down Expand Up @@ -122,20 +122,58 @@ tests:
| `evaluate_options` | Optional evaluation runtime options such as `budget_usd` and `max_concurrency` |
| `threshold` | Optional suite quality threshold |
| `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). |
| `extensions` | Promptfoo-style lifecycle hooks: `file://path/to/hooks.mjs:beforeAll`, `beforeEach`, `afterEach`, `afterAll`, plus the built-in `agentv:agent-rules`. Hooks run after `workspace.repos` materializes. |
| `imports` | Optional import groups. `imports.suites` imports full child eval suites with their task context. `imports.tests` imports raw test rows into this file's context. Import entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. |
| `tests` | Inline raw tests or a string path to an external raw-case file or directory. Legacy `tests[].include` entries still load with a migration warning; prefer `imports.suites` or `imports.tests`. |
| `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test |
| `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test |

`workspace` is what the agent can inspect or modify through tools, not prompt
input. Put instructions in `input`; put repos, templates, and lifecycle setup in
`workspace`.
input. Put instructions in `input`; put repos, templates, Docker config, env
checks, isolation, and repo provenance in `workspace`. Put lifecycle setup that
does not acquire repos in `extensions`.

For historical or repo-state evals, put the checkout under
`workspace.repos[].commit` or `workspace.repos[].base_commit`. A commit SHA in
the prompt or metadata is useful context, but it does not materialize a repo for
the agent to inspect.

### Lifecycle Extensions

`extensions` uses Promptfoo-compatible lifecycle names. File hooks are local
JavaScript or TypeScript modules resolved relative to the eval file:

```yaml
extensions:
- file://scripts/setup.mjs:beforeAll
- file://scripts/setup.mjs:beforeEach
- file://scripts/setup.mjs:afterEach
- file://scripts/setup.mjs:afterAll
```

Each exported function receives a context object with snake_case keys such as
`workspace_path`, `test_id`, `eval_run_id`, `case_input`, and `case_metadata`.
Setup hook failures (`beforeAll`, `beforeEach`) fail the affected run; teardown
hook failures (`afterEach`, `afterAll`) are non-fatal.

`agentv:agent-rules` is currently the only built-in extension. It runs after
workspace materialization and exposes staged rule paths to providers and result
metadata as `agent_rules_paths`:

```yaml
extensions:
- id: agentv:agent-rules
hook: beforeAll
skills: agent-rules/skills
hooks: agent-rules/hooks
agents: agent-rules/agents
rules: agent-rules/AGENTS.md
```

If `agentv:agent-rules` is authored as a string, it defaults to `beforeAll` and
discovers conventional rule locations already present in the materialized
workspace. It does not clone repositories or replace `workspace.repos`.

### Metadata Fields

You can add structured metadata to your eval file using these optional top-level fields. Metadata is parsed when the `name` field is present:
Expand Down
13 changes: 6 additions & 7 deletions apps/web/src/content/docs/docs/evaluation/experiments.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ Scoped `run:` supports `threshold`, `repeat`, `timeout_seconds`, and legacy
per-case `budget_usd` overrides. Parent suite budgets should use
`evaluate_options.budget_usd` for public eval authoring. Use
`evaluate_options.max_concurrency` for authored concurrency. Candidate-changing fields stay
parent-level. Workspace mutation belongs in `workspace.hooks`, and
parent-level. Executable workspace setup belongs in top-level lifecycle extensions, and
provider-specific setup belongs in target configuration.

## Lifecycle Ownership
Expand All @@ -199,19 +199,18 @@ target-specific runner state.

| Need | Put it in |
| --- | --- |
| Install dependencies, build the repo, seed files | `workspace.hooks.before_all` |
| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` |
| Install dependencies, build the repo, seed files | `extensions: ["file://scripts/setup.mjs:beforeAll"]` |
| Apply per-case state | `extensions: ["file://scripts/setup.mjs:beforeEach"]` |
| Reset file state after each case | `workspace.hooks.after_each.reset` |
| Configure an agent runner or provider variant | `target` object or `targets.yaml` |
| Choose the target | top-level `target` |
| Override the target's default model | `target.model` |
| Configure repeat policy, budget, concurrency, timeout, threshold | top-level `repeat`, `evaluate_options.budget_usd`, `evaluate_options.max_concurrency`, `timeout_seconds`, `threshold` |
| Bind an existing local workspace directory | `--workspace-path` or `.agentv/config.local.yaml` |

```yaml
workspace:
hooks:
before_all:
command: ["bash", "-lc", "bun install && bun run build"]
extensions:
- file://scripts/build.mjs:beforeAll

target:
extends: codex-gpt5
Expand Down
10 changes: 6 additions & 4 deletions apps/web/src/content/docs/docs/evaluation/running-evals.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -326,14 +326,16 @@ agentv eval evals/my-eval.yaml --workspace-clean full
agentv eval evals/my-eval.yaml --retain-on-success cleanup --retain-on-failure keep
```

Portable eval YAML keeps workspace intent under templates, repos, hooks, env,
Docker, and folder isolation:
Portable eval YAML keeps workspace intent under templates, repos, env, Docker,
and folder isolation. Use top-level extensions for executable setup:

```yaml
extensions:
- file://scripts/setup.mjs:beforeAll

workspace:
isolation: shared # shared | per_case
hooks:
enabled: true # set false to skip all hooks
after_each:
reset: fast # none | fast | strict
```
Expand All @@ -343,7 +345,7 @@ Notes:
- Pooled mode is an explicit machine-local optimization.
- `--workspace-path` uses an existing machine-local directory as-is and implies static runtime mode.
- Runtime static mode is incompatible with `isolation: per_case`.
- `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset).
- `workspace.hooks.after_each.reset` resets file state after each case.
- Pool slots are managed separately (`agentv workspace list|clean`).

### Resume an Interrupted Run
Expand Down
41 changes: 21 additions & 20 deletions apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,25 @@ Use this split when deciding where a benchmark key belongs:
|------------|--------------|------------------|
| `workspace.repos[]` | Yes | Declares repo identity and checkout refs; AgentV resolves acquisition and materializes the checkout. |
| `workspace.template` | Yes | Copies a workspace template into the run workspace. |
| `workspace.hooks` | Yes | Runs lifecycle commands with workspace and case context on stdin. |
| `extensions` | Yes | Runs Promptfoo-style lifecycle setup after `workspace.template` and `workspace.repos` materialize. |
| `workspace.hooks.after_each.reset` | Yes | Controls workspace reset policy after each case. |
| `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. |
| `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, and default grader behavior. Concurrency is an operator/run setting from `--workers` or project config. |
| `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. |
| `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. |
| Top-level `name`, `version`, `tags`, `license`, `requires` | Informational | Identifies and categorizes the suite. |
| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and hook stdin; in-process custom assertions can also read it. |
| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and extension context; in-process custom assertions can also read it. |

`metadata` can still become operational inside your own hook scripts. For
example, a `before_each` hook can read `case_metadata.test_patch` and apply that
`metadata` can still become operational inside your own lifecycle extensions. For
example, a `beforeEach` extension can read `case_metadata.test_patch` and apply that
patch before the agent starts. The distinction is that AgentV itself only passes
the metadata along; the script owns the behavior.
the metadata along; the extension owns the behavior.

## Hook Payloads
## Extension Context

Lifecycle hooks receive JSON on stdin. Case-scoped hooks such as per-test
`before_all`, `before_each`, and `after_each` receive the current test's
metadata as `case_metadata`:
File lifecycle extensions export functions named `beforeAll`, `beforeEach`,
`afterEach`, or `afterAll`. AgentV calls each function with a context object
that includes the current test's metadata as `case_metadata`:

```json
{
Expand All @@ -59,9 +60,9 @@ metadata as `case_metadata`:
}
```

Suite-level `before_all` hooks run once for the workspace, before any one test is
selected, so they should do suite setup only. Use `before_each` when setup depends
on per-case metadata such as a patch path, source row, or selected test list.
`beforeAll` runs once for the shared workspace after repo materialization, so it
should do suite setup only. Use `beforeEach` when setup depends on per-case
metadata such as a patch path, source row, or selected test list.

## Task Artifact Anatomy

Expand All @@ -71,7 +72,7 @@ Benchmark task packs map cleanly onto AgentV fields at authoring time:
|---------------|----------------|
| Prompt or instruction | `input`, usually with `type: file` blocks for long prompts |
| Source checkout | `workspace.repos[].repo` and `workspace.repos[].commit` |
| Per-case setup | `workspace.hooks.before_each` reading `case_metadata` |
| Per-case setup | `extensions: ["file://scripts/setup.mjs:beforeEach"]` reading `case_metadata` |
| Gold answer | `expected_output` when the answer is passive reference data |
| Active verification | `assertions`, especially `code-grader` for commands or artifact checks |
| Provenance | `tests[].metadata` with source pins, generator rows, and curation labels |
Expand Down Expand Up @@ -104,12 +105,12 @@ workspace:
repo: https://github.com/example/widget.git
commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123
hooks:
before_each:
command: ["python", "./scripts/apply-test-patch.py"]
timeout_ms: 120000
after_each:
reset: strict

extensions:
- file://scripts/apply-test-patch.mjs:beforeEach

assertions:
- name: focused-tests
type: code-grader
Expand All @@ -133,7 +134,7 @@ tests:

In this example, `workspace.repos[].commit` is the actual checkout. The
matching `metadata.source_commit` is audit data that gets recorded with the case
and is available to scripts. `apply-test-patch.py` can read
and is available to extensions. `apply-test-patch.mjs` can read
`case_metadata.test_patch` and `case_metadata.fail_to_pass_tests`, then apply
the patch and write the selected test list into the workspace. The code grader
can read that workspace file through its `workspace_path` payload. Repo
Expand All @@ -158,9 +159,9 @@ workspace:
- path: ./repo
repo: https://github.com/example/widget.git
commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123
hooks:
before_each:
command: ["python", "./scripts/apply-case-fixtures.py"]

extensions:
- file://scripts/apply-case-fixtures.mjs:beforeEach

target: codex

Expand Down
Loading
Loading