diff --git a/apps/cli/src/commands/prepare/index.ts b/apps/cli/src/commands/prepare/index.ts index a615e823a..5a6ad9aeb 100644 --- a/apps/cli/src/commands/prepare/index.ts +++ b/apps/cli/src/commands/prepare/index.ts @@ -8,6 +8,7 @@ import path from 'node:path'; import { type EvalTargetRef, + type JsonObject, type PreparedEvalWorkspace, type PreparedWorkspaceRepoPin, type ResolvedTarget, @@ -51,6 +52,8 @@ interface PrepareResult { readonly manifestPath: string; readonly setupStatus: 'ok'; readonly setupSteps: readonly SetupStep[]; + readonly providerContext?: JsonObject; + readonly metadata?: Record; readonly repoPins: readonly RepoPin[]; readonly baseline: PreparedEvalWorkspace['baseline']; readonly createdAt: string; @@ -65,6 +68,8 @@ interface PrepareManifestWire { readonly prompt_path: string; readonly setup_status: 'ok'; readonly setup_steps: readonly SetupStepWire[]; + readonly provider_context?: JsonObject; + readonly metadata?: Record; readonly repo_pins: readonly RepoPinWire[]; readonly baseline: BaselineWire; readonly created_at: string; @@ -130,6 +135,37 @@ function toRepoPins(pins: readonly PreparedWorkspaceRepoPin[]): readonly RepoPin })); } +/** Recursively rewrites workspace paths inside a JSON-like value. A string is passed to path.relative(sourceWorkspacePath, value); when the result is empty (the string names the source workspace root itself) or it neither starts with two dots nor is absolute, the string is replaced by path.join(targetWorkspacePath, relativePath); any other string is returned unchanged. Arrays are mapped element by element. Any other truthy object is rebuilt from its Object.entries via Object.fromEntries, so the result is a plain object. Every remaining value (number, boolean, null, undefined) is returned as-is. NOTE(review): path.relative resolves non-absolute arguments against the current working directory, so a relative or non-path string may be remapped when the process cwd lies inside sourceWorkspacePath; consider guarding with path.isAbsolute(value) and confirm against callers. NOTE(review): the two-dot prefix check also rejects in-workspace entries whose first segment merely begins with two dots. NOTE(review): T is referenced but no type-parameter list is visible in this rendering of the patch; presumably the generic brackets were lost in extraction, verify against the real file. */ function remapWorkspacePaths( + value: T, + sourceWorkspacePath: string, + targetWorkspacePath: string, +): T { + if (typeof value === 'string') { + const relativePath = path.relative(sourceWorkspacePath, value); + if ( + relativePath === '' || + (!!relativePath && !relativePath.startsWith('..') && !path.isAbsolute(relativePath)) + ) { + return path.join(targetWorkspacePath, relativePath) as T; + } + return value; + } + if (Array.isArray(value)) { + return value.map((item) => + remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath), + ) as T; + } + if (value && typeof value === 'object') { + return Object.fromEntries( + Object.entries(value).map(([key, item]) => [ + key, + remapWorkspacePaths(item, sourceWorkspacePath, targetWorkspacePath), + ]), + ) as T; + } + return value; +} + async function 
moveDirectory(sourcePath: string, destinationPath: string): Promise { try { await rename(sourcePath, destinationPath); @@ -200,6 +236,8 @@ function toManifestWire(result: PrepareResult): PrepareManifestWire { status: step.status, ...(step.message !== undefined && { message: step.message }), })), + ...(result.providerContext !== undefined && { provider_context: result.providerContext }), + ...(result.metadata !== undefined && { metadata: result.metadata }), repo_pins: result.repoPins.map((pin) => ({ ...(pin.path !== undefined && { path: pin.path }), ...(pin.repo !== undefined && { repo: pin.repo }), @@ -322,6 +360,16 @@ async function prepareAttempt(options: { manifestPath, setupStatus: 'ok', setupSteps: setupStepsFromPrepared(prepared), + ...(prepared.providerContext !== undefined && { + providerContext: remapWorkspacePaths( + prepared.providerContext, + prepared.workspacePath, + workspacePath, + ), + }), + ...(prepared.metadata !== undefined && { + metadata: remapWorkspacePaths(prepared.metadata, prepared.workspacePath, workspacePath), + }), repoPins: toRepoPins(prepared.repoPins), baseline: prepared.baseline, createdAt: prepared.createdAt, diff --git a/apps/cli/test/commands/prepare/prepare.test.ts b/apps/cli/test/commands/prepare/prepare.test.ts index 4e8a9fe65..1bb265d33 100644 --- a/apps/cli/test/commands/prepare/prepare.test.ts +++ b/apps/cli/test/commands/prepare/prepare.test.ts @@ -238,4 +238,75 @@ describe('agentv prepare', () => { expect(typeof output.baseline.commit).toBe('string'); expect(Object.keys(output)).not.toContain('workspacePath'); }); + + /* Runs the CLI prepare command against an eval that declares the agentv:agent-rules extension with a rules file located outside the workspace template, then inspects the written agentv_prepare.json: the first provider_context.agent_rules_paths.rules entry must start with the output workspace path, must exist on disk, and must equal the first metadata.agent_rules_paths.rules entry. */ it('remaps prepared extension context paths into the output workspace', async () => { + const evalPath = path.join(tempDir, 'evals', 'suite.eval.yaml'); + const outDir = path.join(tempDir, 'prepared-extension-context'); + + await mkdir(path.join(tempDir, 'evals'), { recursive: true }); + await mkdir(path.join(tempDir, 'template'), { recursive: true }); + await mkdir(path.join(tempDir, 'rules'), { recursive: 
true }); + await mkdir(path.join(tempDir, 'scripts'), { recursive: true }); + await mkdir(path.join(tempDir, '.agentv'), { recursive: true }); + await writeFile(path.join(tempDir, 'template', 'app.txt'), 'initial\n', 'utf8'); + await writeFile(path.join(tempDir, 'rules', 'AGENTS.md'), '# Rules\n', 'utf8'); + await writeFile(path.join(tempDir, 'scripts', 'target.ts'), '', 'utf8'); + await writeFile( + path.join(tempDir, '.agentv', 'targets.yaml'), + ` +targets: + - name: codex + provider: cli + command: bun ./scripts/target.ts +`, + 'utf8', + ); + await writeFile( + evalPath, + ` +extensions: + - id: agentv:agent-rules + hook: beforeAll + rules: ../rules/AGENTS.md +workspace: + template: ../template +tests: + - id: case-1 + input: "Fix the workspace file." + criteria: "Works" +`, + 'utf8', + ); + + await execa( + 'bun', + [ + '--no-env-file', + CLI_ENTRY, + 'prepare', + evalPath, + '--test-id', + 'case-1', + '--target', + 'codex', + '--out', + outDir, + ], + { + cwd: tempDir, + env: { + AGENTV_HOME: path.join(tempDir, '.agentv-home'), + AGENTV_NO_UPDATE_CHECK: '1', + }, + }, + ); + + const workspacePath = path.join(outDir, 'workspace'); + const manifest = JSON.parse(await readFile(path.join(outDir, 'agentv_prepare.json'), 'utf8')); + const rulesPath = manifest.provider_context.agent_rules_paths.rules[0]; + + expect(rulesPath).toStartWith(workspacePath); + expect(await exists(rulesPath)).toBe(true); + expect(manifest.metadata.agent_rules_paths.rules[0]).toBe(rulesPath); + }); }); diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 63d6655ab..a49a13229 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -5,7 +5,7 @@ sidebar: order: 1 --- -Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. 
The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; Docker/container binding belongs under `workspace.docker`. Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL. +Evaluation files define the test cases, graders, workspace lifecycle, and run controls for an evaluation run. The reserved `tags.experiment` key is the run/result grouping label, top-level `target` identifies the system under test, and fields such as `repeat`, `threshold`, `timeout_seconds`, `evaluate_options.budget_usd`, and `evaluate_options.max_concurrency` control repeated attempts and gates. Workspace reuse belongs under `workspace.isolation`; repository provenance belongs under `workspace.repos`; Docker/container binding belongs under `workspace.docker`. Non-provisioning setup commands belong in top-level `extensions`; reset policy stays under `workspace.hooks.after_each.reset`; runner-specific setup belongs in the `target` object or `targets.yaml`. AgentV supports two eval data formats: YAML and JSONL. YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract. Eval files describe the task, target binding, and run controls. Use `evaluate_options.max_concurrency` for authored suite concurrency. Operators can still override concurrency with `--workers` or set defaults with `execution.workers` in `agentv.config.*` / `.agentv/config.yaml`; do not author legacy `workers` fields in eval YAML. 
@@ -122,20 +122,58 @@ tests: | `evaluate_options` | Optional evaluation runtime options such as `budget_usd` and `max_concurrency` | | `threshold` | Optional suite quality threshold | | `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). | +| `extensions` | Promptfoo-style lifecycle hooks: `file://path/to/hooks.mjs:beforeAll`, `beforeEach`, `afterEach`, `afterAll`, plus the built-in `agentv:agent-rules`. Hooks run after `workspace.repos` materializes. | | `imports` | Optional import groups. `imports.suites` imports full child eval suites with their task context. `imports.tests` imports raw test rows into this file's context. Import entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. | | `tests` | Inline raw tests or a string path to an external raw-case file or directory. Legacy `tests[].include` entries still load with a migration warning; prefer `imports.suites` or `imports.tests`. | | `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test | | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test | `workspace` is what the agent can inspect or modify through tools, not prompt -input. Put instructions in `input`; put repos, templates, and lifecycle setup in -`workspace`. +input. Put instructions in `input`; put repos, templates, Docker config, env +checks, isolation, and repo provenance in `workspace`. Put lifecycle setup that +does not acquire repos in `extensions`. For historical or repo-state evals, put the checkout under `workspace.repos[].commit` or `workspace.repos[].base_commit`. 
A commit SHA in the prompt or metadata is useful context, but it does not materialize a repo for the agent to inspect. +### Lifecycle Extensions + +`extensions` uses Promptfoo-compatible lifecycle names. File hooks are local +JavaScript or TypeScript modules resolved relative to the eval file: + +```yaml +extensions: + - file://scripts/setup.mjs:beforeAll + - file://scripts/setup.mjs:beforeEach + - file://scripts/setup.mjs:afterEach + - file://scripts/setup.mjs:afterAll +``` + +Each exported function receives a context object with snake_case keys such as +`workspace_path`, `test_id`, `eval_run_id`, `case_input`, and `case_metadata`. +Setup hook failures (`beforeAll`, `beforeEach`) fail the affected run; teardown +hook failures (`afterEach`, `afterAll`) are non-fatal. + +`agentv:agent-rules` is the only built-in extension in this slice. It runs after +workspace materialization and exposes staged rule paths to providers and result +metadata as `agent_rules_paths`: + +```yaml +extensions: + - id: agentv:agent-rules + hook: beforeAll + skills: agent-rules/skills + hooks: agent-rules/hooks + agents: agent-rules/agents + rules: agent-rules/AGENTS.md +``` + +If `agentv:agent-rules` is authored as a string, it defaults to `beforeAll` and +discovers conventional rule locations already present in the materialized +workspace. It does not clone repositories or replace `workspace.repos`. + ### Metadata Fields You can add structured metadata to your eval file using these optional top-level fields. Metadata is parsed when the `name` field is present: diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx index aab8f78f6..a02e1a331 100644 --- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx +++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx @@ -189,7 +189,7 @@ Scoped `run:` supports `threshold`, `repeat`, `timeout_seconds`, and legacy per-case `budget_usd` overrides. 
Parent suite budgets should use `evaluate_options.budget_usd` for public eval authoring. Use `evaluate_options.max_concurrency` for authored concurrency. Candidate-changing fields stay -parent-level. Workspace mutation belongs in `workspace.hooks`, and +parent-level. Executable workspace setup belongs in top-level lifecycle extensions, and provider-specific setup belongs in target configuration. ## Lifecycle Ownership @@ -199,8 +199,9 @@ target-specific runner state. | Need | Put it in | | --- | --- | -| Install dependencies, build the repo, seed files | `workspace.hooks.before_all` | -| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` | +| Install dependencies, build the repo, seed files | `extensions: ["file://scripts/setup.mjs:beforeAll"]` | +| Apply per-case state | `extensions: ["file://scripts/setup.mjs:beforeEach"]` | +| Reset file state after each case | `workspace.hooks.after_each.reset` | | Configure an agent runner or provider variant | `target` object or `targets.yaml` | | Choose the target | top-level `target` | | Override the target's default model | `target.model` | @@ -208,10 +209,8 @@ target-specific runner state. 
| Bind an existing local workspace directory | `--workspace-path` or `.agentv/config.local.yaml` | ```yaml -workspace: - hooks: - before_all: - command: ["bash", "-lc", "bun install && bun run build"] +extensions: + - file://scripts/build.mjs:beforeAll target: extends: codex-gpt5 diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index dc881d48b..ae8711bc4 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -326,14 +326,16 @@ agentv eval evals/my-eval.yaml --workspace-clean full agentv eval evals/my-eval.yaml --retain-on-success cleanup --retain-on-failure keep ``` -Portable eval YAML keeps workspace intent under templates, repos, hooks, env, -Docker, and folder isolation: +Portable eval YAML keeps workspace intent under templates, repos, env, Docker, +and folder isolation. Use top-level extensions for executable setup: ```yaml +extensions: + - file://scripts/setup.mjs:beforeAll + workspace: isolation: shared # shared | per_case hooks: - enabled: true # set false to skip all hooks after_each: reset: fast # none | fast | strict ``` @@ -343,7 +345,7 @@ Notes: - Pooled mode is an explicit machine-local optimization. - `--workspace-path` uses an existing machine-local directory as-is and implies static runtime mode. - Runtime static mode is incompatible with `isolation: per_case`. -- `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset). +- `workspace.hooks.after_each.reset` resets file state after each case. - Pool slots are managed separately (`agentv workspace list|clean`). 
### Resume an Interrupted Run diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx index ae760593a..87241b4dd 100644 --- a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx +++ b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx @@ -27,24 +27,25 @@ Use this split when deciding where a benchmark key belongs: |------------|--------------|------------------| | `workspace.repos[]` | Yes | Declares repo identity and checkout refs; AgentV resolves acquisition and materializes the checkout. | | `workspace.template` | Yes | Copies a workspace template into the run workspace. | -| `workspace.hooks` | Yes | Runs lifecycle commands with workspace and case context on stdin. | +| `extensions` | Yes | Runs Promptfoo-style lifecycle setup after `workspace.template` and `workspace.repos` materialize. | +| `workspace.hooks.after_each.reset` | Yes | Controls workspace reset policy after each case. | | `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. | | `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, and default grader behavior. Concurrency is an operator/run setting from `--workers` or project config. | | `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. | | `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. | | Top-level `name`, `version`, `tags`, `license`, `requires` | Informational | Identifies and categorizes the suite. | -| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and hook stdin; in-process custom assertions can also read it. | +| `tests[].metadata` | Informational to AgentV | Passes arbitrary case data through to results and extension context; in-process custom assertions can also read it. 
| -`metadata` can still become operational inside your own hook scripts. For -example, a `before_each` hook can read `case_metadata.test_patch` and apply that +`metadata` can still become operational inside your own lifecycle extensions. For +example, a `beforeEach` extension can read `case_metadata.test_patch` and apply that patch before the agent starts. The distinction is that AgentV itself only passes -the metadata along; the script owns the behavior. +the metadata along; the extension owns the behavior. -## Hook Payloads +## Extension Context -Lifecycle hooks receive JSON on stdin. Case-scoped hooks such as per-test -`before_all`, `before_each`, and `after_each` receive the current test's -metadata as `case_metadata`: +File lifecycle extensions export functions named `beforeAll`, `beforeEach`, +`afterEach`, or `afterAll`. AgentV calls each function with context including +the current test's metadata as `case_metadata`: ```json { @@ -59,9 +60,9 @@ metadata as `case_metadata`: } ``` -Suite-level `before_all` hooks run once for the workspace, before any one test is -selected, so they should do suite setup only. Use `before_each` when setup depends -on per-case metadata such as a patch path, source row, or selected test list. +`beforeAll` runs once for the shared workspace after repo materialization, so it +should do suite setup only. Use `beforeEach` when setup depends on per-case +metadata such as a patch path, source row, or selected test list. 
## Task Artifact Anatomy @@ -71,7 +72,7 @@ Benchmark task packs map cleanly onto AgentV fields at authoring time: |---------------|----------------| | Prompt or instruction | `input`, usually with `type: file` blocks for long prompts | | Source checkout | `workspace.repos[].repo` and `workspace.repos[].commit` | -| Per-case setup | `workspace.hooks.before_each` reading `case_metadata` | +| Per-case setup | `extensions: ["file://scripts/setup.mjs:beforeEach"]` reading `case_metadata` | | Gold answer | `expected_output` when the answer is passive reference data | | Active verification | `assertions`, especially `code-grader` for commands or artifact checks | | Provenance | `tests[].metadata` with source pins, generator rows, and curation labels | @@ -104,12 +105,12 @@ workspace: repo: https://github.com/example/widget.git commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123 hooks: - before_each: - command: ["python", "./scripts/apply-test-patch.py"] - timeout_ms: 120000 after_each: reset: strict +extensions: + - file://scripts/apply-test-patch.mjs:beforeEach + assertions: - name: focused-tests type: code-grader @@ -133,7 +134,7 @@ tests: In this example, `workspace.repos[].commit` is the actual checkout. The matching `metadata.source_commit` is audit data that gets recorded with the case -and is available to scripts. `apply-test-patch.py` can read +and is available to extensions. `apply-test-patch.mjs` can read `case_metadata.test_patch` and `case_metadata.fail_to_pass_tests`, then apply the patch and write the selected test list into the workspace. The code grader can read that workspace file through its `workspace_path` payload. 
Repo @@ -158,9 +159,9 @@ workspace: - path: ./repo repo: https://github.com/example/widget.git commit: 4f3e2d19b6e4e8f1c2b7d9a0e5a6b7c8d9e0f123 - hooks: - before_each: - command: ["python", "./scripts/apply-case-fixtures.py"] + +extensions: + - file://scripts/apply-case-fixtures.mjs:beforeEach target: codex diff --git a/apps/web/src/content/docs/docs/guides/eval-authoring.mdx b/apps/web/src/content/docs/docs/guides/eval-authoring.mdx index 6dda5efbf..6d5c4e39b 100644 --- a/apps/web/src/content/docs/docs/guides/eval-authoring.mdx +++ b/apps/web/src/content/docs/docs/guides/eval-authoring.mdx @@ -5,60 +5,58 @@ sidebar: order: 3 --- -## Workspace Setup: Skill Discovery Paths - -The `before_all` setup hook must copy skills to **all** provider discovery paths. Each provider searches a different directory: - -| Provider | Discovery path | -|----------|---------------| -| claude-cli | `.claude/skills/` | -| allagents | `.agents/skills/` | -| pi-cli | `.pi/skills/` | - -If your setup hook only copies to one path, `skill-trigger` assertions will fail for other providers. 
- -### Example setup.mjs - -```javascript -import { cp, mkdir } from 'node:fs/promises'; -import path from 'node:path'; - -// Read AgentV payload from stdin -const payload = JSON.parse(await new Promise((resolve) => { - let data = ''; - process.stdin.on('data', (chunk) => (data += chunk)); - process.stdin.on('end', () => resolve(data)); -})); - -const workspacePath = payload.workspace_path; -const skillSource = path.resolve('skills'); - -// Copy skills to all provider discovery paths -const discoveryPaths = [ - '.claude/skills', - '.agents/skills', - '.pi/skills', -]; - -for (const rel of discoveryPaths) { - const dest = path.join(workspacePath, rel); - await mkdir(path.dirname(dest), { recursive: true }); - await cp(skillSource, dest, { recursive: true }); -} -``` +## Agent Rules and Skill Paths -### In your eval YAML +Use the built-in `agentv:agent-rules` extension when an eval needs to stage or +expose agent-facing rules, skills, hooks, or subagents. It runs after +`workspace.template` and `workspace.repos` materialize, then writes +`agent_rules_paths` into provider context and result metadata. ```yaml +extensions: + - id: agentv:agent-rules + hook: beforeAll + skills: agent-rules/skills + hooks: agent-rules/hooks + agents: agent-rules/agents + rules: agent-rules/AGENTS.md + workspace: template: ./workspace-template - hooks: - before_all: - command: - - node - - ../scripts/setup.mjs + repos: + - path: ./app + repo: acme/app + commit: main ``` +Configured paths are resolved relative to the eval file and staged under the +materialized workspace. If you write the shorthand form, AgentV discovers +conventional rule locations already present in the workspace: + +```yaml +extensions: + - agentv:agent-rules +``` + +Do not move repo acquisition into `agentv:agent-rules`. Repositories remain +first-class workspace provenance through `workspace.repos`. 
+ +## Custom Lifecycle Setup + +Use file extensions for setup that is not repo provisioning: + +```yaml +extensions: + - file://scripts/setup.mjs:beforeAll + - file://scripts/setup.mjs:beforeEach + - file://scripts/setup.mjs:afterEach + - file://scripts/setup.mjs:afterAll +``` + +Each file hook exports a function with the matching name. The function receives +context such as `workspace_path`, `test_id`, `eval_run_id`, `case_input`, and +`case_metadata`. + ## Workspace Limitations: No GitHub Remote Workspace-based evals are sandboxed — there is no GitHub remote, no PRs, and no issue tracker. Tests that ask agents to interact with GitHub will fail. diff --git a/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx b/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx index 93a6e281a..1cc0141b6 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-architecture.mdx @@ -41,14 +41,14 @@ eval start | v +---------------------------+ -| 4. before_all hooks | workspace hook, then target hook +| 4. beforeAll lifecycle | extensions, then target hook +---------------------------+ | v +---------------------------+ | 5. Test loop | For each test case: -| before_each -> run -> | workspace hook, target hook, agent, -| after_each | target hook, workspace hook +| beforeEach -> run -> | extension, target hook, agent, +| afterEach | target hook, extension, reset +---------------------------+ | v diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 685a1f801..f907aeb81 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -31,9 +31,9 @@ On subsequent runs: 1. AgentV computes the fingerprint from your repo configs 2. If a matching pool entry exists, it acquires a slot and resets it (`git reset --hard` + `git clean -fd`) 3. 
Template files are re-copied (repo directories are preserved) -4. Lifecycle hooks (`before_all`, etc.) run as normal +4. Lifecycle extensions (`beforeAll`, etc.) run as normal -**Keep templates small.** Template files are re-copied into every slot on every run. Use them for lightweight setup — agent skills, configuration files, prompt templates — not large assets. Heavy dependencies belong in repos (pooled and reused) or should be installed by `before_all` hooks (cached across reuse cycles with `fast` reset). +**Keep templates small.** Template files are re-copied into every slot on every run. Use them for lightweight setup — agent skills, configuration files, prompt templates — not large assets. Heavy dependencies belong in repos (pooled and reused) or should be installed by `beforeAll` extensions (cached across reuse cycles with `fast` reset). The first pooled run materializes from scratch. Subsequent pooled runs reuse the pool — skipping clone and checkout entirely. @@ -57,7 +57,7 @@ execution: ## Pool reset mode -By default, pool reset uses `git clean -fd` which **preserves `.gitignore`d files** like `node_modules/`, `build/`, and compiled binaries. This means `before_all` build steps survive across reuse cycles. +By default, pool reset uses `git clean -fd` which **preserves `.gitignore`d files** like `node_modules/`, `build/`, and compiled binaries. This means `beforeAll` build steps survive across reuse cycles. 
For strict reset that also removes `.gitignore`d files, use the `--workspace-clean full` CLI flag: diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index 50de9994d..b3e7830e0 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -85,56 +85,49 @@ targets: grader_target: azure-base # LLM used for grading ``` -### Workspace Lifecycle Hooks +### Lifecycle Extensions -Run commands and reset/cleanup policies at different lifecycle points using `workspace.hooks`. This can be defined at the suite level (applies to all tests) or per test (overrides suite-level). -Use workspace hooks for repo preparation such as dependency installs, builds, -fixture generation, and per-case resets. Use target hooks for runner-specific -setup. +Run non-provisioning setup at Promptfoo-compatible lifecycle points using +top-level `extensions`. The harness materializes `workspace.template` and +`workspace.repos` first, then runs `beforeAll` extensions. Use extensions for +dependency installs, builds, fixture generation, and agent-rule staging. Use +target hooks for runner-specific setup. Keep repo identity and checkout pins in +`workspace.repos`; extensions must not become the default repo acquisition path. 
```yaml +extensions: + - file://scripts/workspace.mjs:beforeAll + - file://scripts/workspace.mjs:beforeEach + - file://scripts/workspace.mjs:afterEach + - file://scripts/workspace.mjs:afterAll + - id: agentv:agent-rules + hook: beforeAll + skills: agent-rules/skills + rules: agent-rules/AGENTS.md + workspace: template: ./workspace-templates/my-project hooks: - before_all: - command: ["bun", "run", "setup.ts"] - timeout_ms: 120000 - cwd: ./scripts after_each: - command: ["bun", "run", "reset.ts"] - timeout_ms: 5000 reset: fast - after_all: - command: ["bun", "run", "cleanup.ts"] - timeout_ms: 30000 ``` | Field | Description | |-------|-------------| | `template` | Directory to copy as workspace | -| `hooks.before_all` | Runs once after workspace creation, before the first test | -| `hooks.after_all` | Runs once after the last test, before cleanup | -| `hooks.before_each` | Runs before each test | -| `hooks.after_each` | Runs after each test (supports both `command` and `reset`) | - -Each hook config accepts: - -| Field | Description | -|-------|-------------| -| `command` | Command array (e.g., `["bun", "run", "setup.ts"]`) | -| `reset` | Reset mode: `none`, `fast`, `strict` | -| `timeout_ms` | Timeout in milliseconds (default: 60000 for setup hooks, 30000 for teardown hooks) | -| `cwd` | Working directory (relative paths resolved against eval file directory) | +| `extensions[]` | `file://...:beforeAll`, `beforeEach`, `afterEach`, `afterAll`, or `agentv:agent-rules` | +| `hooks.after_each.reset` | Reset mode: `none`, `fast`, `strict` | -**Lifecycle order:** template copy → repo materialization → workspace `hooks.before_all` → target `hooks.before_all` → git baseline → (`hooks.before_each` → target `hooks.before_each` → agent runs → file changes captured → target `hooks.after_each` → `hooks.after_each`) × N tests → target `hooks.after_all` → `hooks.after_all` → cleanup +**Lifecycle order:** template copy → repo materialization → `extensions.beforeAll` → target 
`hooks.before_all` → git baseline → (`extensions.beforeEach` → target `hooks.before_each` → agent runs → file changes captured → target `hooks.after_each` → `extensions.afterEach` → `workspace.hooks.after_each.reset`) × N tests → target `hooks.after_all` → `extensions.afterAll` → cleanup **Shared workspace:** The workspace is created once and shared across all tests in a suite. Use `hooks.after_each.reset` to reset state between tests (e.g., `fast`/`strict`). **Error handling:** -- `hooks.before_all` / `hooks.before_each` command failure aborts the test with an error result -- `hooks.after_all` / `hooks.after_each` command failure is non-fatal (warning only) +- `beforeAll` / `beforeEach` extension failure aborts the affected run with an error result +- `afterAll` / `afterEach` extension failure is non-fatal -**Script context:** All scripts receive a JSON object on stdin with case context: +**File hook context:** Exported functions receive a JSON-compatible object with +case context: ```json { @@ -146,7 +139,9 @@ Each hook config accepts: } ``` -**Suite vs per-test:** When both are defined, test-level fields replace suite-level fields. See [Per-Test Workspace Config](/docs/evaluation/eval-cases/#per-case-workspace-config) for examples. +`workspace.hooks` remains the reset-policy home for `after_each.reset`. Legacy +command hooks still parse for existing local suites, but new portable evals +should use `extensions` for executable setup. ### Repository Lifecycle @@ -237,7 +232,7 @@ Use `cwd` on a target to run in an existing directory (shared across tests). If Eval files can define per-target hooks that run setup/teardown scripts to customize the workspace for each target variant. This enables comparing different harness configurations (e.g., baseline vs with-plugins) in a single eval file. -Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. 
Use hooks for per-target setup such as copying skills, enabling wrappers, or changing provider-local config. Keep installs, builds, fixture generation, and case resets in `workspace.hooks`. +Targets do not declare `repos`. Repositories belong to the shared eval workspace so every target runs in the same world; target hooks customize the harness under evaluation. Use hooks for per-target setup such as enabling wrappers or changing provider-local config. Keep installs, builds, fixture generation, and case setup in top-level lifecycle `extensions`. Target hooks can be scoped to an eval-local target object: @@ -253,7 +248,7 @@ target: Target hooks run after workspace hooks on setup, before workspace hooks on teardown: -1. Workspace `before_all` +1. Extension `beforeAll` 2. **Target `before_all`** 3. For each test: - Workspace `before_each` diff --git a/apps/web/src/content/docs/docs/tools/prepare.mdx b/apps/web/src/content/docs/docs/tools/prepare.mdx index c93470fd1..ac07cddcc 100644 --- a/apps/web/src/content/docs/docs/tools/prepare.mdx +++ b/apps/web/src/content/docs/docs/tools/prepare.mdx @@ -17,12 +17,12 @@ The prepared directory contains: ```text /tmp/agentv-case-1/ - workspace/ # materialized template/repos/hooks state + workspace/ # materialized template/repos/extensions state prompt.md # safe task prompt for the human or external agent agentv_prepare.json # snake_case manifest for audit and later grading ``` -`prepare` runs setup only: workspace `before_all`, target `before_all`, workspace `before_each`, and target `before_each`. It does not launch the agent, run graders, mark an eval complete, or expose hidden expected outputs and grader internals in `prompt.md`. +`prepare` runs setup only: workspace materialization, extension `beforeAll`, target `before_all`, extension `beforeEach`, and target `before_each`. It does not launch the agent, run graders, mark an eval complete, or expose hidden expected outputs and grader internals in `prompt.md`. 
## Grade the Attempt diff --git a/examples/README.md b/examples/README.md index 93f662ee9..f64080578 100644 --- a/examples/README.md +++ b/examples/README.md @@ -48,7 +48,7 @@ Focused demonstrations of specific AgentV capabilities. Each example includes it - [compare](features/compare/) - Baseline comparison - [deterministic-graders](features/deterministic-graders/) - Deterministic assertions (contains, regex, JSON validation) - [vitest-workspace-grader](features/vitest-workspace-grader/) - Vitest-style deterministic workspace verifiers -- [workspace-setup-script](features/workspace-setup-script/) - Multi-step workspace setup with `before_all` lifecycle hook +- [workspace-setup-script](features/workspace-setup-script/) - Multi-step workspace setup with a `beforeAll` lifecycle extension ### SDK diff --git a/examples/features/README.md b/examples/features/README.md index 40153f696..632f10b4d 100644 --- a/examples/features/README.md +++ b/examples/features/README.md @@ -98,7 +98,7 @@ Focused examples for specific AgentV capabilities. 
Find your use case below, the ### Workspace and agent setup | Example | Description | |---------|-------------| -| [workspace-setup-script](workspace-setup-script/) | Multi-step setup with the `before_all` lifecycle hook | +| [workspace-setup-script](workspace-setup-script/) | Multi-step setup with a `beforeAll` lifecycle extension | | [workspace-multi-repo](workspace-multi-repo/) | Multi-repo workspace using a VS Code `.code-workspace` file | | [workspace-shared-config](workspace-shared-config/) | Define a `workspace.yaml` once and reference it across eval files | | [repo-lifecycle](repo-lifecycle/) | Clone a git repo into the workspace and target the agent at it | diff --git a/examples/features/copilot-log-eval/README.md b/examples/features/copilot-log-eval/README.md index 59aadfdb7..d0ac4b635 100644 --- a/examples/features/copilot-log-eval/README.md +++ b/examples/features/copilot-log-eval/README.md @@ -38,7 +38,7 @@ the latest session from `~/.copilot/session-state/` and runs all graders. 
## How it works ``` -allagents workspace init (before_all hook) +allagents workspace init (setup hook) ↓ syncs agentv-dev plugin skills from marketplace ~/.copilot/session-state/{uuid}/events.jsonl ↓ copilot-log provider (reads from disk) diff --git a/examples/features/file-changes-with-repos/evals/eval.yaml b/examples/features/file-changes-with-repos/evals/eval.yaml index ebbdc1bde..b029608a8 100644 --- a/examples/features/file-changes-with-repos/evals/eval.yaml +++ b/examples/features/file-changes-with-repos/evals/eval.yaml @@ -6,8 +6,8 @@ # # Setup: # - workspace.template copies workspace-template/ into the temp workspace -# - before_all hook initialises my-lib/ as a git repo inside the workspace -# - initializeBaseline (runs after before_all) sees my-lib/.git as a gitlink +# - setup hook initialises my-lib/ as a git repo inside the workspace +# - initializeBaseline sees my-lib/.git as a gitlink after setup # # Agent behaviour: # - Writes report.txt to workspace root (not inside any repo) diff --git a/examples/features/tool-calls-template/evals/eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml index 18976db70..a8f9a96d3 100644 --- a/examples/features/tool-calls-template/evals/eval.yaml +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -4,7 +4,7 @@ # whether an agent invoked the right skills — without needing the # skill-trigger evaluator. # -# Skills live in workspace/.agents/skills/. The before_all hook copies +# Skills live in workspace/.agents/skills/. The setup hook copies # them to .claude/skills/ so copilot and other providers can discover them. 
# # Run: diff --git a/examples/features/workspace-setup-script/README.md b/examples/features/workspace-setup-script/README.md index d2fdccfd4..d557ebafd 100644 --- a/examples/features/workspace-setup-script/README.md +++ b/examples/features/workspace-setup-script/README.md @@ -1,78 +1,54 @@ -# Workspace Setup Script +# Workspace Setup Extension -Demonstrates using a `before_all` lifecycle hook to clean and re-initialize an allagents workspace before evaluation runs, then register a project-scoped marketplace and sync plugin content (including prompt files). +Demonstrates using a `beforeAll` lifecycle extension to clean and re-initialize an allagents workspace before evaluation runs, then register a project-scoped marketplace and sync plugin content. ## Problem -`allagents workspace init` fails if `.allagents/workspace.yaml` already exists. In CI and repeated eval runs, stale artifacts need to be cleaned first. Without a wrapper, you'd need shell operators like `&&` (not cross-platform) or framework-level multi-command support. +`allagents workspace init` fails if `.allagents/workspace.yaml` already exists. In CI and repeated eval runs, stale artifacts need to be cleaned before project-scoped plugin content is synced. ## Solution -A generic Node.js script that any eval can reuse. It reads `workspace_path` from AgentV's stdin JSON, removes stale `.allagents/` state, runs `allagents workspace init --from`, registers a project-scoped marketplace, then runs `allagents workspace sync`. +A Node.js lifecycle extension exports `beforeAll(context)`. AgentV runs it after `workspace.template` and `workspace.repos` materialize, so the extension can safely prepare local configuration without owning repo provisioning. 
``` workspace-setup-script/ ├── evals/ -│ └── dataset.eval.yaml # Eval with before_all hook +│ └── dataset.eval.yaml # Eval with beforeAll extension ├── plugins/ │ └── my-plugin/ # Plugin content (AGENTS + prompt) -│ ├── AGENTS.md # Agent guidelines +│ ├── AGENTS.md │ └── .github/ │ └── prompts/ │ └── summarize-repo.prompt.md ├── marketplace/ │ └── .claude-plugin/ -│ └── marketplace.json # Local marketplace manifest +│ └── marketplace.json ├── scripts/ -│ └── workspace-setup.mjs # Generic setup script (reusable across evals) +│ └── workspace-setup.mjs # Lifecycle extension module └── workspace-template/ └── .allagents/ - └── workspace.yaml # Template for allagents init + └── workspace.yaml ``` -## Plugin Installation via Project Marketplace - -The `.allagents/workspace.yaml` installs a plugin from a named marketplace: - -```yaml -# .allagents/workspace.yaml -plugins: - - my-plugin@workspace-setup-script-marketplace -``` - -The setup script registers that marketplace using project scope: - -```bash -npx --yes allagents plugin marketplace add ../marketplace --scope project -``` - -This matches the project-scoped marketplace flow introduced in `allagents` (PR #224). - ## Eval YAML -The template path and local marketplace path are passed as arguments. 
Use `--require` to validate expected artifacts after sync: +Use top-level `extensions` for executable setup and keep repos under `workspace.repos`: ```yaml +extensions: + - file://../scripts/workspace-setup.mjs:beforeAll + workspace: - template: ./workspace-template - hooks: - before_all: - command: - - node - - ../scripts/workspace-setup.mjs - - --from - - ../workspace-template/.allagents/workspace.yaml - - --marketplace-source - - ../marketplace - - --require - - AGENTS.md - - --require - - .github/prompts/summarize-repo.prompt.md + template: ../workspace-template + repos: + - path: ./my-repo + repo: https://github.com/EntityProcess/agentv.git + commit: main ``` -The `--require` flag accepts one or more file paths (relative to the workspace root). If any required file is missing after `allagents workspace init`, the script exits with an error listing the missing files. +The extension reads `context.workspace_path` and `context.eval_dir`, refreshes `.allagents/`, runs `allagents workspace init`, registers the local marketplace with `--scope project`, syncs plugins, and validates that expected artifacts exist. -## Referencing plugin files in test inputs +## Referencing Plugin Files In Test Inputs Reference plugin files via `type: file` in test inputs to inject them into the agent's prompt: @@ -90,22 +66,18 @@ tests: The `type: file` path is resolved from the eval file's directory up to the repo root. This injects the file contents into the agent's prompt alongside any text instructions. -## How it works - -1. AgentV copies `workspace-template/` to a pooled workspace -2. The setup script removes stale `.allagents/` config and runs `npx allagents workspace init` -3. The setup script registers the local marketplace with `--scope project` -4. `allagents workspace sync` installs `my-plugin@workspace-setup-script-marketplace` -5. `--require` checks verify `AGENTS.md` and `.github/prompts/summarize-repo.prompt.md` exist -6. 
AgentV clones repos and runs tests against the initialized workspace +## How It Works -## Cross-platform +1. AgentV copies `workspace-template/` to a pooled workspace. +2. AgentV clones `workspace.repos`. +3. The `beforeAll` extension removes stale `.allagents/` config and runs `npx allagents workspace init`. +4. The extension registers the local marketplace with `--scope project`. +5. `allagents workspace sync` installs `my-plugin@workspace-setup-script-marketplace`. +6. Required-file checks verify `AGENTS.md` and `.github/prompts/summarize-repo.prompt.md` exist. -The script handles Windows by using `npx.cmd` instead of `npx`. +## Cross-Platform Notes -Because the script first reads AgentV payload from stdin, it then launches `npx` with: +The extension handles Windows by using `npx.cmd` instead of `npx` and launches subprocesses with: - `stdio: ['ignore', 'inherit', 'inherit']` - `shell: process.platform === 'win32'` - -This avoids a Windows-specific `spawnSync npx.cmd EINVAL` failure seen when stdin is inherited after being consumed in `before_all` hooks. diff --git a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml index 8e0a2ef6e..ee2e1a43b 100644 --- a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml @@ -1,22 +1,13 @@ description: >- - Demonstrates using a before_all workspace setup script with the VSCode target. + Demonstrates using a beforeAll lifecycle extension with the VSCode target. Same as dataset.eval.yaml but uses vscode instead of copilot. 
+extensions: + - file://../scripts/workspace-setup.mjs:beforeAll + workspace: template: ../workspace-template hooks: - before_all: - command: - - node - - ../scripts/workspace-setup.mjs - - --from - - ../workspace-template/.allagents/workspace.yaml - - --marketplace-source - - ../marketplace - - --require - - AGENTS.md - - --require - - .github/prompts/summarize-repo.prompt.md after_each: reset: fast repos: diff --git a/examples/features/workspace-setup-script/evals/dataset.eval.yaml b/examples/features/workspace-setup-script/evals/dataset.eval.yaml index 27fdb1bb5..ac6f62ada 100644 --- a/examples/features/workspace-setup-script/evals/dataset.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset.eval.yaml @@ -1,22 +1,12 @@ description: >- - Demonstrates using a before_all workspace setup script to clean and + Demonstrates using a beforeAll lifecycle extension to clean and re-initialize an allagents workspace before evaluation runs. +extensions: + - file://../scripts/workspace-setup.mjs:beforeAll + workspace: template: ../workspace-template - hooks: - before_all: - command: - - node - - ../scripts/workspace-setup.mjs - - --from - - ../workspace-template/.allagents/workspace.yaml - - --marketplace-source - - ../marketplace - - --require - - AGENTS.md - - --require - - .github/prompts/summarize-repo.prompt.md repos: - path: ./my-repo repo: https://github.com/EntityProcess/agentv.git diff --git a/examples/features/workspace-setup-script/scripts/workspace-setup.mjs b/examples/features/workspace-setup-script/scripts/workspace-setup.mjs index 7c45ee286..0e351e908 100644 --- a/examples/features/workspace-setup-script/scripts/workspace-setup.mjs +++ b/examples/features/workspace-setup-script/scripts/workspace-setup.mjs @@ -1,149 +1,158 @@ -#!/usr/bin/env node // @ts-check // -// Generic workspace setup script for AgentV before_all lifecycle hook. +// AgentV beforeAll lifecycle extension for this example. 
// -// Reads workspace_path from AgentV stdin JSON, removes stale .allagents/ -// config, copies source directories, and runs `npx allagents workspace init`. -// -// Usage in eval YAML: -// workspace: -// hooks: -// before_all: -// command: -// - node -// - ../scripts/workspace-setup.mjs -// - --from -// - ../workspace-template/.allagents/workspace.yaml -// - --source -// - ../guidelines -// - --require -// - AGENTS.md +// It runs after workspace.template and workspace.repos materialize, then +// refreshes allagents project state inside the prepared workspace. import { spawnSync } from 'node:child_process'; -import { cpSync, existsSync, readFileSync, rmSync } from 'node:fs'; -import { basename, isAbsolute, join, resolve } from 'node:path'; - -// --- parse arguments --- -const fromIndex = process.argv.indexOf('--from'); -if (fromIndex === -1 || !process.argv[fromIndex + 1]) { - console.error( - 'Usage: workspace-setup.mjs --from [--source ...] [--marketplace-source ] [--marketplace-name ] [--require ...]', - ); - process.exit(1); -} -const templatePath = process.argv[fromIndex + 1]; - -// Collect --source arguments: directories to copy into the workspace before init -const sourceDirs = []; -for (let i = 0; i < process.argv.length; i++) { - if (process.argv[i] === '--source' && process.argv[i + 1]) { - sourceDirs.push(process.argv[i + 1]); - i++; - } -} +import { existsSync, readFileSync, rmSync } from 'node:fs'; +import { join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const REQUIRED_FILES = ['AGENTS.md', '.github/prompts/summarize-repo.prompt.md']; -// Collect --require arguments: files that must exist in the workspace after init -const requiredFiles = []; -for (let i = 0; i < process.argv.length; i++) { - if (process.argv[i] === '--require' && process.argv[i + 1]) { - requiredFiles.push(process.argv[i + 1]); - i++; +/** + * @param {{ + * workspace_path?: string; + * eval_dir: string; + * }} context + */ +export function 
beforeAll(context) { + const workspacePath = context.workspace_path; + if (!workspacePath) { + throw new Error('workspace_path not provided to workspace setup extension'); } -} -// Optional project-scoped marketplace source to register after init. -const marketplaceSourceIndex = process.argv.indexOf('--marketplace-source'); -const marketplaceSource = - marketplaceSourceIndex !== -1 ? process.argv[marketplaceSourceIndex + 1] : undefined; -const marketplaceNameIndex = process.argv.indexOf('--marketplace-name'); -const marketplaceName = - marketplaceNameIndex !== -1 ? process.argv[marketplaceNameIndex + 1] : undefined; - -// --- stdin context from AgentV --- -const { workspace_path } = JSON.parse(readFileSync(0, 'utf8')); -if (!workspace_path) { - console.error('workspace_path not provided on stdin'); - process.exit(1); -} + const templatePath = resolve(context.eval_dir, '../workspace-template/.allagents/workspace.yaml'); + const marketplaceSource = resolve(context.eval_dir, '../marketplace'); -// --- copy source directories into workspace --- -for (const src of sourceDirs) { - if (!existsSync(src)) { - console.error(`Source directory not found: ${src}`); - process.exit(1); - } - const dest = join(workspace_path, basename(src)); - cpSync(src, dest, { recursive: true }); -} + runAllagentsSetup({ + workspacePath, + templatePath, + marketplaceSource, + requiredFiles: REQUIRED_FILES, + }); -// --- clean previous workspace config --- -rmSync(join(workspace_path, '.allagents'), { recursive: true, force: true }); - -// --- run allagents workspace init --- -const npx = process.platform === 'win32' ? 'npx.cmd' : 'npx'; -const result = spawnSync( - npx, - ['--yes', 'allagents', 'workspace', 'init', workspace_path, '--from', templatePath], - { - // This script reads AgentV stdin first, so don't pass fd 0 through. - // On Windows, inheriting stdin into `npx.cmd` can raise EINVAL. - // shell=true ensures `.cmd` is launched reliably. 
- stdio: ['ignore', 'inherit', 'inherit'], - shell: process.platform === 'win32', - }, -); -if (result.status !== 0) { - process.exit(result.status ?? 1); + return { + metadata: { + workspace_setup: { + marketplace_source: marketplaceSource, + required_files: REQUIRED_FILES, + }, + }, + }; } -// --- optionally register project-scoped marketplace and resync --- -if (marketplaceSource) { - const resolvedMarketplaceSource = isAbsolute(marketplaceSource) - ? marketplaceSource - : resolve(process.cwd(), marketplaceSource); +/** + * @param {{ + * workspacePath: string; + * templatePath: string; + * marketplaceSource?: string; + * marketplaceName?: string; + * requiredFiles: readonly string[]; + * }} options + */ +function runAllagentsSetup(options) { + rmSync(join(options.workspacePath, '.allagents'), { recursive: true, force: true }); - const addMarketplaceArgs = [ + const npx = process.platform === 'win32' ? 'npx.cmd' : 'npx'; + run(npx, [ '--yes', 'allagents', - 'plugin', - 'marketplace', - 'add', - resolvedMarketplaceSource, - '--scope', - 'project', - ]; - if (marketplaceName) { - addMarketplaceArgs.push('--name', marketplaceName); + 'workspace', + 'init', + options.workspacePath, + '--from', + options.templatePath, + ]); + + if (options.marketplaceSource) { + const addMarketplaceArgs = [ + '--yes', + 'allagents', + 'plugin', + 'marketplace', + 'add', + options.marketplaceSource, + '--scope', + 'project', + ]; + if (options.marketplaceName) { + addMarketplaceArgs.push('--name', options.marketplaceName); + } + run(npx, addMarketplaceArgs, options.workspacePath); + run(npx, ['--yes', 'allagents', 'workspace', 'sync'], options.workspacePath); } - const addMarketplaceResult = spawnSync(npx, addMarketplaceArgs, { - stdio: ['ignore', 'inherit', 'inherit'], - shell: process.platform === 'win32', - cwd: workspace_path, - }); - if (addMarketplaceResult.status !== 0) { - process.exit(addMarketplaceResult.status ?? 
1); + const missing = options.requiredFiles.filter( + (file) => !existsSync(join(options.workspacePath, file)), + ); + if (missing.length > 0) { + throw new Error(`Required artifacts not found in workspace: ${missing.join(', ')}`); } +} - const syncResult = spawnSync(npx, ['--yes', 'allagents', 'workspace', 'sync'], { +/** + * @param {string} command + * @param {readonly string[]} args + * @param {string | undefined} cwd + */ +function run(command, args, cwd = undefined) { + const result = spawnSync(command, args, { stdio: ['ignore', 'inherit', 'inherit'], shell: process.platform === 'win32', - cwd: workspace_path, + ...(cwd ? { cwd } : {}), }); - if (syncResult.status !== 0) { - process.exit(syncResult.status ?? 1); + if (result.status !== 0) { + throw new Error(`${command} ${args.join(' ')} failed with exit ${result.status ?? 1}`); } } -// --- validate required artifacts exist in workspace --- -const missing = requiredFiles.filter((file) => !existsSync(join(workspace_path, file))); -if (missing.length > 0) { - console.error('Required artifacts not found in workspace:'); - for (const file of missing) { - console.error(` - ${file}`); +function runCli() { + const fromIndex = process.argv.indexOf('--from'); + if (fromIndex === -1 || !process.argv[fromIndex + 1]) { + throw new Error( + 'Usage: workspace-setup.mjs --from [--marketplace-source ] [--marketplace-name ] [--require ...]', + ); + } + + const { workspace_path } = JSON.parse(readFileSync(0, 'utf8')); + if (!workspace_path) { + throw new Error('workspace_path not provided on stdin'); } - process.exit(1); + + const requiredFiles = []; + for (let i = 0; i < process.argv.length; i++) { + if (process.argv[i] === '--require' && process.argv[i + 1]) { + requiredFiles.push(process.argv[i + 1]); + i++; + } + } + + const marketplaceSourceIndex = process.argv.indexOf('--marketplace-source'); + const marketplaceSource = + marketplaceSourceIndex !== -1 + ? 
resolve(process.cwd(), process.argv[marketplaceSourceIndex + 1]) + : undefined; + const marketplaceNameIndex = process.argv.indexOf('--marketplace-name'); + const marketplaceName = + marketplaceNameIndex !== -1 ? process.argv[marketplaceNameIndex + 1] : undefined; + + runAllagentsSetup({ + workspacePath: workspace_path, + templatePath: resolve(process.cwd(), process.argv[fromIndex + 1]), + ...(marketplaceSource ? { marketplaceSource } : {}), + ...(marketplaceName ? { marketplaceName } : {}), + requiredFiles, + }); } -process.exit(0); +if (process.argv[1] && fileURLToPath(import.meta.url) === resolve(process.argv[1])) { + try { + runCli(); + } catch (error) { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + } +} diff --git a/packages/core/src/evaluation/extensions/runner.ts b/packages/core/src/evaluation/extensions/runner.ts new file mode 100644 index 000000000..e7ba6aed0 --- /dev/null +++ b/packages/core/src/evaluation/extensions/runner.ts @@ -0,0 +1,290 @@ +import { type Stats, existsSync } from 'node:fs'; +import { cp, mkdir, stat } from 'node:fs/promises'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +import type { + AgentRulesExtensionConfig, + AgentRulesPaths, + AgentVExtensionConfig, + EvalTest, + ExtensionLifecycleHook, + JsonObject, +} from '../types.js'; + +export interface ExtensionHookContext { + readonly hook_name: ExtensionLifecycleHook; + readonly workspace_path?: string; + readonly test_id: string; + readonly eval_run_id?: string; + readonly eval_dir: string; + readonly case_input?: string; + readonly case_metadata?: Record; + readonly workspace_file_dir?: string; + readonly provider_context?: JsonObject; + readonly agent_rules_paths?: AgentRulesPaths; +} + +export interface ExtensionRuntimeState { + readonly providerContext?: JsonObject; + readonly metadata?: Record; + readonly output?: string; + readonly agentRulesPaths?: AgentRulesPaths; +} + +type ExtensionReturn = { + 
readonly provider_context?: JsonObject; + readonly metadata?: Record; + readonly output?: string; + readonly agent_rules_paths?: AgentRulesPaths; +}; + +export function mergeExtensionState( + left: ExtensionRuntimeState | undefined, + right: ExtensionRuntimeState | undefined, +): ExtensionRuntimeState | undefined { + if (!left) return right; + if (!right) return left; + + const agentRulesPaths = mergeAgentRulesPaths(left.agentRulesPaths, right.agentRulesPaths); + const providerContext = { + ...(left.providerContext ?? {}), + ...(right.providerContext ?? {}), + ...(agentRulesPaths ? { agent_rules_paths: agentRulesPaths } : {}), + }; + const metadata = { + ...(left.metadata ?? {}), + ...(right.metadata ?? {}), + ...(agentRulesPaths ? { agent_rules_paths: agentRulesPaths } : {}), + }; + const output = [left.output, right.output].filter(Boolean).join('\n') || undefined; + + return { + ...(Object.keys(providerContext).length > 0 ? { providerContext } : {}), + ...(Object.keys(metadata).length > 0 ? { metadata } : {}), + ...(output !== undefined ? { output } : {}), + ...(agentRulesPaths !== undefined ? { agentRulesPaths } : {}), + }; +} + +export async function runExtensionsForHook(options: { + readonly extensions: readonly AgentVExtensionConfig[] | undefined; + readonly hook: ExtensionLifecycleHook; + readonly context: ExtensionHookContext; + readonly state?: ExtensionRuntimeState; +}): Promise { + const matching = (options.extensions ?? []).filter( + (extension) => extension.hook === options.hook, + ); + if (matching.length === 0) { + return options.state; + } + + let state = options.state; + for (const extension of matching) { + const context = buildContextWithState(options.context, state); + const next = isAgentRulesExtension(extension) + ? 
await runAgentRulesExtension(extension, context) + : await runFileExtension(extension, context); + state = mergeExtensionState(state, next); + } + return state; +} + +function buildContextWithState( + context: ExtensionHookContext, + state: ExtensionRuntimeState | undefined, +): ExtensionHookContext { + return { + ...context, + ...(state?.providerContext !== undefined ? { provider_context: state.providerContext } : {}), + ...(state?.agentRulesPaths !== undefined ? { agent_rules_paths: state.agentRulesPaths } : {}), + }; +} + +function isAgentRulesExtension( + extension: AgentVExtensionConfig, +): extension is AgentRulesExtensionConfig { + return extension.id === 'agentv:agent-rules'; +} + +async function runFileExtension( + extension: Exclude, + context: ExtensionHookContext, +): Promise { + const moduleUrl = pathToFileURL(extension.path); + moduleUrl.search = `t=${Date.now()}-${Math.random().toString(36).slice(2)}`; + const imported = (await import(moduleUrl.href)) as Record; + const defaultExport = imported.default; + const maybeCommonJs = + defaultExport && typeof defaultExport === 'object' + ? (defaultExport as Record)[extension.functionName] + : undefined; + const hookFn = imported[extension.functionName] ?? 
maybeCommonJs; + if (typeof hookFn !== 'function') { + throw new Error(`Extension ${extension.id} does not export function ${extension.functionName}`); + } + + const result = (await hookFn(context, { hookName: extension.hook })) as unknown; + return normalizeExtensionReturn(result); +} + +async function runAgentRulesExtension( + extension: AgentRulesExtensionConfig, + context: ExtensionHookContext, +): Promise { + if (!context.workspace_path) { + throw new Error('agentv:agent-rules requires a materialized workspace'); + } + + const paths: AgentRulesPaths = { + skills: await stageConfiguredOrDiscover({ + kind: 'skills', + configured: extension.skills, + evalDir: context.eval_dir, + workspacePath: context.workspace_path, + discover: ['.claude/skills', '.agents/skills', '.codex/skills', '.pi/skills', 'skills'], + }), + hooks: await stageConfiguredOrDiscover({ + kind: 'hooks', + configured: extension.hooks, + evalDir: context.eval_dir, + workspacePath: context.workspace_path, + discover: ['.claude/hooks', '.agents/hooks', '.codex/hooks', '.pi/hooks', 'hooks'], + }), + agents: await stageConfiguredOrDiscover({ + kind: 'agents', + configured: extension.agents, + evalDir: context.eval_dir, + workspacePath: context.workspace_path, + discover: ['.agents/agents', '.codex/agents', 'agents'], + }), + rules: await stageConfiguredOrDiscover({ + kind: 'rules', + configured: extension.rules, + evalDir: context.eval_dir, + workspacePath: context.workspace_path, + discover: ['AGENTS.md', 'CLAUDE.md', 'rules'], + }), + }; + const compactPaths = compactAgentRulesPaths(paths); + if (!compactPaths) { + return undefined; + } + + return normalizeExtensionReturn({ + provider_context: { agent_rules_paths: compactPaths }, + metadata: { agent_rules_paths: compactPaths }, + agent_rules_paths: compactPaths, + }); +} + +async function stageConfiguredOrDiscover(options: { + readonly kind: keyof AgentRulesPaths; + readonly configured: readonly string[] | undefined; + readonly evalDir: string; + 
readonly workspacePath: string; + readonly discover: readonly string[]; +}): Promise { + if (!options.configured || options.configured.length === 0) { + const discovered = options.discover + .map((candidate) => path.resolve(options.workspacePath, candidate)) + .filter((candidate) => existsSync(candidate)); + return discovered.length > 0 ? discovered : undefined; + } + + const staged: string[] = []; + const stageRoot = path.join(options.workspacePath, '.agentv', 'agent-rules', options.kind); + await mkdir(stageRoot, { recursive: true }); + + for (const entry of options.configured) { + const sourcePath = path.isAbsolute(entry) ? entry : path.resolve(options.evalDir, entry); + let sourceStat: Stats; + try { + sourceStat = await stat(sourcePath); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`agentv:agent-rules ${options.kind} path not found: ${entry}: ${message}`); + } + + if (isInside(options.workspacePath, sourcePath)) { + staged.push(sourcePath); + continue; + } + + const destPath = path.join(stageRoot, path.basename(sourcePath)); + await cp(sourcePath, destPath, { + recursive: sourceStat.isDirectory(), + force: true, + }); + staged.push(destPath); + } + + return staged.length > 0 ? staged : undefined; +} + +function normalizeExtensionReturn(value: unknown): ExtensionRuntimeState | undefined { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return undefined; + } + const result = value as ExtensionReturn; + const agentRulesPaths = compactAgentRulesPaths(result.agent_rules_paths); + const providerContext = { + ...(result.provider_context ?? {}), + ...(agentRulesPaths ? { agent_rules_paths: agentRulesPaths } : {}), + }; + const metadata = { + ...(result.metadata ?? {}), + ...(agentRulesPaths ? { agent_rules_paths: agentRulesPaths } : {}), + }; + + return { + ...(Object.keys(providerContext).length > 0 ? { providerContext } : {}), + ...(Object.keys(metadata).length > 0 ? 
{ metadata } : {}), + ...(typeof result.output === 'string' ? { output: result.output } : {}), + ...(agentRulesPaths ? { agentRulesPaths } : {}), + }; +} + +function compactAgentRulesPaths(paths: AgentRulesPaths | undefined): AgentRulesPaths | undefined { + if (!paths) { + return undefined; + } + const compacted: AgentRulesPaths = { + ...(paths.skills && paths.skills.length > 0 ? { skills: [...paths.skills] } : {}), + ...(paths.hooks && paths.hooks.length > 0 ? { hooks: [...paths.hooks] } : {}), + ...(paths.agents && paths.agents.length > 0 ? { agents: [...paths.agents] } : {}), + ...(paths.rules && paths.rules.length > 0 ? { rules: [...paths.rules] } : {}), + }; + return Object.keys(compacted).length > 0 ? compacted : undefined; +} + +function mergeAgentRulesPaths( + left: AgentRulesPaths | undefined, + right: AgentRulesPaths | undefined, +): AgentRulesPaths | undefined { + if (!left) return compactAgentRulesPaths(right); + if (!right) return compactAgentRulesPaths(left); + + return compactAgentRulesPaths({ + skills: mergePathLists(left.skills, right.skills), + hooks: mergePathLists(left.hooks, right.hooks), + agents: mergePathLists(left.agents, right.agents), + rules: mergePathLists(left.rules, right.rules), + }); +} + +function mergePathLists( + left: readonly string[] | undefined, + right: readonly string[] | undefined, +): readonly string[] | undefined { + const merged = [...(left ?? []), ...(right ?? [])]; + return merged.length > 0 ? 
[...new Set(merged)] : undefined; +} + +function isInside(root: string, candidate: string): boolean { + const relative = path.relative(root, candidate); + return ( + relative === '' || (!!relative && !relative.startsWith('..') && !path.isAbsolute(relative)) + ); +} diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 93e15cab5..29518ee3a 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -4,6 +4,7 @@ import { fileURLToPath } from 'node:url'; import micromatch from 'micromatch'; import pLimit from 'p-limit'; +import { runExtensionsForHook } from './extensions/runner.js'; import { readJsonFile } from './file-utils.js'; import { type ChildGraderResult, @@ -145,6 +146,21 @@ function extractProviderRawLogPath(response: ProviderResponse): string | undefin return trimmed.length > 0 ? trimmed : undefined; } +function mergeMetadata( + base: Record | undefined, + overlay: JsonObject | Record | undefined, +): JsonObject | undefined { + const merged = { + ...(base ?? {}), + ...(overlay ?? {}), + } as JsonObject; + return Object.keys(merged).length > 0 ? merged : undefined; +} + +function mergeTextOutput(left: string | undefined, right: string | undefined): string | undefined { + return [left, right].filter(Boolean).join('\n') || undefined; +} + interface EvaluationRuntimeOptions { readonly target: ResolvedTarget; readonly targets?: readonly TargetDefinition[]; @@ -415,6 +431,8 @@ export interface RunEvalCaseOptions { readonly sharedWorkspacePath?: string; /** Pre-initialized baseline commit for shared workspace */ readonly sharedBaselineCommit?: string; + /** Provider/runtime context produced by shared beforeAll extensions. 
*/ + readonly sharedExtensionState?: import('./extensions/runner.js').ExtensionRuntimeState; /** Suite-level .code-workspace file (resolved from workspace.template) */ readonly suiteWorkspaceFile?: string; /** Real-time observability callbacks passed to the provider */ @@ -959,10 +977,14 @@ export async function runEvaluation( poolSlots, availablePoolSlots, poolSlotBaselines, + poolSlotExtensionStates, useStaticWorkspace, + extensionState: sharedExtensionState, } = sharedSetup; const targetHooks = options.targetHooks; const suiteHooksEnabled = hooksEnabled(suiteWorkspace); + const suiteExtensions = + filteredEvalCases.find((evalCase) => evalCase.extensions?.length)?.extensions ?? []; try { // Track worker assignments for progress reporting @@ -1216,6 +1238,11 @@ export async function runEvaluation( ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit : undefined; + const testExtensionState = usesSharedWorkspace + ? testPoolSlot + ? poolSlotExtensionStates.get(testPoolSlot.path) + : sharedExtensionState + : undefined; try { const graderProvider = await resolveGraderProvider(target); @@ -1247,6 +1274,7 @@ export async function runEvaluation( verbose, threshold: scoreThreshold, targetHooks: options.targetHooks, + sharedExtensionState: testExtensionState, replayRecording, evalFilePath, repoRoot: repoRootPath, @@ -1457,6 +1485,35 @@ export async function runEvaluation( } const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all; + if (afterAllWorkspaces.length > 0 && suiteExtensions.length > 0) { + for (const wsPath of afterAllWorkspaces) { + try { + const afterAllState = await runExtensionsForHook({ + extensions: suiteExtensions, + hook: 'afterAll', + context: { + hook_name: 'afterAll', + workspace_path: wsPath, + test_id: '__after_all__', + eval_run_id: evalRunId, + eval_dir: evalDir, + }, + state: poolSlotExtensionStates.get(wsPath) ?? 
sharedExtensionState, + }); + if (afterAllState?.output && results.length > 0 && wsPath === afterAllWorkspaces[0]) { + results[results.length - 1] = { + ...results[results.length - 1], + afterAllOutput: mergeTextOutput( + results[results.length - 1].afterAllOutput, + afterAllState.output, + ), + }; + } + } catch { + // afterAll extension failures are non-fatal, matching teardown hooks. + } + } + } if (afterAllWorkspaces.length > 0 && suiteHooksEnabled && hasHookCommand(suiteAfterAllHook)) { const afterAllHook = suiteAfterAllHook; for (const wsPath of afterAllWorkspaces) { @@ -1792,6 +1849,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { + // Execute target after_each hook before workspace after_each/reset. + const targetAfterEachHook = options.targetHooks?.after_each; + if (workspacePath && hasHookCommand(targetAfterEachHook)) { + const scriptContext: ScriptExecutionContext = { + workspacePath, + testId: evalCase.id, + evalRunId: evalRunId ?? '', + caseInput: evalCase.question, + caseMetadata: evalCase.metadata, + evalDir, + workspaceFileDir: evalCase.workspace?.workspaceFileDir, + }; + try { + await executeWorkspaceScript( + toScriptConfig(targetAfterEachHook, 'after_each', `target hook for '${evalCase.id}'`), + scriptContext, + 'warn', + ); + } catch { + // target after_each failures are non-fatal + } + } + + if (workspacePath && evalCase.extensions && evalCase.extensions.length > 0) { + try { + const afterEachState = await runExtensionsForHook({ + extensions: evalCase.extensions, + hook: 'afterEach', + context: { + hook_name: 'afterEach', + workspace_path: workspacePath, + test_id: evalCase.id, + eval_run_id: evalRunId ?? '', + case_input: evalCase.question, + case_metadata: evalCase.metadata, + eval_dir: evalDir ?? 
process.cwd(), + workspace_file_dir: evalCase.workspace?.workspaceFileDir, + }, + state: extensionState, + }); + afterEachOutput = mergeTextOutput(afterEachOutput, afterEachState?.output); + } catch { + // afterEach extension failures are non-fatal, matching teardown hooks. + } + } + + // Reset workspace state before after_each hook (if configured), but only + // after graders have inspected the agent-modified workspace. + if ( + caseHooksEnabled && + workspacePath && + evalCase.workspace?.hooks?.after_each?.reset && + evalCase.workspace.hooks.after_each.reset !== 'none' + ) { + try { + if (repoManager && evalCase.workspace.repos?.length) { + await repoManager.reset( + evalCase.workspace.repos, + workspacePath, + evalCase.workspace.hooks.after_each.reset, + ); + } else { + await resetWorkspaceRoot( + workspacePath, + evalCase.workspace.hooks.after_each.reset, + baselineCommit, + ); + } + } catch { + // Reset failures are non-fatal (like after_each) + } + } + + // Execute after_each hook (runs after grading, before cleanup) + const caseAfterEachHook = evalCase.workspace?.hooks?.after_each; + if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) { + const afterEachHook = caseAfterEachHook; + const scriptContext: ScriptExecutionContext = { + workspacePath, + testId: evalCase.id, + evalRunId: evalRunId ?? 
'', + caseInput: evalCase.question, + caseMetadata: evalCase.metadata, + evalDir, + workspaceFileDir: evalCase.workspace?.workspaceFileDir, + }; + try { + afterEachOutput = await executeWorkspaceScript( + toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`), + scriptContext, + 'warn', + ); + } catch { + // after_each failures are non-fatal + } + } + }; // Conversation mode: turn-by-turn evaluation if (evalCase.mode === 'conversation' && evalCase.turns?.length) { - const conversationResult = await runConversationMode({ + let conversationResult = await runConversationMode({ evalCase, provider, target, @@ -1879,7 +2041,16 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { - // Execute target after_each hook before workspace after_each/reset. - const targetAfterEachHook = options.targetHooks?.after_each; - if (workspacePath && hasHookCommand(targetAfterEachHook)) { - const scriptContext: ScriptExecutionContext = { - workspacePath, - testId: evalCase.id, - evalRunId: evalRunId ?? '', - caseInput: evalCase.question, - caseMetadata: evalCase.metadata, - evalDir, - workspaceFileDir: evalCase.workspace?.workspaceFileDir, - }; - try { - await executeWorkspaceScript( - toScriptConfig(targetAfterEachHook, 'after_each', `target hook for '${evalCase.id}'`), - scriptContext, - 'warn', - ); - } catch { - // target after_each failures are non-fatal - } - } - - // Reset workspace state before after_each hook (if configured), but only - // after graders have inspected the agent-modified workspace. 
- if ( - caseHooksEnabled && - workspacePath && - evalCase.workspace?.hooks?.after_each?.reset && - evalCase.workspace.hooks.after_each.reset !== 'none' - ) { - try { - if (repoManager && evalCase.workspace.repos?.length) { - await repoManager.reset( - evalCase.workspace.repos, - workspacePath, - evalCase.workspace.hooks.after_each.reset, - ); - } else { - await resetWorkspaceRoot( - workspacePath, - evalCase.workspace.hooks.after_each.reset, - baselineCommit, - ); - } - } catch { - // Reset failures are non-fatal (like after_each) - } - } - - // Execute after_each hook (runs after grading, before cleanup) - const caseAfterEachHook = evalCase.workspace?.hooks?.after_each; - if (workspacePath && caseHooksEnabled && hasHookCommand(caseAfterEachHook)) { - const afterEachHook = caseAfterEachHook; - const scriptContext: ScriptExecutionContext = { - workspacePath, - testId: evalCase.id, - evalRunId: evalRunId ?? '', - caseInput: evalCase.question, - caseMetadata: evalCase.metadata, - evalDir, - workspaceFileDir: evalCase.workspace?.workspaceFileDir, - }; - try { - afterEachOutput = await executeWorkspaceScript( - toScriptConfig(afterEachHook, 'after_each', `test '${evalCase.id}'`), - scriptContext, - 'warn', - ); - } catch { - // after_each failures are non-fatal - } - } - }; - try { const result = await evaluateCandidate({ evalCase, @@ -2215,6 +2312,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Provider | undefined; readonly availableTargets?: readonly string[]; readonly evalFilePath?: string; + readonly metadata?: JsonObject; }): Promise { const { evalCase, @@ -3055,6 +3156,7 @@ async function runConversationMode(options: { targetResolver, availableTargets, evalFilePath, + metadata, } = options; // biome-ignore lint/style/noNonNullAssertion: turns is guaranteed by the caller (conversation mode gate) @@ -3114,6 +3216,7 @@ async function runConversationMode(options: { cwd: workspacePath, workspaceFile: caseWorkspaceFile, streamCallbacks, + 
metadata, }); } catch (error) { const message = error instanceof Error ? error.message : String(error); @@ -3414,6 +3517,7 @@ async function invokeProvider( readonly workspaceFile?: string; /** When true, AgentV captures file changes — provider should skip forced diff prompt */ readonly captureFileChanges?: boolean; + readonly metadata?: JsonObject; /** Real-time observability callbacks */ readonly streamCallbacks?: ProviderStreamCallbacks; }, @@ -3428,6 +3532,7 @@ async function invokeProvider( cwd, workspaceFile, captureFileChanges, + metadata, streamCallbacks, } = options; @@ -3455,6 +3560,7 @@ async function invokeProvider( cwd, workspaceFile, captureFileChanges, + metadata, streamCallbacks, braintrustSpanIds: braintrustSpanIds ?? undefined, }); diff --git a/packages/core/src/evaluation/prepared-workspace.ts b/packages/core/src/evaluation/prepared-workspace.ts index 15994a146..df4e67569 100644 --- a/packages/core/src/evaluation/prepared-workspace.ts +++ b/packages/core/src/evaluation/prepared-workspace.ts @@ -18,7 +18,7 @@ import micromatch from 'micromatch'; import type { ResolvedTarget } from './providers/targets.js'; import type { ChatPrompt } from './providers/types.js'; import { AGENT_PROVIDER_KINDS } from './providers/types.js'; -import type { EvalTest, RepoConfig, TargetHooksConfig } from './types.js'; +import type { EvalTest, JsonObject, RepoConfig, TargetHooksConfig } from './types.js'; import { type SharedWorkspaceSetup, type WorkspaceSetupCleanPolicy, @@ -100,6 +100,8 @@ export interface PreparedEvalWorkspace { readonly workspaceFile?: string; readonly createdAt: string; readonly hookExecutions: readonly WorkspaceSetupHookExecution[]; + readonly providerContext?: JsonObject; + readonly metadata?: Record; readonly repoPins: readonly PreparedWorkspaceRepoPin[]; readonly baseline: PreparedWorkspaceBaseline; readonly promptSource: PreparedWorkspacePromptSource; @@ -244,6 +246,7 @@ export async function prepareEvalWorkspace( evalDir, cleanupWorkspaces: 
options.cleanupWorkspaces, targetHooks: options.targetHooks, + sharedExtensionState: sharedSetup.extensionState, setupDebug: options.verbose, }); @@ -267,6 +270,12 @@ export async function prepareEvalWorkspace( }), createdAt: (options.now ?? (() => new Date()))().toISOString(), hookExecutions: [...sharedSetup.hookExecutions, ...caseSetup.hookExecutions], + ...(caseSetup.extensionState?.providerContext !== undefined && { + providerContext: caseSetup.extensionState.providerContext, + }), + ...(caseSetup.extensionState?.metadata !== undefined && { + metadata: caseSetup.extensionState.metadata, + }), repoPins: toRepoPins(evalCase.workspace?.repos), baseline: caseSetup.baselineCommit ? { status: 'initialized', commit: caseSetup.baselineCommit } diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index bdb4b8aac..8d4ae944f 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -292,6 +292,29 @@ export type TargetHooksConfig = { readonly after_all?: WorkspaceHookConfig; }; +export type ExtensionLifecycleHook = 'beforeAll' | 'beforeEach' | 'afterEach' | 'afterAll'; + +export type AgentRulesPaths = { + readonly skills?: readonly string[]; + readonly hooks?: readonly string[]; + readonly agents?: readonly string[]; + readonly rules?: readonly string[]; +}; + +export type AgentRulesExtensionConfig = AgentRulesPaths & { + readonly id: 'agentv:agent-rules'; + readonly hook: ExtensionLifecycleHook; +}; + +export type FileExtensionConfig = { + readonly id: string; + readonly hook: ExtensionLifecycleHook; + readonly path: string; + readonly functionName: ExtensionLifecycleHook; +}; + +export type AgentVExtensionConfig = AgentRulesExtensionConfig | FileExtensionConfig; + /** * Extended target reference from eval file. * Allows eval files to define per-target hooks and delegation alongside target names. 
@@ -325,7 +348,7 @@ export type DockerWorkspaceConfig = { /** * Preflight environment requirements for the workspace. - * Checked once before before_all hooks run. Fails fast if anything is missing. + * Checked once before workspace setup hooks run. Fails fast if anything is missing. * * @example * ```yaml @@ -358,7 +381,7 @@ export type WorkspaceConfig = { * Used as default cwd for hook commands so that file-referenced templates resolve * relative paths from their own directory, not the eval file's directory. */ readonly workspaceFileDir?: string; - /** Preflight environment requirements. Checked before before_all hooks run. */ + /** Preflight environment requirements. Checked before workspace setup hooks run. */ readonly env?: WorkspaceEnvConfig; }; @@ -994,6 +1017,8 @@ export interface EvalTest { readonly assertions?: readonly GraderConfig[]; /** Suite-level preprocessors used by the implicit default llm-grader. */ readonly preprocessors?: readonly ContentPreprocessorConfig[]; + /** Promptfoo-style lifecycle extensions inherited from the suite. 
*/ + readonly extensions?: readonly AgentVExtensionConfig[]; /** Workspace configuration (merged from suite-level and case-level) */ readonly workspace?: WorkspaceConfig; /** Arbitrary metadata passed to workspace scripts via stdin */ diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index a289a3f2a..135fc8912 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -325,6 +325,68 @@ const WorkspaceScriptSchema = z }) .strict(); +const ExtensionHookSchema = z.enum(['beforeAll', 'beforeEach', 'afterEach', 'afterAll']); + +const FileExtensionSchema = z + .string() + .min(1) + .refine((value) => value.startsWith('file://'), { + message: 'file extensions must start with file://', + }) + .refine( + (value) => { + const lastColon = value.lastIndexOf(':'); + return ( + lastColon > 'file://'.length && + ExtensionHookSchema.safeParse(value.slice(lastColon + 1)).success + ); + }, + { + message: 'file extensions must be of the form file://path/to/hook.ts:beforeAll', + }, + ); + +const AgentRulesStringExtensionSchema = z.union([ + z.literal('agentv:agent-rules'), + z + .string() + .startsWith('agentv:agent-rules:') + .refine( + (value) => ExtensionHookSchema.safeParse(value.slice('agentv:agent-rules:'.length)).success, + { + message: 'agentv:agent-rules hook must be beforeAll, beforeEach, afterEach, or afterAll', + }, + ), +]); + +const AgentRulesPathListSchema = z.union([z.string().min(1), z.array(z.string().min(1))]); + +const AgentRulesObjectExtensionSchema = z + .object({ + id: z.literal('agentv:agent-rules'), + hook: ExtensionHookSchema.optional(), + skills: AgentRulesPathListSchema.optional(), + hooks: AgentRulesPathListSchema.optional(), + agents: AgentRulesPathListSchema.optional(), + rules: AgentRulesPathListSchema.optional(), + config: z + .object({ + skills: AgentRulesPathListSchema.optional(), + 
hooks: AgentRulesPathListSchema.optional(), + agents: AgentRulesPathListSchema.optional(), + rules: AgentRulesPathListSchema.optional(), + }) + .strict() + .optional(), + }) + .strict(); + +const ExtensionSchema = z.union([ + FileExtensionSchema, + AgentRulesStringExtensionSchema, + AgentRulesObjectExtensionSchema, +]); + // --------------------------------------------------------------------------- // Repo lifecycle // --------------------------------------------------------------------------- @@ -686,8 +748,8 @@ export const EvalFileSchema: z.ZodType = z output_path: z.union([z.string().min(1), z.array(z.string().min(1))]).optional(), env: z.record(z.string()).optional(), nunjucks_filters: z.union([JsonObjectSchema, z.array(z.string().min(1))]).optional(), - extensions: z.array(z.union([z.string().min(1), JsonObjectSchema])).optional(), - on_run_complete: z.union([z.string().min(1), z.array(z.string().min(1))]).optional(), + extensions: z.array(ExtensionSchema).optional(), + on_run_complete: z.never().optional(), policy: z.never().optional(), execution: z.never().optional(), // Suite-level assertions diff --git a/packages/core/src/evaluation/workspace/setup.ts b/packages/core/src/evaluation/workspace/setup.ts index df0004df2..b099f987e 100644 --- a/packages/core/src/evaluation/workspace/setup.ts +++ b/packages/core/src/evaluation/workspace/setup.ts @@ -18,8 +18,11 @@ import path from 'node:path'; import { promisify } from 'node:util'; import { getWorkspacePoolRoot } from '../../paths.js'; +import { type ExtensionRuntimeState, runExtensionsForHook } from '../extensions/runner.js'; import type { + AgentVExtensionConfig, EvalTest, + ExtensionLifecycleHook, FailureStage, TargetHooksConfig, WorkspaceConfig, @@ -117,9 +120,11 @@ export interface SharedWorkspaceSetup { readonly poolSlots: readonly PoolSlot[]; readonly availablePoolSlots: PoolSlot[]; readonly poolSlotBaselines: ReadonlyMap; + readonly poolSlotExtensionStates: ReadonlyMap; readonly useStaticWorkspace: 
boolean; readonly configuredMode: WorkspaceSetupMode; readonly hookExecutions: readonly WorkspaceSetupHookExecution[]; + readonly extensionState?: ExtensionRuntimeState; } export interface EvalCaseWorkspaceSetupOptions { @@ -134,6 +139,7 @@ export interface EvalCaseWorkspaceSetupOptions { readonly cleanupWorkspaces?: boolean; readonly targetHooks?: TargetHooksConfig; readonly setupDebug?: boolean; + readonly sharedExtensionState?: ExtensionRuntimeState; } export interface EvalCaseWorkspaceSetup { @@ -144,6 +150,7 @@ export interface EvalCaseWorkspaceSetup { readonly baselineCommit?: string; readonly isSharedWorkspace: boolean; readonly hookExecutions: readonly WorkspaceSetupHookExecution[]; + readonly extensionState?: ExtensionRuntimeState; } export function toScriptConfig( @@ -299,6 +306,28 @@ function selectSuiteWorkspace(evalCases: readonly EvalTest[]): SelectedSharedWor ); } +function selectSuiteExtensions(evalCases: readonly EvalTest[]): readonly AgentVExtensionConfig[] { + const candidates = new Map(); + for (const evalCase of evalCases) { + const extensions = evalCase.extensions ?? []; + if (extensions.length === 0 || isPerCaseIsolation(evalCase.workspace)) { + continue; + } + candidates.set(stableWorkspaceValue(extensions), extensions); + } + + if (candidates.size > 1) { + throw new WorkspaceSetupError( + 'Wrapper eval contains multiple shared extension sets. Split the suites or use isolation: per_case when lifecycle extensions differ.', + { + failureStage: 'setup', + failureReasonCode: 'ambiguous_shared_extensions', + }, + ); + } + return [...candidates.values()][0] ?? 
[]; +} + function workspaceGitEnv(): Record { const env = { ...process.env }; for (const key of Object.keys(env)) { @@ -340,6 +369,17 @@ function commandForHook(hook: WorkspaceHookConfig | undefined): readonly string[ return hook?.command; } +function mergeHookOutput(left: string | undefined, right: string | undefined): string | undefined { + return [left, right].filter(Boolean).join('\n') || undefined; +} + +function hasExtensionHook( + extensions: readonly AgentVExtensionConfig[] | undefined, + hook: ExtensionLifecycleHook, +): boolean { + return (extensions ?? []).some((extension) => extension.hook === hook); +} + function hookExecution(options: { readonly scope: WorkspaceSetupHookScope; readonly name: WorkspaceSetupHookName; @@ -404,6 +444,7 @@ export async function prepareSharedWorkspaceSetup( } = options; const selectedSuiteWorkspace = selectSuiteWorkspace(evalCases); const suiteWorkspace = selectedSuiteWorkspace?.workspace; + const suiteExtensions = selectSuiteExtensions(evalCases); const rawTemplate = suiteWorkspace?.template; const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate); const workspaceTemplate = resolvedTemplate?.dir; @@ -447,7 +488,8 @@ export async function prepareSharedWorkspaceSetup( const hasSharedWorkspace = !!( useStaticWorkspace || (!isPerCaseWorkspace && - (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) + (workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) || + suiteExtensions.length > 0 ); const poolEnabled = configuredMode === 'pooled'; @@ -479,7 +521,9 @@ export async function prepareSharedWorkspaceSetup( const poolSlots: PoolSlot[] = []; const availablePoolSlots: PoolSlot[] = []; const poolSlotBaselines = new Map(); + const poolSlotExtensionStates = new Map(); const hookExecutions: WorkspaceSetupHookExecution[] = []; + let extensionState: ExtensionRuntimeState | undefined; const poolMaxSlots = Math.min(configPoolMaxSlots ?? 
10, 50); let repoManager: RepoManager | undefined; @@ -532,7 +576,10 @@ export async function prepareSharedWorkspaceSetup( cause: error, }); } - } else if (!isPerCaseWorkspace && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) { + } else if ( + !isPerCaseWorkspace && + (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length || suiteExtensions.length > 0) + ) { sharedWorkspacePath = getWorkspacePath(evalRunId, 'shared'); await mkdir(sharedWorkspacePath, { recursive: true }); setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`); @@ -603,6 +650,67 @@ export async function prepareSharedWorkspaceSetup( const suiteHooksEnabled = hooksEnabled(suiteWorkspace); const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all; + if (sharedWorkspacePath && suiteExtensions.length > 0) { + try { + extensionState = await runExtensionsForHook({ + extensions: suiteExtensions, + hook: 'beforeAll', + context: { + hook_name: 'beforeAll', + workspace_path: sharedWorkspacePath, + test_id: '__before_all__', + eval_run_id: evalRunId, + eval_dir: evalDir, + }, + state: extensionState, + }); + beforeAllOutput = mergeHookOutput(beforeAllOutput, extensionState?.output); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + if (sharedWorkspacePath && !useStaticWorkspace) { + await cleanupWorkspace(sharedWorkspacePath).catch(() => {}); + } + throw new WorkspaceSetupError(`beforeAll extension failed: ${message}`, { + failureStage: 'setup', + failureReasonCode: 'extension_error', + hookExecutions, + cause: error, + }); + } + } + if (availablePoolSlots.length > 0 && suiteExtensions.length > 0) { + for (const slot of availablePoolSlots) { + setupLog(`running beforeAll extensions on pool slot ${slot.index}`); + try { + const slotExtensionState = await runExtensionsForHook({ + extensions: suiteExtensions, + hook: 'beforeAll', + context: { + hook_name: 'beforeAll', + workspace_path: slot.path, + test_id: '__before_all__', + eval_run_id: evalRunId, + eval_dir: evalDir, + }, + }); + if (slotExtensionState) { + poolSlotExtensionStates.set(slot.path, slotExtensionState); + } + beforeAllOutput = mergeHookOutput(beforeAllOutput, slotExtensionState?.output); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new WorkspaceSetupError( + `beforeAll extension failed on pool slot ${slot.index}: ${message}`, + { + failureStage: 'setup', + failureReasonCode: 'extension_error', + hookExecutions, + cause: error, + }, + ); + } + } + } if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) { const beforeAllHook = suiteBeforeAllHook; const beforeAllCommand = (beforeAllHook.command ?? 
[]).join(' '); @@ -857,9 +965,11 @@ export async function prepareSharedWorkspaceSetup( poolSlots, availablePoolSlots, poolSlotBaselines, + poolSlotExtensionStates, useStaticWorkspace, configuredMode, hookExecutions, + ...(extensionState !== undefined && { extensionState }), }; } catch (error) { await releasePoolSlots({ poolManager, poolSlot, poolSlots }).catch(() => {}); @@ -881,6 +991,7 @@ export async function prepareEvalCaseWorkspace( cleanupWorkspaces: forceCleanup, targetHooks, setupDebug, + sharedExtensionState, } = options; let workspacePath: string | undefined = isPerCaseIsolation(evalCase.workspace) @@ -893,6 +1004,7 @@ export async function prepareEvalCaseWorkspace( let caseWorkspaceFile: string | undefined; const caseHooksEnabled = hooksEnabled(evalCase.workspace); const hookExecutions: WorkspaceSetupHookExecution[] = []; + let extensionState = sharedExtensionState; if (!workspacePath) { const rawCaseTemplate = evalCase.workspace?.template; @@ -925,7 +1037,9 @@ export async function prepareEvalCaseWorkspace( if ( !workspacePath && - (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && + (evalCase.workspace?.hooks || + evalCase.workspace?.repos?.length || + evalCase.extensions?.length) && evalRunId ) { workspacePath = getWorkspacePath(evalRunId, evalCase.id); @@ -1025,6 +1139,38 @@ export async function prepareEvalCaseWorkspace( } } + if (workspacePath && evalCase.extensions && evalCase.extensions.length > 0) { + try { + extensionState = await runExtensionsForHook({ + extensions: evalCase.extensions, + hook: 'beforeAll', + context: { + hook_name: 'beforeAll', + workspace_path: workspacePath, + test_id: evalCase.id, + eval_run_id: evalRunId ?? '', + case_input: evalCase.question, + case_metadata: evalCase.metadata, + eval_dir: evalDir ?? 
process.cwd(), + workspace_file_dir: evalCase.workspace?.workspaceFileDir, + }, + state: extensionState, + }); + beforeAllOutput = mergeHookOutput(beforeAllOutput, extensionState?.output); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (forceCleanup && workspacePath) { + await cleanupWorkspace(workspacePath).catch(() => {}); + } + throw new WorkspaceSetupError(`beforeAll extension failed: ${message}`, { + failureStage: 'setup', + failureReasonCode: 'extension_error', + hookExecutions, + cause: error, + }); + } + } + const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all; if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) { const beforeAllHook = caseBeforeAllHook; @@ -1122,6 +1268,40 @@ export async function prepareEvalCaseWorkspace( } const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each; + if (workspacePath && evalCase.extensions && evalCase.extensions.length > 0) { + try { + beforeEachNeedsFreshBaseline = hasExtensionHook(evalCase.extensions, 'beforeEach'); + const nextState = await runExtensionsForHook({ + extensions: evalCase.extensions, + hook: 'beforeEach', + context: { + hook_name: 'beforeEach', + workspace_path: workspacePath, + test_id: evalCase.id, + eval_run_id: evalRunId ?? '', + case_input: evalCase.question, + case_metadata: evalCase.metadata, + eval_dir: evalDir ?? process.cwd(), + workspace_file_dir: evalCase.workspace?.workspaceFileDir, + }, + state: extensionState, + }); + if (nextState !== extensionState) { + beforeEachNeedsFreshBaseline = true; + } + extensionState = nextState; + beforeEachOutput = mergeHookOutput(beforeEachOutput, extensionState?.output); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + throw new WorkspaceSetupError(`beforeEach extension failed: ${message}`, { + failureStage: 'setup', + failureReasonCode: 'extension_error', + hookExecutions, + cause: error, + }); + } + } + if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) { const beforeEachHook = caseBeforeEachHook; const scriptContext: ScriptExecutionContext = { @@ -1243,6 +1423,7 @@ export async function prepareEvalCaseWorkspace( ...(baselineCommit !== undefined && { baselineCommit }), isSharedWorkspace, hookExecutions, + ...(extensionState !== undefined && { extensionState }), }; } diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 890b28c69..97036af04 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -56,6 +56,9 @@ import { import { parseMetadata } from './metadata.js'; import type { TargetDefinition } from './providers/types.js'; import type { + AgentRulesExtensionConfig, + AgentRulesPaths, + AgentVExtensionConfig, ConversationAggregation, ConversationMode, ConversationTurn, @@ -65,6 +68,7 @@ import type { EvalSourceReference, EvalTest, EvalTestSource, + ExtensionLifecycleHook, GraderConfig, JsonObject, JsonValue, @@ -199,6 +203,8 @@ type RawTestSuite = JsonObject & { readonly workspace?: JsonValue; readonly assertions?: JsonValue; readonly preprocessors?: JsonValue; + readonly extensions?: JsonValue; + readonly on_run_complete?: JsonValue; readonly nunjucks_filters?: JsonValue; readonly input?: JsonValue; readonly metadata?: JsonValue; @@ -614,6 +620,7 @@ async function loadTestsFromParsedYamlValue( // Top-level `metadata:` is inherited by cases. Suite identity tags are parsed // separately by parseMetadata() and are not case tags. const suiteMetadataPayload = extractSuiteMetadataPayload(suite); + const evalFileDir = path.dirname(absoluteTestPath); const globalEvaluator = coerceEvaluator(suite.evaluator, 'global') ?? 
'llm-grader'; const suitePreprocessors = await parsePreprocessors( @@ -622,9 +629,9 @@ async function loadTestsFromParsedYamlValue( '', absoluteTestPath, ); + const suiteExtensions = parseExtensions(suite.extensions, evalFileDir); const importedSuiteTests: EvalTest[] = []; - const evalFileDir = path.dirname(absoluteTestPath); const nunjucksFilters = await loadNunjucksFilters(suite.nunjucks_filters, evalFileDir); const parentWorkspace = parentWorkspaceLocation(suite); const importEntries = readImports(suite.imports); @@ -956,6 +963,7 @@ async function loadTestsFromParsedYamlValue( evaluator: testCaseEvaluatorKind, assertions: evaluators, ...(suitePreprocessors ? { preprocessors: suitePreprocessors } : {}), + ...(suiteExtensions.length > 0 ? { extensions: suiteExtensions } : {}), workspace: mergedWorkspace, metadata, ...(caseRun?.threshold !== undefined ? { threshold: caseRun.threshold } : {}), @@ -1626,6 +1634,11 @@ function readSuiteRuntimeBlock(suite: RawTestSuite, evalFilePath: string): JsonO `Invalid eval runtime config in ${evalFilePath}: top-level 'early_exit' has been removed. Use repeat.early_exit instead.`, ); } + if (suite.on_run_complete !== undefined) { + throw new Error( + `Invalid eval runtime config in ${evalFilePath}: top-level 'on_run_complete' has been removed. Use extensions with afterAll instead.`, + ); + } return undefined; } @@ -2105,6 +2118,106 @@ function parseWorkspaceHooksConfig( return Object.keys(hooks).length > 0 ? 
/** Names of the lifecycle hooks an extension entry may bind to. */
const EXTENSION_HOOKS = new Set<string>(['beforeAll', 'beforeEach', 'afterEach', 'afterAll']);

/**
 * Parse the suite-level `extensions` value into extension configs.
 *
 * @param raw - The raw `extensions` value from the eval file; `undefined` yields `[]`.
 * @param evalFileDir - Directory used to resolve relative `file://` hook paths.
 * @returns One parsed config per entry, in declaration order.
 * @throws Error when `raw` is defined but not an array, or when any entry is invalid.
 */
function parseExtensions(raw: unknown, evalFileDir: string): AgentVExtensionConfig[] {
  if (raw === undefined) return [];
  if (!Array.isArray(raw)) {
    throw new Error('extensions must be an array');
  }

  return raw.map((entry, index) => parseExtension(entry, index, evalFileDir));
}

/**
 * Parse a single `extensions[index]` entry.
 *
 * String entries are delegated to {@link parseExtensionString}. Object entries must carry
 * `id: agentv:agent-rules`; their `skills`/`hooks`/`agents`/`rules` path lists are read from
 * `config` when `config` is an object, otherwise from the entry itself.
 *
 * @throws Error when the entry is neither a string nor an object, has the wrong `id`,
 *   has an invalid `hook`, or has a malformed path list.
 */
function parseExtension(entry: unknown, index: number, evalFileDir: string): AgentVExtensionConfig {
  const label = `extensions[${index}]`;
  if (typeof entry === 'string') {
    return parseExtensionString(entry, label, evalFileDir);
  }
  if (!isJsonObject(entry)) {
    throw new Error(`${label} must be a string or object`);
  }

  const obj = entry as Record<string, unknown>;
  if (obj.id !== 'agentv:agent-rules') {
    throw new Error(`${label}.id must be agentv:agent-rules`);
  }

  // An absent (undefined/null) hook defaults to beforeAll. Any other value must be a valid
  // hook name; previously a non-string such as `hook: 3` silently fell back to beforeAll.
  const hook: ExtensionLifecycleHook =
    obj.hook === undefined || obj.hook === null
      ? 'beforeAll'
      : requireExtensionHook(obj.hook, `${label}.hook`);

  // Report errors against the location the value was actually read from.
  const usesConfig = isJsonObject(obj.config);
  const source = usesConfig ? (obj.config as Record<string, unknown>) : obj;
  const sourceLabel = usesConfig ? `${label}.config` : label;

  return {
    id: 'agentv:agent-rules',
    hook,
    ...(readPathList(source.skills, `${sourceLabel}.skills`) ?? {}),
    ...(readPathList(source.hooks, `${sourceLabel}.hooks`) ?? {}),
    ...(readPathList(source.agents, `${sourceLabel}.agents`) ?? {}),
    ...(readPathList(source.rules, `${sourceLabel}.rules`) ?? {}),
  };
}

/**
 * Parse the string shorthand for an extension.
 *
 * Accepted forms:
 * - `agentv:agent-rules` (hook defaults to `beforeAll`)
 * - `agentv:agent-rules:<hook>`
 * - `file://<path>:<hook>`, where a relative `<path>` is resolved against `evalFileDir`
 *
 * @param raw - The string entry.
 * @param label - Location prefix used in error messages (e.g. `extensions[0]`).
 * @param evalFileDir - Base directory for relative `file://` paths.
 * @throws Error when the string matches none of the accepted forms or names an unknown hook.
 */
function parseExtensionString(
  raw: string,
  label: string,
  evalFileDir: string,
): AgentVExtensionConfig {
  const agentRulesId = 'agentv:agent-rules';
  const agentRulesPrefix = `${agentRulesId}:`;
  const filePrefix = 'file://';

  if (raw === agentRulesId) {
    return { id: agentRulesId, hook: 'beforeAll' };
  }
  if (raw.startsWith(agentRulesPrefix)) {
    return { id: agentRulesId, hook: requireExtensionHook(raw.slice(agentRulesPrefix.length), label) };
  }
  if (!raw.startsWith(filePrefix)) {
    throw new Error(`${label} must start with file:// or agentv:agent-rules`);
  }

  // The hook name follows the LAST colon. Requiring that colon to sit strictly after the
  // `file://` prefix rejects both a missing hook suffix and an empty path, so the path
  // segment sliced below is always non-empty.
  const lastColon = raw.lastIndexOf(':');
  if (lastColon <= filePrefix.length) {
    throw new Error(`${label} must be of the form file://path/to/hook.ts:beforeAll`);
  }
  const hook = requireExtensionHook(raw.slice(lastColon + 1), label);
  const filePart = raw.slice(filePrefix.length, lastColon);
  const resolvedPath = path.isAbsolute(filePart) ? filePart : path.resolve(evalFileDir, filePart);
  return {
    id: raw,
    hook,
    path: resolvedPath,
    functionName: hook,
  };
}

/**
 * Validate a lifecycle hook name.
 *
 * @returns The hook when `raw` is a known hook name; `undefined` when `raw` is not a string.
 * @throws Error when `raw` is a string that is not a known hook name.
 */
function parseExtensionHook(raw: unknown, label: string): ExtensionLifecycleHook | undefined {
  if (typeof raw !== 'string') return undefined;
  if (!EXTENSION_HOOKS.has(raw)) {
    throw new Error(`${label} must be one of beforeAll, beforeEach, afterEach, afterAll`);
  }
  return raw as ExtensionLifecycleHook;
}

/** Like {@link parseExtensionHook}, but also rejects non-string values instead of returning `undefined`. */
function requireExtensionHook(raw: unknown, label: string): ExtensionLifecycleHook {
  const hook = parseExtensionHook(raw, label);
  if (hook === undefined) {
    throw new Error(`${label} must be one of beforeAll, beforeEach, afterEach, afterAll`);
  }
  return hook;
}

/**
 * Normalize a path-list value (a string or an array of strings) into a single-key object.
 *
 * The key is the last `.`-separated segment of `label` (e.g. `extensions[0].skills` → `skills`).
 *
 * @returns `undefined` when `raw` is `undefined` or the derived key is empty; otherwise
 *   `{ [key]: string[] }`.
 * @throws Error when `raw` is neither a string nor an array made up entirely of strings.
 */
function readPathList(raw: unknown, label: string): Partial<AgentRulesExtensionConfig> | undefined {
  if (raw === undefined) return undefined;
  const values: unknown = typeof raw === 'string' ? [raw] : raw;
  // Reject arrays containing non-string entries rather than silently dropping them.
  if (
    !Array.isArray(values) ||
    !values.every((entry): entry is string => typeof entry === 'string')
  ) {
    throw new Error(`${label} must be a string or string array`);
  }
  const key = label.slice(label.lastIndexOf('.') + 1);
  // Copy so the parsed config does not alias the raw input array.
  return key ? ({ [key]: [...values] } as Partial<AgentRulesExtensionConfig>) : undefined;
}
/**
 * Build a copy of the current environment with every `GIT_*` variable removed,
 * except `GIT_SSH_COMMAND`, which is kept. Variables whose value is `undefined` are skipped.
 */
function cleanGitEnv(): Record<string, string> {
  const sanitized: Record<string, string> = {};
  for (const name of Object.keys(process.env)) {
    const value = process.env[name];
    if (value === undefined) continue;
    const isStrippedGitVar = name.startsWith('GIT_') && name !== 'GIT_SSH_COMMAND';
    if (isStrippedGitVar) continue;
    sanitized[name] = value;
  }
  return sanitized;
}

/**
 * Initialise a git repository in `dir`, write `files` (relative path → content) into it,
 * commit everything as "initial", and return the resulting HEAD commit hash.
 */
function createTestRepo(dir: string, files: Record<string, string>): string {
  mkdirSync(dir, { recursive: true });

  const quietOptions = { cwd: dir, stdio: 'ignore' as const, env: cleanGitEnv() };
  const git = (args: string): void => {
    execSync(`git ${args}`, quietOptions);
  };

  git('init');
  git('config user.email "test@test.com"');
  git('config user.name "Test"');

  Object.entries(files).forEach(([relativeName, content]) => {
    const target = path.join(dir, relativeName);
    // Files may live in nested directories that do not exist yet.
    mkdirSync(path.dirname(target), { recursive: true });
    writeFileSync(target, content, 'utf8');
  });

  git('add -A && git commit -m "initial"');

  // This call needs its output, so it does not use the stdio-ignoring options above.
  const head = execSync('git rev-parse HEAD', { cwd: dir, env: cleanGitEnv() });
  return head.toString().trim();
}
'file://hooks.mjs:beforeAll', + hook: 'beforeAll', + path: path.join(dir, 'hooks.mjs'), + functionName: 'beforeAll', + }, + { + id: 'agentv:agent-rules', + hook: 'beforeEach', + skills: ['rules/skills'], + }, + ]); + }); + + it('runs lifecycle file hooks and exposes staged agent-rules paths to providers and results', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'agentv-extensions-run-')); + tempDirs.push(dir); + await mkdir(path.join(dir, 'template'), { recursive: true }); + await mkdir(path.join(dir, 'rules', 'skills', 'csv'), { recursive: true }); + await writeFile(path.join(dir, 'rules', 'skills', 'csv', 'SKILL.md'), '# CSV\n', 'utf8'); + await writeFile(path.join(dir, 'rules', 'AGENTS.md'), '# Rules\n', 'utf8'); + await writeFile( + path.join(dir, 'hooks.mjs'), + `import { appendFileSync } from 'node:fs'; +import path from 'node:path'; + +function log(context, name) { + appendFileSync(path.join(context.eval_dir, 'lifecycle.log'), name + ':' + Boolean(context.workspace_path) + '\\n'); +} + +export function beforeAll(context) { + log(context, 'beforeAll'); + return { provider_context: { custom_flag: 'beforeAll' }, output: 'beforeAll output' }; +} + +export function beforeEach(context) { + log(context, 'beforeEach'); + return { provider_context: { case_id: context.test_id }, output: 'beforeEach output' }; +} + +export function afterEach(context) { + log(context, 'afterEach'); + return { output: 'afterEach output' }; +} + +export function afterAll(context) { + log(context, 'afterAll'); + return { output: 'afterAll output' }; +} +`, + 'utf8', + ); + await writeFile( + path.join(dir, 'suite.eval.yaml'), + `extensions: + - file://hooks.mjs:beforeAll + - file://hooks.mjs:beforeEach + - file://hooks.mjs:afterEach + - file://hooks.mjs:afterAll + - id: agentv:agent-rules + hook: beforeAll + skills: rules/skills + rules: rules/AGENTS.md +workspace: + template: template +tests: + - id: one + input: hello + criteria: works +`, + 'utf8', + ); + const suite = 
await loadTestSuite(path.join(dir, 'suite.eval.yaml'), dir); + const provider = new CapturingProvider(); + + const results = await runEvaluation({ + testFilePath: path.join(dir, 'suite.eval.yaml'), + repoRoot: dir, + target, + providerFactory: () => provider, + evaluators: passEvaluators, + evalCases: suite.tests, + maxConcurrency: 1, + }); + + const log = await readFile(path.join(dir, 'lifecycle.log'), 'utf8'); + expect(log.trim().split('\n')).toEqual([ + 'beforeAll:true', + 'beforeEach:true', + 'afterEach:true', + 'afterAll:true', + ]); + expect(provider.lastRequest?.metadata?.custom_flag).toBe('beforeAll'); + expect(provider.lastRequest?.metadata?.case_id).toBe('one'); + const providerRules = provider.lastRequest?.metadata?.agent_rules_paths as { + skills?: string[]; + rules?: string[]; + }; + expect(providerRules.skills?.[0]).toContain(path.join('.agentv', 'agent-rules', 'skills')); + expect(providerRules.rules?.[0]).toContain(path.join('.agentv', 'agent-rules', 'rules')); + expect(results[0].metadata?.agent_rules_paths).toEqual( + provider.lastRequest?.metadata?.agent_rules_paths, + ); + expect(results[0].beforeAllOutput).toContain('beforeAll output'); + expect(results[0].beforeEachOutput).toContain('beforeEach output'); + expect(results[0].afterEachOutput).toContain('afterEach output'); + expect(results[0].afterAllOutput).toContain('afterAll output'); + }); + + it('runs afterEach extensions and preserves extension metadata for conversation cases', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'agentv-extensions-conversation-')); + tempDirs.push(dir); + await mkdir(path.join(dir, 'template'), { recursive: true }); + await mkdir(path.join(dir, 'rules', 'skills', 'chat'), { recursive: true }); + await writeFile(path.join(dir, 'rules', 'skills', 'chat', 'SKILL.md'), '# Chat\n', 'utf8'); + await writeFile( + path.join(dir, 'hooks.mjs'), + `import { appendFileSync } from 'node:fs'; +import path from 'node:path'; + +export function afterEach(context) 
{ + appendFileSync(path.join(context.eval_dir, 'conversation.log'), context.test_id + ':' + Boolean(context.agent_rules_paths?.skills?.length) + '\\n'); + return { output: 'conversation afterEach output' }; +} +`, + 'utf8', + ); + await writeFile( + path.join(dir, 'suite.eval.yaml'), + `extensions: + - id: agentv:agent-rules + hook: beforeAll + skills: rules/skills + - file://hooks.mjs:afterEach +workspace: + template: template +tests: + - id: conversation + mode: conversation + input: "You are concise" + turns: + - input: hello +`, + 'utf8', + ); + const suite = await loadTestSuite(path.join(dir, 'suite.eval.yaml'), dir); + const provider = new CapturingProvider(); + + const results = await runEvaluation({ + testFilePath: path.join(dir, 'suite.eval.yaml'), + repoRoot: dir, + target, + providerFactory: () => provider, + evaluators: passEvaluators, + evalCases: suite.tests, + maxConcurrency: 1, + }); + + expect((await readFile(path.join(dir, 'conversation.log'), 'utf8')).trim()).toBe( + 'conversation:true', + ); + expect(results[0].metadata?.agent_rules_paths).toEqual( + provider.lastRequest?.metadata?.agent_rules_paths, + ); + expect(results[0].afterEachOutput).toContain('conversation afterEach output'); + }); + + it('scopes pooled beforeAll extension state to the selected workspace slot', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'agentv-extensions-pool-')); + tempDirs.push(dir); + const previousDataDir = process.env.AGENTV_DATA_DIR; + process.env.AGENTV_DATA_DIR = path.join(dir, 'agentv-data'); + try { + const sourceRepo = path.join(dir, 'source-repo'); + const commit = createTestRepo(sourceRepo, { 'README.md': 'base\n' }); + await mkdir(path.join(dir, 'rules', 'skills', 'slot'), { recursive: true }); + await writeFile(path.join(dir, 'rules', 'skills', 'slot', 'SKILL.md'), '# Slot\n', 'utf8'); + await writeFile( + path.join(dir, 'suite.eval.yaml'), + `extensions: + - id: agentv:agent-rules + hook: beforeAll + skills: rules/skills +workspace: + 
repos: + - path: ./repo-a + repo: file://${sourceRepo} + commit: ${commit} +tests: + - id: one + input: one + criteria: works + - id: two + input: two + criteria: works +`, + 'utf8', + ); + const suite = await loadTestSuite(path.join(dir, 'suite.eval.yaml'), dir); + const requests: ProviderRequest[] = []; + const provider = new CapturingProvider((request) => { + requests.push(request); + }); + + await runEvaluation({ + testFilePath: path.join(dir, 'suite.eval.yaml'), + repoRoot: dir, + target, + providerFactory: () => provider, + evaluators: passEvaluators, + evalCases: suite.tests, + maxConcurrency: 2, + workspaceMode: 'pooled', + poolMaxSlots: 2, + }); + + expect(requests).toHaveLength(2); + const workspacePaths = new Set(requests.map((request) => request.cwd)); + expect(workspacePaths.size).toBe(2); + for (const request of requests) { + expect(request.cwd).toBeDefined(); + const rules = request.metadata?.agent_rules_paths as { skills?: string[] } | undefined; + expect(rules?.skills?.length).toBe(1); + expect(rules?.skills?.[0]).toContain( + path.join(request.cwd ?? 
'', '.agentv', 'agent-rules', 'skills'), + ); + } + } finally { + if (previousDataDir === undefined) { + process.env.AGENTV_DATA_DIR = undefined; + } else { + process.env.AGENTV_DATA_DIR = previousDataDir; + } + } + }, 30_000); + + it('refreshes the baseline after beforeEach extensions mutate files without state', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'agentv-extensions-baseline-')); + tempDirs.push(dir); + await mkdir(path.join(dir, 'template'), { recursive: true }); + await writeFile( + path.join(dir, 'hooks.mjs'), + `import { writeFileSync } from 'node:fs'; +import path from 'node:path'; + +export function beforeEach(context) { + writeFileSync(path.join(context.workspace_path, 'setup.txt'), 'setup from extension\\n'); +} +`, + 'utf8', + ); + await writeFile( + path.join(dir, 'suite.eval.yaml'), + `extensions: + - file://hooks.mjs:beforeEach +workspace: + template: template +tests: + - id: one + input: hello + criteria: works +`, + 'utf8', + ); + const suite = await loadTestSuite(path.join(dir, 'suite.eval.yaml'), dir); + const provider = new CapturingProvider((request) => { + if (!request.cwd) { + throw new Error('cwd was not provided'); + } + writeFileSync(path.join(request.cwd, 'agent.txt'), 'agent output\n', 'utf8'); + }); + + const results = await runEvaluation({ + testFilePath: path.join(dir, 'suite.eval.yaml'), + repoRoot: dir, + target, + providerFactory: () => provider, + evaluators: passEvaluators, + evalCases: suite.tests, + maxConcurrency: 1, + }); + + expect(results[0].fileChanges).toContain('agent.txt'); + expect(results[0].fileChanges).not.toContain('setup.txt'); + }); + + it('rejects removed on_run_complete in favor of afterAll extensions', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'agentv-extensions-removed-')); + tempDirs.push(dir); + await writeFile( + path.join(dir, 'suite.eval.yaml'), + `on_run_complete: ./done.sh +tests: + - id: one + input: hello + criteria: works +`, + 'utf8', + ); + + await 
expect(loadTestSuite(path.join(dir, 'suite.eval.yaml'), dir)).rejects.toThrow( + /on_run_complete.*extensions with afterAll/, + ); + }); +}); diff --git a/skills-data/agentv-eval-writer/references/eval.schema.json b/skills-data/agentv-eval-writer/references/eval.schema.json index d044864f7..f769ebb89 100644 --- a/skills-data/agentv-eval-writer/references/eval.schema.json +++ b/skills-data/agentv-eval-writer/references/eval.schema.json @@ -18673,28 +18673,164 @@ "type": "string", "minLength": 1 }, + { + "anyOf": [ + { + "type": "string", + "const": "agentv:agent-rules" + }, + { + "type": "string", + "pattern": "^agentv\\:agent\\-rules\\:" + } + ] + }, { "type": "object", - "properties": {}, - "additionalProperties": {} + "properties": { + "id": { + "type": "string", + "const": "agentv:agent-rules" + }, + "hook": { + "type": "string", + "enum": ["beforeAll", "beforeEach", "afterEach", "afterAll"] + }, + "skills": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "hooks": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "agents": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "rules": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "config": { + "type": "object", + "properties": { + "skills": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "hooks": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "agents": { + "anyOf": [ + { + "type": "string", + 
"minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + }, + "rules": { + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + ] + } + }, + "additionalProperties": false + } + }, + "required": ["id"], + "additionalProperties": false } ] } }, "on_run_complete": { - "anyOf": [ - { - "type": "string", - "minLength": 1 - }, - { - "type": "array", - "items": { - "type": "string", - "minLength": 1 - } - } - ] + "not": {} }, "policy": { "not": {}