Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/cli/src/commands/eval/commands/bundle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ function definitionsWithEvalTargetRefs(

const result = [...definitions];
for (const ref of targetRefs) {
if (ref.use_target && !result.some((definition) => definition.name === ref.name)) {
if (ref.definition && !result.some((definition) => definition.name === ref.name)) {
result.push(ref.definition);
} else if (ref.use_target && !result.some((definition) => definition.name === ref.name)) {
result.push({ name: ref.name, use_target: ref.use_target } as TargetDefinition);
}
}
Expand Down
4 changes: 3 additions & 1 deletion apps/cli/src/commands/eval/targets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,9 @@ export async function selectMultipleTargets(
const definitions = [...fileDefinitions];
if (targetRefs) {
for (const ref of targetRefs) {
if (ref.use_target && !fileDefinitions.some((d) => d.name === ref.name)) {
if (ref.definition && !fileDefinitions.some((d) => d.name === ref.name)) {
definitions.push(ref.definition);
} else if (ref.use_target && !fileDefinitions.some((d) => d.name === ref.name)) {
definitions.push({ name: ref.name, use_target: ref.use_target } as TargetDefinition);
}
}
Expand Down
63 changes: 61 additions & 2 deletions apps/cli/src/commands/eval/task-bundle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@ const SKIPPED_DIR_NAMES = new Set([
'.beads',
'.DS_Store',
]);
/**
 * Field names that `serializeTargetDefinition` copies to the top level of a
 * serialized target entry. Any other field of a definition is placed under
 * the entry's nested `config` object instead.
 */
const AUTHORING_TOP_LEVEL_TARGET_FIELDS = new Set<string>([
  'label', 'provider', 'prompts', 'transform',
  'delay', 'env', 'model',
  'use_target', 'fallback_targets', 'grader_target',
  'max_budget_usd', 'workers',
  'provider_batching', 'subagent_mode_allowed',
  'max_retries',
  'retry_initial_delay_ms', 'retry_max_delay_ms',
  'retry_backoff_factor', 'retry_status_codes',
]);

export interface TaskBundleTargetSelection {
readonly evalFileAbsolutePath?: string;
Expand Down Expand Up @@ -591,6 +612,44 @@ function uniqueTargetDefinitions(
return selected;
}

/**
 * Converts a target definition into the plain object written to a bundled
 * targets file.
 *
 * The definition's `name` is emitted as `label`, and `id` is carried over when
 * defined. Every other defined field is either kept at the top level (when it
 * is listed in AUTHORING_TOP_LEVEL_TARGET_FIELDS) or moved under `config`.
 * Entries of an existing record-valued `config` are merged last, so they
 * override same-named loose fields. `config` is omitted when it would be empty.
 *
 * @param definition - The target definition to serialize.
 * @returns A new plain object; the input definition is not mutated.
 */
function serializeTargetDefinition(definition: TargetDefinition): Record<string, unknown> {
  const serialized: Record<string, unknown> = { label: definition.name };
  if (definition.id !== undefined) {
    serialized.id = definition.id;
  }

  const nestedConfig: Record<string, unknown> = {};
  // These keys are handled explicitly (label/id above, config below).
  const handledSeparately = ['name', 'id', 'config'];

  Object.entries(definition).forEach(([field, fieldValue]) => {
    if (fieldValue === undefined || handledSeparately.includes(field)) {
      return;
    }
    const destination = AUTHORING_TOP_LEVEL_TARGET_FIELDS.has(field) ? serialized : nestedConfig;
    destination[field] = fieldValue;
  });

  // Explicit config entries are applied after the loose fields so they win on conflict.
  if (isRecord(definition.config)) {
    for (const [configKey, configValue] of Object.entries(definition.config)) {
      if (configValue === undefined) {
        continue;
      }
      nestedConfig[configKey] = configValue;
    }
  }

  if (Object.keys(nestedConfig).length === 0) {
    return serialized;
  }
  serialized.config = nestedConfig;
  return serialized;
}

/**
 * Serializes each target definition with `serializeTargetDefinition`,
 * preserving the input order.
 *
 * @param definitions - Target definitions to serialize.
 * @returns One serialized plain object per input definition.
 */
function serializeTargetDefinitions(
  definitions: readonly TargetDefinition[],
): readonly Record<string, unknown>[] {
  const serialized: Record<string, unknown>[] = [];
  for (const entry of definitions) {
    serialized.push(serializeTargetDefinition(entry));
  }
  return serialized;
}

function uniqueTargetNames(selections: readonly TaskBundleTargetSelection[]): readonly string[] {
const names: string[] = [];
const seen = new Set<string>();
Expand Down Expand Up @@ -962,7 +1021,7 @@ export async function materializeTaskBundle(
target: options.targetName,
tests: [evalCase],
});
await writeYamlFile(targetsPath, { targets: targetDefinitions });
await writeYamlFile(targetsPath, { targets: serializeTargetDefinitions(targetDefinitions) });

return {
testDir,
Expand Down Expand Up @@ -1033,7 +1092,7 @@ export async function materializeEvalBundle(
tests: options.tests.map((test) => buildPortableEvalCase(test, rewrites)),
});
await writeYamlFile(targetsPath, {
targets: uniqueTargetDefinitions(options.targetSelections),
targets: serializeTargetDefinitions(uniqueTargetDefinitions(options.targetSelections)),
});

const manifest = bundleManifest({
Expand Down
3 changes: 3 additions & 0 deletions apps/cli/src/commands/runs/rerun.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ async function readTargetDefinitions(
}

function targetName(definition: Record<string, unknown>): string | undefined {
if (typeof definition.label === 'string' && definition.label.trim().length > 0) {
return definition.label.trim();
}
return typeof definition.name === 'string' && definition.name.trim().length > 0
? definition.name.trim()
: undefined;
Expand Down
47 changes: 42 additions & 5 deletions apps/cli/test/commands/eval/bundle.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ describe('agentv eval bundle', () => {
await writeFile(
path.join(sourceDir, '.agentv', 'targets.yaml'),
`targets:
- name: inherited
- label: inherited
provider: mock
response: '{"answer":"Mock provider response from inherited target"}'
fallback_targets: [backup]
- name: backup
- label: backup
provider: mock
response: '{"answer":"Backup mock response"}'
`,
Expand Down Expand Up @@ -153,8 +153,8 @@ tests: ../data/cases.yaml
expect(input[0]?.content[0]).toEqual({ type: 'file', value: 'files/data/input.txt' });

const bundledTargets = await readFile(path.join(bundleDir, 'targets.yaml'), 'utf8');
expect(bundledTargets).toContain('name: inherited');
expect(bundledTargets).toContain('name: backup');
expect(bundledTargets).toContain('label: inherited');
expect(bundledTargets).toContain('label: backup');

await rm(sourceDir, { recursive: true, force: true });
const run = await runCli(bundleDir, [
Expand All @@ -169,6 +169,43 @@ tests: ../data/cases.yaml
await expectFileExists(path.join(bundleDir, 'run', 'index.jsonl'));
}, 60_000);

// Bundles an eval file that declares its target inline and verifies that the
// target's fields end up in the bundle's targets.yaml.
it('preserves inline eval target object definitions in the bundled target graph', async () => {
const sourceDir = path.join(tempDir, 'inline-source');
const bundleDir = path.join(tempDir, 'inline-bundle');
await mkdir(path.join(sourceDir, '.agentv'), { recursive: true });
await mkdir(path.join(sourceDir, 'evals'), { recursive: true });
// The project targets file declares no targets; the only target is the one
// defined inside the eval file written below.
await writeFile(path.join(sourceDir, '.agentv', 'targets.yaml'), 'targets: []\n', 'utf8');
await writeFile(
path.join(sourceDir, 'evals', 'inline.eval.yaml'),
`targets:
- label: candidate
provider: mock
response: '{"answer":"inline bundled response"}'
tests:
- id: inline-case
input: hello
assertions:
- type: contains
value: inline
`,
'utf8',
);

// Run the bundle command from the source directory, writing to bundleDir.
const bundle = await runCli(sourceDir, [
'eval',
'bundle',
'evals/inline.eval.yaml',
'--out',
bundleDir,
]);

expect(bundle.exitCode).toBe(0);
// The bundled targets file should contain the inline target's label,
// provider, and response text.
const bundledTargets = await readFile(path.join(bundleDir, 'targets.yaml'), 'utf8');
expect(bundledTargets).toContain('label: candidate');
expect(bundledTargets).toContain('provider: mock');
expect(bundledTargets).toContain('inline bundled response');
}, 30_000);

it('reports unbundleable workspace references with their eval location', async () => {
const sourceDir = path.join(tempDir, 'missing-source');
const bundleDir = path.join(tempDir, 'missing-bundle');
Expand All @@ -177,7 +214,7 @@ tests: ../data/cases.yaml
await writeFile(
path.join(sourceDir, '.agentv', 'targets.yaml'),
`targets:
- name: default
- label: default
provider: mock
`,
'utf8',
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/test/commands/grade/grade-prepared.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ console.log(JSON.stringify({
path.join(root, '.agentv', 'targets.yaml'),
`
targets:
- name: codex
- label: codex
provider: cli
command: bun ./scripts/target.ts
`,
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/test/commands/prepare/prepare.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ await Bun.write(\`\${payload.workspace_path}/\${step}.txt\`, \`\${payload.test_i
path.join(root, '.agentv', 'targets.yaml'),
`
targets:
- name: codex
- label: codex
provider: cli
command: bun ./scripts/target.ts
`,
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/runs/rerun.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ interface CliResult {
}

const DEFAULT_TARGETS = `targets:
- name: captured
- label: captured
provider: mock
`;

Expand Down Expand Up @@ -121,7 +121,7 @@ async function createBundleFixture(
await writeFile(
overrideTargetsPath,
`targets:
- name: local
- label: local
provider: mock
`,
'utf8',
Expand Down Expand Up @@ -272,7 +272,7 @@ describe('agentv runs rerun', () => {

it('fails clearly for missing env and accepts an explicit env file', async () => {
const created = await fixture(`targets:
- name: captured
- label: captured
provider: cli
command: \${{ LOCAL_AGENT_COMMAND }}
`);
Expand Down
10 changes: 5 additions & 5 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ async function createFixture(): Promise<EvalFixture> {
const targetsPath = path.join(agentvDir, 'targets.yaml');
const targetsContent = `$schema: agentv-targets-v2.2
targets:
- name: default
- label: default
provider: mock
- name: file-target
- label: file-target
provider: mock
- name: cli-target
- label: cli-target
provider: mock
- name: codex-target
- label: codex-target
provider: codex
model: gpt-5-default
`;
Expand Down Expand Up @@ -90,7 +90,7 @@ async function createNestedEnvFixture(): Promise<EvalFixture> {
const targetsPath = path.join(agentvDir, 'targets.yaml');
const targetsContent = `$schema: agentv-targets-v2.2
targets:
- name: default
- label: default
provider: mock
`;
await writeFile(targetsPath, targetsContent, 'utf8');
Expand Down
10 changes: 5 additions & 5 deletions apps/web/src/content/docs/docs/targets/cli-provider.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Because the contract is "we invoke a command and read a file," almost any useful
```yaml
# .agentv/targets.yaml
targets:
- name: my_agent
- label: my_agent
provider: cli
command: python agent.py --prompt {PROMPT} --out {OUTPUT_FILE}
grader_target: azure-base # required if your evals use LLM graders
Expand Down Expand Up @@ -70,7 +70,7 @@ echo "Hello, world!" > {OUTPUT_FILE}

| Field | Type | Required | Default | Description |
|---|---|---|---|---|
| `name` | string | yes | — | Target identifier used in eval configs. |
| `label` | string | yes | — | AgentV target name used by eval `target`, CLI `--target`, and comparisons. |
| `provider` | literal `"cli"` | yes | — | Selects this provider. |
| `command` | string | yes | — | Shell command template. |
| `timeout_seconds` | number | no | — | Kill the process if it runs longer than this. |
Expand All @@ -89,7 +89,7 @@ For targets where spin-up cost dominates per-case work (e.g. loading a model, au

```yaml
targets:
- name: batched_agent
- label: batched_agent
provider: cli
provider_batching: true
command: python agent.py --batch-in {PROMPT_FILE} --batch-out {OUTPUT_FILE}
Expand All @@ -106,12 +106,12 @@ AgentV has no dedicated "oracle" feature because the `cli` provider already comp
```yaml
# .agentv/targets.yaml
targets:
- name: my_agent
- label: my_agent
provider: cli
command: python agent.py --prompt {PROMPT} --out {OUTPUT_FILE}
grader_target: azure-base

- name: oracle
- label: oracle
provider: cli
command: cp fixtures/{EVAL_ID}.expected.txt {OUTPUT_FILE}
grader_target: azure-base
Expand Down
Loading
Loading