Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,8 @@ async function prepareFileMetadata(params: {

selections = multiSelections.map((sel) => ({
selection: sel,
inlineTargetLabel: resolveTargetLabel(sel.targetName, sel.resolvedTarget.name),
inlineTargetLabel:
sel.targetLabel ?? resolveTargetLabel(sel.targetName, sel.resolvedTarget.name),
}));
} else {
// Single target mode (legacy path)
Expand All @@ -1368,18 +1369,22 @@ async function prepareFileMetadata(params: {
});

// Attach target hooks from eval file if available
const singleTargetHooks = targetRefs?.find((ref) => ref.name === selection.targetName)?.hooks;
const augmentedSelection: TargetSelection = singleTargetHooks
? { ...selection, targetHooks: singleTargetHooks }
: selection;
const singleTargetRef = targetRefs?.find((ref) => ref.name === selection.targetName);
const augmentedSelection: TargetSelection = {
...selection,
...(singleTargetRef?.label ? { targetLabel: singleTargetRef.label } : {}),
...(singleTargetRef?.hooks ? { targetHooks: singleTargetRef.hooks } : {}),
};

selections = [
{
selection: augmentedSelection,
inlineTargetLabel: resolveTargetLabel(
augmentedSelection.targetName,
augmentedSelection.resolvedTarget.name,
),
inlineTargetLabel:
augmentedSelection.targetLabel ??
resolveTargetLabel(
augmentedSelection.targetName,
augmentedSelection.resolvedTarget.name,
),
},
];
}
Expand Down Expand Up @@ -2307,7 +2312,8 @@ export async function runEvalCommand(
const explicitVariant = targetVariantForSelection(selection);
const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({
timestamp: new Date().toISOString(),
testId: testCase.id,
testId: testCase.testId ?? testCase.id,
prompt: testCase.prompt,
score: 0,
assertions: [],
output: budgetMsg,
Expand All @@ -2316,7 +2322,7 @@ export async function runEvalCommand(
output: [{ role: 'assistant' as const, content: budgetMsg }],
finalOutput: budgetMsg,
target: selection.targetName,
testId: testCase.id,
testId: testCase.testId ?? testCase.id,
conversationId: testCase.conversation_id,
error: budgetMsg,
}),
Expand Down Expand Up @@ -2426,7 +2432,8 @@ export async function runEvalCommand(
withSourceMetadata(
{
timestamp: new Date().toISOString(),
testId: testCase.id,
testId: testCase.testId ?? testCase.id,
prompt: testCase.prompt,
score: 0,
assertions: [],
output: message,
Expand All @@ -2435,7 +2442,7 @@ export async function runEvalCommand(
output: [{ role: 'assistant' as const, content: message }],
finalOutput: message,
target: selection.targetName,
testId: testCase.id,
testId: testCase.testId ?? testCase.id,
conversationId: testCase.conversation_id,
error: message,
}),
Expand Down
7 changes: 7 additions & 0 deletions apps/cli/src/commands/eval/targets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ export interface TargetSelection {
readonly definitions: readonly TargetDefinition[];
readonly resolvedTarget: ResolvedTarget;
readonly targetName: string;
readonly targetLabel?: string;
readonly targetSource: 'cli' | 'test-file' | 'default';
readonly targetsFilePath: string;
/** Per-target hooks from eval file (eval-level customization) */
Expand Down Expand Up @@ -260,11 +261,15 @@ export async function selectMultipleTargets(

// Build a lookup for target hooks from eval target refs
const hooksMap = new Map<string, import('@agentv/core').TargetHooksConfig>();
const labelsMap = new Map<string, string>();
if (targetRefs) {
for (const ref of targetRefs) {
if (ref.hooks) {
hooksMap.set(ref.name, ref.hooks);
}
if (ref.label) {
labelsMap.set(ref.name, ref.label);
}
}
}

Expand Down Expand Up @@ -323,6 +328,7 @@ export async function selectMultipleTargets(
modelOverride,
);
const hooks = hooksMap.get(name);
const targetLabel = labelsMap.get(name);

try {
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
Expand All @@ -332,6 +338,7 @@ export async function selectMultipleTargets(
definitions,
resolvedTarget,
targetName: name,
...(targetLabel ? { targetLabel } : {}),
targetSource: options.targetSource ?? 'cli',
targetsFilePath,
...(hooks && { targetHooks: hooks }),
Expand Down
27 changes: 27 additions & 0 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import {
type GraderResult,
METRICS_SCHEMA_VERSION,
MetricsArtifactWireSchema,
buildEvalTestTargetKey,
buildEvaluationResultTargetKey,
buildResultIndexArtifact,
buildTraceFromMessages,
parseYamlValue,
Expand Down Expand Up @@ -1304,6 +1306,31 @@ describe('writeArtifactsFromResults', () => {
expect(indexEntry?.trials?.[1]?.transcript_summary).toEqual(runTwoResult.transcript_summary);
});

// A prompt-expanded test carries a synthetic `id` ('docs__prompt_direct') but keeps the
// authored id in `testId`. The key built from that test must equal the key built from a
// recorded result that stores the authored test id and the same prompt.
it('keys prompt-expanded resume checks by authored test id plus prompt id', () => {
  const directPrompt = { id: 'direct', label: 'Direct prompt', kind: 'string' as const };

  // Result as it would have been recorded for the authored test under this prompt.
  const recordedResult = makeResult({
    testId: 'docs',
    prompt: directPrompt,
    target: 'mock-target',
  });

  // The same authored test after prompt expansion; only the fields below are populated,
  // hence the double cast to EvalTest.
  const promptExpandedTest = {
    id: 'docs__prompt_direct',
    testId: 'docs',
    prompt: directPrompt,
    input: [{ role: 'user', content: 'Prompt text' }],
    expected_output: [],
    reference_answer: '',
    file_paths: [],
    criteria: 'ok',
    evaluator: 'llm-grader',
    assertions: [],
  } as unknown as EvalTest;

  const keyFromTest = buildEvalTestTargetKey(promptExpandedTest, 'mock-target');
  const keyFromResult = buildEvaluationResultTargetKey(recordedResult);

  expect(keyFromTest).toBe(keyFromResult);
});

it('handles empty results array', async () => {
const paths = await writeArtifactsFromResults([], testDir);

Expand Down
66 changes: 66 additions & 0 deletions apps/cli/test/commands/eval/targets.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { loadTestSuite } from '@agentv/core';

import { selectMultipleTargets } from '../../../src/commands/eval/targets.js';

/**
 * Exercises `selectMultipleTargets` against fixtures written to a temporary directory:
 * a `.agentv/targets.yaml` that defines a mock-provider target, and an eval file whose
 * target entry refers to that definition by `id` and carries its own `label`.
 */
describe('eval target selection', () => {
  let tempDir: string;

  // Every test runs in a fresh scratch directory that is removed afterwards.
  beforeEach(async () => {
    tempDir = await mkdtemp(path.join(tmpdir(), 'agentv-target-selection-'));
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('resolves authored target ids through targets.yaml while keeping labels for display', async () => {
    const agentvDir = path.join(tempDir, '.agentv');
    await mkdir(agentvDir, { recursive: true });

    // Nested keys must be indented under their list item; with flat indentation the
    // `provider` key is no longer part of the target entry and the YAML is invalid.
    await writeFile(
      path.join(agentvDir, 'targets.yaml'),
      [
        '$schema: agentv-targets-v2.2',
        'targets:',
        '  - label: openai:gpt-5.4-mini',
        '    provider: mock',
        '',
      ].join('\n'),
    );

    // The eval file's target `id` matches the `label` of the targets.yaml entry above,
    // while its own `label` ('mini') is a separate value.
    const evalPath = path.join(tempDir, 'target-label.eval.yaml');
    await writeFile(
      evalPath,
      [
        'name: target-label-suite',
        'targets:',
        '  - id: openai:gpt-5.4-mini',
        '    label: mini',
        'tests:',
        '  - id: target-case',
        '    input: hello',
        '    criteria: ok',
        '',
      ].join('\n'),
    );

    const suite = await loadTestSuite(evalPath, tempDir);
    const selections = await selectMultipleTargets({
      testFilePath: evalPath,
      repoRoot: tempDir,
      cwd: tempDir,
      env: {},
      targetNames: suite.targets ?? [],
      targetRefs: suite.targetRefs,
      targetSource: 'test-file',
    });

    // The selection is named by the authored id, exposes the eval-file label separately,
    // and resolves to the mock provider defined in targets.yaml.
    expect(selections).toHaveLength(1);
    expect(selections[0]?.targetName).toBe('openai:gpt-5.4-mini');
    expect(selections[0]?.targetLabel).toBe('mini');
    expect(selections[0]?.resolvedTarget.kind).toBe('mock');
  });
});
2 changes: 1 addition & 1 deletion apps/cli/test/commands/prepare/prepare.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ describe('agentv prepare', () => {
path.join(tempDir, '.agentv', 'targets.yaml'),
`
targets:
- name: codex
- label: codex
provider: cli
command: bun ./scripts/target.ts
`,
Expand Down
42 changes: 30 additions & 12 deletions packages/core/src/evaluation/loaders/config-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -398,25 +398,43 @@ function parseEvalTargetRef(raw: unknown, location: string): EvalTargetRef {
}

const rawLabel = raw.label;
const rawId = raw.id;
const useTarget = raw.use_target;
const legacyName = raw.name;
const id = typeof rawId === 'string' && rawId.trim().length > 0 ? rawId.trim() : undefined;
const label =
typeof rawLabel === 'string' && rawLabel.trim().length > 0 ? rawLabel.trim() : undefined;
if (!label) {
throw new Error(`Invalid ${location}: target object requires a 'label' field.`);
const useTargetName =
typeof useTarget === 'string' && useTarget.trim().length > 0 ? useTarget.trim() : undefined;
const legacyTargetName =
typeof legacyName === 'string' && legacyName.trim().length > 0 ? legacyName.trim() : undefined;
if (legacyName !== undefined) {
throw new Error(
`Invalid ${location}: target field 'name' has been removed. Use 'id' and 'label' instead.`,
);
}

const hooks = parseTargetHooks(raw.hooks);
const definition = normalizeTargetDefinition(
Object.fromEntries(Object.entries(raw).filter(([key]) => key !== 'hooks')),
) as TargetDefinition;
const useTarget =
typeof raw.use_target === 'string' && raw.use_target.trim().length > 0
? raw.use_target.trim()
: undefined;
const hasInlineDefinition = typeof raw.provider === 'string' || useTargetName !== undefined;
if (hasInlineDefinition && !label) {
throw new Error(`Invalid ${location}: target object requires a 'label' field.`);
}
const name = hasInlineDefinition ? label : (id ?? legacyTargetName ?? label);
if (!name) {
throw new Error(`Invalid ${location}: target object requires an 'id' or 'label' field.`);
}
const definition = hasInlineDefinition
? (normalizeTargetDefinition(
Object.fromEntries(Object.entries(raw).filter(([key]) => key !== 'hooks')),
) as TargetDefinition)
: undefined;

return {
name: label,
...(useTarget !== undefined ? { use_target: useTarget } : {}),
definition,
name,
...(id !== undefined ? { id } : {}),
...(label !== undefined ? { label } : {}),
...(useTargetName !== undefined ? { use_target: useTargetName } : {}),
...(definition ? { definition } : {}),
...(hooks !== undefined ? { hooks } : {}),
};
}
Expand Down
Loading
Loading