Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,7 @@ function artifactFileContentResponse(c: C, filePath: string, fileContent: string

/**
 * Builds the message shown when a result has no canonical transcript metadata.
 *
 * The canonical normalized transcript artifact is `transcript.json`; the
 * message must name only that file so it does not contradict itself by also
 * mentioning the superseded `transcript.jsonl` name.
 *
 * @returns The message sentences joined with a single space.
 */
function missingTranscriptMessage(): string {
  const sentences = [
    'This result does not include canonical transcript.json metadata.',
    'Dashboard does not parse response.md or markdown transcripts for this view.',
  ];
  return sentences.join(' ');
}
Expand Down Expand Up @@ -1063,7 +1063,7 @@ function traceSessionArtifactResponse(
/**
 * Builds the message shown when a result has no legacy trace artifact metadata.
 *
 * It points the reader at `transcript.json`, the transcript artifact named for
 * current run bundles; the superseded `transcript.jsonl` sentence is omitted so
 * the message names a single file.
 *
 * @returns The message sentences joined with a single space.
 */
function missingTraceMessage(): string {
  const sentences = [
    'This result does not include legacy trace artifact metadata.',
    'Dashboard transcript inspection uses transcript.json for current run bundles.',
  ];
  return sentences.join(' ');
}

Expand Down Expand Up @@ -1155,13 +1155,17 @@ function buildRepeatTrialReadModels(
const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json');
const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json');
const gradingPath = caseTrialArtifactPath(resultDir, runPath, 'grading.json');
const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.jsonl');
const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.json');
const transcriptRawPath = caseTrialArtifactPath(resultDir, runPath, 'transcript-raw.jsonl');
const answerPath = caseTrialArtifactPath(resultDir, runPath, 'outputs/answer.md');
const resultPath = caseTrialArtifactPath(resultDir, runPath, 'result.json');
const runResult = readArtifactJsonObject(baseDir, resultPath);
const metrics = readArtifactJsonObject(baseDir, metricsPath);
const timing = readArtifactJsonObject(baseDir, timingPath);
const toolCalls = objectField(metrics, 'tool_calls');
const tokenUsage = objectField(timing, 'token_usage');
const transcriptSummary =
objectField(trial, 'transcript_summary') ?? objectField(runResult, 'transcript_summary');

return {
...trial,
Expand All @@ -1179,6 +1183,7 @@ function buildRepeatTrialReadModels(
total_tool_calls: numberField(metrics, 'total_tool_calls'),
}),
...(toolCalls && { tool_calls: toolCalls }),
...(transcriptSummary && { transcript_summary: transcriptSummary }),
...(metricsPath && { metrics_path: metricsPath }),
...(timingPath && { timing_path: timingPath }),
...(gradingPath && { grading_path: gradingPath }),
Expand Down
123 changes: 89 additions & 34 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,28 @@ describe('buildIndexArtifactEntry', () => {
error: 'model drift',
cost_usd: 0.25,
execution_status: 'quality_failure',
transcript_summary: {
total_turns: 1,
tool_calls: {
file_read: 0,
file_write: 0,
file_edit: 0,
shell: 0,
web_fetch: 0,
web_search: 0,
glob: 0,
grep: 0,
list_dir: 0,
agent_task: 0,
unknown: 0,
},
files_read: [],
files_modified: [],
shell_commands: [],
web_fetches: [],
errors: [{ message: 'model drift' }],
thinking_blocks: 0,
},
},
],
});
Expand Down Expand Up @@ -1031,7 +1053,7 @@ describe('writeArtifactsFromResults', () => {
'result.json',
'timing.json',
'transcript-raw.jsonl',
'transcript.jsonl',
'transcript.json',
]);

const alphaGrading: GradingArtifact = JSON.parse(
Expand Down Expand Up @@ -1159,7 +1181,7 @@ describe('writeArtifactsFromResults', () => {

const [indexEntry] = await readIndexLines(paths.indexPath);
const repeatRowDir = expectRowDir(indexEntry, 'repeat-case');
expect(indexEntry?.trials).toEqual([
expect(indexEntry?.trials).toMatchObject([
{ attempt: 0, run_path: 'run-1', score: 0.25, verdict: 'fail' },
{ attempt: 1, run_path: 'run-2', score: 1, verdict: 'pass' },
]);
Expand Down Expand Up @@ -1227,7 +1249,7 @@ describe('writeArtifactsFromResults', () => {
'result.json',
'timing.json',
'transcript-raw.jsonl',
'transcript.jsonl',
'transcript.json',
]);
}

Expand All @@ -1245,14 +1267,15 @@ describe('writeArtifactsFromResults', () => {
model: 'test-target',
grading_path: './grading.json',
metrics_path: './metrics.json',
transcript_path: './transcript.jsonl',
transcript_path: './transcript.json',
transcript_raw_path: './transcript-raw.jsonl',
output_paths: { answer: './outputs/answer.md' },
timing: {
duration_ms: 2000,
},
});
expect(runOneResult).not.toHaveProperty('status');
expect(indexEntry?.trials?.[0]?.transcript_summary).toEqual(runOneResult.transcript_summary);

const runTwoAnswer = await readFile(
path.join(paths.testArtifactDir, repeatRowDir, 'run-2', 'outputs', 'answer.md'),
Expand All @@ -1271,13 +1294,14 @@ describe('writeArtifactsFromResults', () => {
verdict: 'pass',
grading_path: './grading.json',
metrics_path: './metrics.json',
transcript_path: './transcript.jsonl',
transcript_path: './transcript.json',
transcript_raw_path: './transcript-raw.jsonl',
timing: {
duration_ms: 4000,
},
});
expect(runTwoResult).not.toHaveProperty('status');
expect(indexEntry?.trials?.[1]?.transcript_summary).toEqual(runTwoResult.transcript_summary);
});

it('handles empty results array', async () => {
Expand Down Expand Up @@ -1330,7 +1354,7 @@ describe('writeArtifactsFromResults', () => {
expect(timingOne.duration_ms).toBe(0);
});

it('writes normalized transcript.jsonl rows plus raw transcript evidence', async () => {
it('writes normalized transcript.json plus raw transcript evidence', async () => {
const input = [{ role: 'user' as const, content: 'Inspect artifact output' }];
const output = [
{
Expand All @@ -1346,7 +1370,7 @@ describe('writeArtifactsFromResults', () => {
durationMs: 25,
},
{
tool: 'Bash',
tool: 'command_execution',
id: 'bash-1',
input: { command: 'bun test missing.test.ts' },
status: 'error' as const,
Expand All @@ -1358,7 +1382,7 @@ describe('writeArtifactsFromResults', () => {
const results = [
makeResult({
testId: 'transcript-case',
target: 'codex',
target: 'friendly-codex-target',
conversationId: 'session-123',
durationMs: 4200,
costUsd: 0.25,
Expand All @@ -1369,7 +1393,8 @@ describe('writeArtifactsFromResults', () => {
input,
output,
finalOutput: 'Reading artifact-writer.ts',
target: 'codex',
target: 'friendly-codex-target',
provider: 'codex',
testId: 'transcript-case',
conversationId: 'session-123',
tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 },
Expand All @@ -1383,11 +1408,8 @@ describe('writeArtifactsFromResults', () => {
const [indexLine] = await readIndexLines(paths.indexPath);
const rowDir = expectRowDir(indexLine, 'transcript-case');

const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl');
const transcriptLines = (await readFile(transcriptPath, 'utf8'))
.trim()
.split('\n')
.map((line) => JSON.parse(line));
const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json');
const transcript = JSON.parse(await readFile(transcriptPath, 'utf8'));

const rawTranscriptLines = (
await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'), 'utf8')
Expand All @@ -1396,14 +1418,43 @@ describe('writeArtifactsFromResults', () => {
.split('\n')
.map((line) => JSON.parse(line));

expect(transcriptLines).toHaveLength(2);
expect(transcriptLines[0]).toMatchObject({
expect(transcript).toMatchObject({
schema_version: 'agentv.normalized_transcript.v1',
provider_id: 'codex',
target: 'friendly-codex-target',
transcript_summary: {
total_turns: 2,
tool_calls: {
file_read: 1,
file_write: 0,
file_edit: 0,
shell: 1,
web_fetch: 0,
web_search: 0,
glob: 0,
grep: 0,
list_dir: 0,
agent_task: 0,
unknown: 0,
},
files_read: ['apps/cli/src/commands/eval/artifact-writer.ts'],
files_modified: [],
shell_commands: ['bun test missing.test.ts'],
web_fetches: [],
errors: [
{ message: 'Tool command_execution error', tool_call_id: 'bash-1', tool_name: 'shell' },
],
thinking_blocks: 0,
},
});
expect(transcript.turns).toHaveLength(2);
expect(transcript.turns[0]).toMatchObject({
v: 1,
agent: 'codex',
type: 'user',
content: [{ type: 'text', text: 'Inspect artifact output' }],
});
expect(transcriptLines[1]).toMatchObject({
expect(transcript.turns[1]).toMatchObject({
v: 1,
agent: 'codex',
type: 'assistant',
Expand All @@ -1412,6 +1463,7 @@ describe('writeArtifactsFromResults', () => {
{
type: 'tool_use',
id: 'read-1',
tool_name: 'file_read',
name: 'Read',
input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' },
result: {
Expand All @@ -1423,7 +1475,8 @@ describe('writeArtifactsFromResults', () => {
{
type: 'tool_use',
id: 'bash-1',
name: 'Bash',
tool_name: 'shell',
name: 'command_execution',
input: { command: 'bun test missing.test.ts' },
result: {
status: 'error',
Expand All @@ -1432,23 +1485,26 @@ describe('writeArtifactsFromResults', () => {
},
],
});
expect(transcriptLines[1]).not.toHaveProperty('schema_version');
expect(transcriptLines[1]).not.toHaveProperty('o11y');
expect(transcript.turns[1]).not.toHaveProperty('schema_version');
expect(transcript.turns[1]).not.toHaveProperty('o11y');
expect(rawTranscriptLines[0]).toMatchObject({
schema_version: 'agentv.transcript.v1',
test_id: 'transcript-case',
target: 'codex',
target: 'friendly-codex-target',
message_index: 0,
role: 'user',
});
await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow();
await expect(
readFile(path.join(testDir, rowDir, 'run-1', 'transcript.jsonl'), 'utf8'),
).rejects.toThrow();
await expect(
readFile(runArtifactPath(testDir, indexLine, 'run-1', 'trace.json'), 'utf8'),
).rejects.toThrow();

expect(indexLine).not.toHaveProperty('trace_path');
expect(indexLine?.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`);
expect(indexLine?.transcript_path).toBe(`${rowDir}/run-1/transcript.json`);
expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`);
expect(indexLine?.transcript_summary).toEqual(transcript.transcript_summary);
expect(indexLine?.metrics_path).toBe(`${rowDir}/run-1/metrics.json`);
expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true);

Expand Down Expand Up @@ -1583,7 +1639,7 @@ describe('writeArtifactsFromResults', () => {
});
expect(summary.trace).not.toHaveProperty('path');
expect(summary.source_artifacts).toMatchObject({
transcript_path: 'transcript.jsonl',
transcript_path: 'transcript.json',
grading_path: 'grading.json',
timing_path: 'timing.json',
file_changes_path: CANONICAL_FILE_CHANGES_ARTIFACT_PATH,
Expand Down Expand Up @@ -1803,23 +1859,22 @@ describe('writeArtifactsFromResults', () => {
const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl');
await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog);
await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog);
await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow();
await expect(
readFile(path.join(testDir, rowDir, 'run-1', 'transcript.jsonl'), 'utf8'),
).rejects.toThrow();

const transcriptLines = (
await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'), 'utf8')
)
.trim()
.split('\n')
.map((line) => JSON.parse(line));
expect(transcriptLines[0]).toMatchObject({
const transcript = JSON.parse(
await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json'), 'utf8'),
);
expect(transcript.turns[0]).toMatchObject({
v: 1,
agent: 'codex',
type: 'assistant',
content: [{ type: 'text', text: 'Raw log copied' }],
});

expect(indexLine.raw_provider_log_path).toBeUndefined();
expect(indexLine.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`);
expect(indexLine.transcript_path).toBe(`${rowDir}/run-1/transcript.json`);
expect(indexLine.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`);
expect(indexLine).not.toHaveProperty('transcript_json_path');
});
Expand Down Expand Up @@ -1865,7 +1920,7 @@ describe('writeArtifactsFromResults', () => {
expect(JSON.stringify(indexLine)).not.toContain('api_key');

const transcriptJson = await readFile(
runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'),
runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json'),
'utf8',
);
expect(transcriptJson).not.toContain('secret');
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/test/commands/results/export.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ describe('results export', () => {
metrics_path: `${resultDir}/run-1/metrics.json`,
output_path: `${resultDir}/run-1/outputs/answer.md`,
answer_path: `${resultDir}/run-1/outputs/answer.md`,
transcript_path: `${resultDir}/run-1/transcript.jsonl`,
transcript_path: `${resultDir}/run-1/transcript.json`,
transcript_raw_path: `${resultDir}/run-1/transcript-raw.jsonl`,
});
expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path');
Expand Down Expand Up @@ -459,7 +459,7 @@ describe('results export', () => {
metrics_path: `${rowDir}/run-1/metrics.json`,
output_path: `${rowDir}/run-1/outputs/answer.md`,
answer_path: `${rowDir}/run-1/outputs/answer.md`,
transcript_path: `${rowDir}/run-1/transcript.jsonl`,
transcript_path: `${rowDir}/run-1/transcript.json`,
transcript_raw_path: `${rowDir}/run-1/transcript-raw.jsonl`,
});
expect(entries[0]).not.toHaveProperty('input_path');
Expand Down
Loading
Loading