From dca97ef0dc5a933f968eba4c58fb6bd6e861e323 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 12:19:04 +0200 Subject: [PATCH 1/2] feat(eval): normalize transcript artifacts --- apps/cli/src/commands/results/serve.ts | 6 +- .../commands/eval/artifact-writer.test.ts | 97 +++-- apps/cli/test/commands/results/export.test.ts | 4 +- apps/cli/test/commands/results/serve.test.ts | 4 +- apps/cli/test/eval.integration.test.ts | 3 +- apps/dashboard/src/components/EvalDetail.tsx | 2 +- .../src/components/TranscriptTimeline.tsx | 52 ++- .../__fixtures__/structured-transcript.ts | 4 +- .../components/transcript-timeline.test.tsx | 55 ++- apps/dashboard/src/lib/types.ts | 13 + .../docs/docs/evaluation/running-evals.mdx | 12 +- .../docs/docs/reference/result-artifacts.mdx | 18 +- .../src/content/docs/docs/tools/import.mdx | 2 +- .../src/content/docs/docs/tools/results.mdx | 13 +- packages/core/src/evaluation/metrics.ts | 2 +- .../evaluation/result-artifact-contract.ts | 11 +- .../core/src/evaluation/result-row-schema.ts | 1 + packages/core/src/evaluation/run-artifacts.ts | 39 +- .../core/src/evaluation/trace-envelope.ts | 8 +- packages/core/src/evaluation/trace.ts | 2 +- .../core/src/evaluation/transcript-summary.ts | 383 ++++++++++++++++++ packages/core/src/import/types.ts | 50 ++- packages/core/src/index.ts | 1 + .../core/test/evaluation/orchestrator.test.ts | 4 +- 24 files changed, 683 insertions(+), 103 deletions(-) create mode 100644 packages/core/src/evaluation/transcript-summary.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index f68e10bff..f62779226 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1015,7 +1015,7 @@ function artifactFileContentResponse(c: C, filePath: string, fileContent: string function missingTranscriptMessage(): string { return [ - 'This result does not include canonical transcript.jsonl metadata.', + 'This result does not include canonical transcript.json metadata.', 'Dashboard does not parse response.md or markdown transcripts for this view.', ].join(' '); } @@ -1063,7 +1063,7 @@ function traceSessionArtifactResponse( function missingTraceMessage(): string { return [ 'This result does not include legacy trace artifact metadata.', - 'Dashboard transcript inspection uses transcript.jsonl for current run bundles.', + 'Dashboard transcript inspection uses transcript.json for current run bundles.', ].join(' '); } @@ -1155,7 +1155,7 @@ function buildRepeatTrialReadModels( const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json'); const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json'); const gradingPath = caseTrialArtifactPath(resultDir, runPath, 'grading.json'); - const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.jsonl'); + const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.json'); const transcriptRawPath = caseTrialArtifactPath(resultDir, runPath, 'transcript-raw.jsonl'); const answerPath = caseTrialArtifactPath(resultDir, runPath, 'outputs/answer.md'); const metrics = readArtifactJsonObject(baseDir, metricsPath); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 8ab705f0e..f2bbdec95 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -1031,7 +1031,7 @@ describe('writeArtifactsFromResults', () => { 'result.json', 'timing.json', 'transcript-raw.jsonl', - 'transcript.jsonl', + 'transcript.json', ]); const alphaGrading: GradingArtifact = JSON.parse( @@ -1227,7 +1227,7 @@ describe('writeArtifactsFromResults', () => { 'result.json', 'timing.json', 'transcript-raw.jsonl', - 'transcript.jsonl', + 'transcript.json', ]); } @@ -1245,7 +1245,7 @@ describe('writeArtifactsFromResults', () => { model: 'test-target', grading_path: './grading.json', metrics_path: './metrics.json', - transcript_path: './transcript.jsonl', + transcript_path: './transcript.json', transcript_raw_path: './transcript-raw.jsonl', output_paths: { answer: './outputs/answer.md' }, timing: { @@ -1271,7 +1271,7 @@ describe('writeArtifactsFromResults', () => { verdict: 'pass', grading_path: './grading.json', metrics_path: './metrics.json', - transcript_path: './transcript.jsonl', + transcript_path: './transcript.json', transcript_raw_path: './transcript-raw.jsonl', timing: { duration_ms: 4000, @@ -1330,7 +1330,7 @@ describe('writeArtifactsFromResults', () => { expect(timingOne.duration_ms).toBe(0); }); - it('writes normalized transcript.jsonl rows plus raw transcript evidence', async () => { + it('writes normalized transcript.json plus raw transcript evidence', async () => { const input = [{ role: 'user' as const, content: 'Inspect artifact output' }]; const output = [ { @@ -1346,7 +1346,7 @@ describe('writeArtifactsFromResults', () => { durationMs: 25, }, { - tool: 'Bash', + tool: 'command_execution', id: 'bash-1', input: { command: 'bun test missing.test.ts' }, status: 'error' as const, @@ -1358,7 +1358,7 @@ describe('writeArtifactsFromResults', () => { const results = [ makeResult({ testId: 'transcript-case', - target: 'codex', + target: 'friendly-codex-target', conversationId: 'session-123', durationMs: 4200, costUsd: 0.25, @@ -1369,7 +1369,8 @@ describe('writeArtifactsFromResults', () => { input, output, finalOutput: 'Reading artifact-writer.ts', - target: 'codex', + target: 'friendly-codex-target', + provider: 'codex', testId: 'transcript-case', conversationId: 'session-123', tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 }, @@ -1383,11 +1384,8 @@ describe('writeArtifactsFromResults', () => { const [indexLine] = await readIndexLines(paths.indexPath); const rowDir = expectRowDir(indexLine, 'transcript-case'); - const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'); - const transcriptLines = (await readFile(transcriptPath, 'utf8')) - .trim() - .split('\n') - .map((line) => JSON.parse(line)); + const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json'); + const transcript = JSON.parse(await readFile(transcriptPath, 'utf8')); const rawTranscriptLines = ( await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'), 'utf8') @@ -1396,14 +1394,43 @@ describe('writeArtifactsFromResults', () => { .split('\n') .map((line) => JSON.parse(line)); - expect(transcriptLines).toHaveLength(2); - expect(transcriptLines[0]).toMatchObject({ + expect(transcript).toMatchObject({ + schema_version: 'agentv.normalized_transcript.v1', + provider_id: 'codex', + target: 'friendly-codex-target', + transcript_summary: { + total_turns: 2, + tool_calls: { + file_read: 1, + file_write: 0, + file_edit: 0, + shell: 1, + web_fetch: 0, + web_search: 0, + glob: 0, + grep: 0, + list_dir: 0, + agent_task: 0, + unknown: 0, + }, + files_read: ['apps/cli/src/commands/eval/artifact-writer.ts'], + files_modified: [], + shell_commands: ['bun test missing.test.ts'], + web_fetches: [], + errors: [ + { message: 'Tool command_execution error', tool_call_id: 'bash-1', tool_name: 'shell' }, + ], + thinking_blocks: 0, + }, + }); + expect(transcript.turns).toHaveLength(2); + expect(transcript.turns[0]).toMatchObject({ v: 1, agent: 'codex', type: 'user', content: [{ type: 'text', text: 'Inspect artifact output' }], }); - expect(transcriptLines[1]).toMatchObject({ + expect(transcript.turns[1]).toMatchObject({ v: 1, agent: 'codex', type: 'assistant', @@ -1412,6 +1439,7 @@ describe('writeArtifactsFromResults', () => { { type: 'tool_use', id: 'read-1', + tool_name: 'file_read', name: 'Read', input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' }, result: { @@ -1423,7 +1451,8 @@ describe('writeArtifactsFromResults', () => { { type: 'tool_use', id: 'bash-1', - name: 'Bash', + tool_name: 'shell', + name: 'command_execution', input: { command: 'bun test missing.test.ts' }, result: { status: 'error', @@ -1432,23 +1461,26 @@ describe('writeArtifactsFromResults', () => { }, ], }); - expect(transcriptLines[1]).not.toHaveProperty('schema_version'); - expect(transcriptLines[1]).not.toHaveProperty('o11y'); + expect(transcript.turns[1]).not.toHaveProperty('schema_version'); + expect(transcript.turns[1]).not.toHaveProperty('o11y'); expect(rawTranscriptLines[0]).toMatchObject({ schema_version: 'agentv.transcript.v1', test_id: 'transcript-case', - target: 'codex', + target: 'friendly-codex-target', message_index: 0, role: 'user', }); - await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow(); + await expect( + readFile(path.join(testDir, rowDir, 'run-1', 'transcript.jsonl'), 'utf8'), + ).rejects.toThrow(); await expect( readFile(runArtifactPath(testDir, indexLine, 'run-1', 'trace.json'), 'utf8'), ).rejects.toThrow(); expect(indexLine).not.toHaveProperty('trace_path'); - expect(indexLine?.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`); + expect(indexLine?.transcript_path).toBe(`${rowDir}/run-1/transcript.json`); expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`); + expect(indexLine?.transcript_summary).toEqual(transcript.transcript_summary); expect(indexLine?.metrics_path).toBe(`${rowDir}/run-1/metrics.json`); expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true); @@ -1583,7 +1615,7 @@ describe('writeArtifactsFromResults', () => { }); expect(summary.trace).not.toHaveProperty('path'); expect(summary.source_artifacts).toMatchObject({ - transcript_path: 'transcript.jsonl', + transcript_path: 'transcript.json', grading_path: 'grading.json', timing_path: 'timing.json', file_changes_path: CANONICAL_FILE_CHANGES_ARTIFACT_PATH, @@ -1803,15 +1835,14 @@ describe('writeArtifactsFromResults', () => { const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog); - await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow(); + await expect( + readFile(path.join(testDir, rowDir, 'run-1', 'transcript.jsonl'), 'utf8'), + ).rejects.toThrow(); - const transcriptLines = ( - await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'), 'utf8') - ) - .trim() - .split('\n') - .map((line) => JSON.parse(line)); - expect(transcriptLines[0]).toMatchObject({ + const transcript = JSON.parse( + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json'), 'utf8'), + ); + expect(transcript.turns[0]).toMatchObject({ v: 1, agent: 'codex', type: 'assistant', @@ -1819,7 +1850,7 @@ describe('writeArtifactsFromResults', () => { }); expect(indexLine.raw_provider_log_path).toBeUndefined(); - expect(indexLine.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`); + expect(indexLine.transcript_path).toBe(`${rowDir}/run-1/transcript.json`); expect(indexLine.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`); expect(indexLine).not.toHaveProperty('transcript_json_path'); }); @@ -1865,7 +1896,7 @@ describe('writeArtifactsFromResults', () => { expect(JSON.stringify(indexLine)).not.toContain('api_key'); const transcriptJson = await readFile( - runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'), + runArtifactPath(testDir, indexLine, 'run-1', 'transcript.json'), 'utf8', ); expect(transcriptJson).not.toContain('secret'); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index da4df1a0c..90860e80a 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -385,7 +385,7 @@ describe('results export', () => { metrics_path: `${resultDir}/run-1/metrics.json`, output_path: `${resultDir}/run-1/outputs/answer.md`, answer_path: `${resultDir}/run-1/outputs/answer.md`, - transcript_path: `${resultDir}/run-1/transcript.jsonl`, + transcript_path: `${resultDir}/run-1/transcript.json`, transcript_raw_path: `${resultDir}/run-1/transcript-raw.jsonl`, }); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path'); @@ -459,7 +459,7 @@ describe('results export', () => { metrics_path: `${rowDir}/run-1/metrics.json`, output_path: `${rowDir}/run-1/outputs/answer.md`, answer_path: `${rowDir}/run-1/outputs/answer.md`, - transcript_path: `${rowDir}/run-1/transcript.jsonl`, + transcript_path: `${rowDir}/run-1/transcript.json`, transcript_raw_path: `${rowDir}/run-1/transcript-raw.jsonl`, }); expect(entries[0]).not.toHaveProperty('input_path'); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 98a08f30b..55fc01cc7 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -2998,7 +2998,7 @@ describe('serve app', () => { }; expect(traceData.schema_version).toBe('agentv.dashboard.trace_artifact.v1'); expect(traceData.status).toBe('missing'); - expect(traceData.message).toContain('transcript.jsonl'); + expect(traceData.message).toContain('transcript.json'); const detailRes = await app.request(`/api/runs/${encodeURIComponent(runId)}`); expect(detailRes.status).toBe(200); @@ -3450,7 +3450,7 @@ describe('serve app', () => { expect(res.status).toBe(200); const data = (await res.json()) as { status: string; message: string }; expect(data.status).toBe('missing'); - expect(data.message).toContain('transcript.jsonl'); + expect(data.message).toContain('transcript.json'); }); it('returns a clear dangling state when the transcript pointer cannot be read', async () => { diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 00d49a159..6b7f17549 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -409,8 +409,9 @@ describe('agentv eval CLI', () => { expect(canonicalResults).toHaveLength(2); await expectFileExists(path.join(outputDir, 'summary.json')); for (const row of canonicalResults) { - expect(row.transcript_path).toMatch(/run-1\/transcript\.jsonl$/); + expect(row.transcript_path).toMatch(/run-1\/transcript\.json$/); await expectFileExists(path.join(outputDir, row.transcript_path as string)); + expect(row.transcript_summary).toBeDefined(); expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/); await expectFileExists(path.join(outputDir, row.transcript_raw_path as string)); } diff --git a/apps/dashboard/src/components/EvalDetail.tsx b/apps/dashboard/src/components/EvalDetail.tsx index 5447cd996..5b861ce08 100644 --- a/apps/dashboard/src/components/EvalDetail.tsx +++ b/apps/dashboard/src/components/EvalDetail.tsx @@ -1025,7 +1025,7 @@ function TranscriptTab({

No structured transcript

{transcriptData?.message ?? - 'This run does not include canonical transcript.jsonl. Dashboard does not parse response.md or markdown transcripts for this view.'} + 'This run does not include canonical transcript.json. Dashboard does not parse response.md or markdown transcripts for this view.'}

); diff --git a/apps/dashboard/src/components/TranscriptTimeline.tsx b/apps/dashboard/src/components/TranscriptTimeline.tsx index ddb366bb9..4621d04e6 100644 --- a/apps/dashboard/src/components/TranscriptTimeline.tsx +++ b/apps/dashboard/src/components/TranscriptTimeline.tsx @@ -1,10 +1,10 @@ /** - * Structured transcript viewer for canonical `transcript.jsonl` files. + * Structured transcript viewer for canonical `transcript.json` files. * - * The component intentionally reads only transcript JSONL rows derived from - * AgentV trace data. It does not parse `response.md` or markdown transcripts; - * raw transcript/answer artifacts stay available through the Files tab or raw - * artifact links supplied by the caller. + * The component intentionally reads only AgentV-normalized transcript artifacts + * derived from trace data. It does not parse `response.md` or markdown + * transcripts; raw transcript/answer artifacts stay available through the + * Files tab or raw artifact links supplied by the caller. */ import { type ReactNode, type SyntheticEvent, useEffect, useMemo, useState } from 'react'; @@ -155,7 +155,12 @@ function normalizeToolUseBlock(block: Record): Record, messageIndex: number, + document?: { testId?: string; target?: string }, ): TranscriptJsonLine { const content = value.content as readonly unknown[]; const toolCalls = content @@ -197,8 +203,8 @@ function normalizedTranscriptLineToTimelineEntry( : undefined; return { - test_id: '', - target: value.agent as string, + test_id: document?.testId ?? '', + target: document?.target ?? (value.agent as string), message_index: messageIndex, role: value.type as string, agent: value.agent as string, @@ -218,6 +224,28 @@ function normalizedTranscriptLineToTimelineEntry( } export function parseTranscriptJsonl(rawJsonl: string): TranscriptParseResult { + const trimmed = rawJsonl.trim(); + if (trimmed.startsWith('{')) { + try { + const parsed = JSON.parse(trimmed) as unknown; + if (isRecord(parsed) && Array.isArray(parsed.turns)) { + const target = typeof parsed.target === 'string' ? parsed.target : undefined; + const testId = + isRecord(parsed.metadata) && typeof parsed.metadata.test_id === 'string' + ? parsed.metadata.test_id + : undefined; + const entries = parsed.turns.flatMap((turn, index) => + isNormalizedTranscriptLine(turn) + ? [normalizedTranscriptLineToTimelineEntry(turn, index, { target, testId })] + : [], + ); + return { entries }; + } + } catch { + // Fall through to line-oriented parsing so existing error messages stay useful. + } + } + const entries: TranscriptJsonLine[] = []; const lines = rawJsonl.split(/\r?\n/); @@ -281,7 +309,7 @@ function findFilePathBySuffix( } export function findTranscriptPath(nodes: readonly FileNode[]): string | undefined { - return findFilePathBySuffix(nodes, ['transcript.jsonl']); + return findFilePathBySuffix(nodes, ['transcript.json', 'transcript.jsonl']); } export function findAnswerPath(nodes: readonly FileNode[]): string | undefined { @@ -890,11 +918,11 @@ export function TranscriptTimeline({
- Open transcript.jsonl in Files + Open transcript.json in Files - Open normalized JSONL + Open normalized JSON - Download normalized JSONL + Download normalized JSON
diff --git a/apps/dashboard/src/components/__fixtures__/structured-transcript.ts b/apps/dashboard/src/components/__fixtures__/structured-transcript.ts index fa6e1e36a..4b82f30e5 100644 --- a/apps/dashboard/src/components/__fixtures__/structured-transcript.ts +++ b/apps/dashboard/src/components/__fixtures__/structured-transcript.ts @@ -64,8 +64,8 @@ export const structuredTranscriptFiles: FileNode[] = [ type: 'file', }, { - name: 'transcript.jsonl', - path: 'final-json-answer__codex/transcript.jsonl', + name: 'transcript.json', + path: 'final-json-answer__codex/transcript.json', type: 'file', }, { diff --git a/apps/dashboard/src/components/transcript-timeline.test.tsx b/apps/dashboard/src/components/transcript-timeline.test.tsx index 746f0e125..6176aaf31 100644 --- a/apps/dashboard/src/components/transcript-timeline.test.tsx +++ b/apps/dashboard/src/components/transcript-timeline.test.tsx @@ -20,7 +20,7 @@ describe('TranscriptTimeline', () => { entries={parsed.entries} finalAnswer={'{"answer":42,"source":"src/app.ts"}'} answerPath="final-json-answer__codex/outputs/answer.md" - transcriptPath="final-json-answer__codex/transcript.jsonl" + transcriptPath="final-json-answer__codex/transcript.json" answerHref="/api/raw-answer" transcriptHref="/api/raw-transcript" transcriptDownloadHref="/api/download-transcript" @@ -28,7 +28,7 @@ describe('TranscriptTimeline', () => { ); } - it('parses canonical transcript JSONL rows in chronological order', () => { + it('parses canonical transcript rows in chronological order', () => { const parsed = parseTranscriptJsonl(structuredTranscriptJsonl); expect(parsed.error).toBeUndefined(); @@ -37,6 +37,47 @@ describe('TranscriptTimeline', () => { expect(parsed.entries[1].tool_calls?.[0]?.status).toBe('success'); }); + it('parses canonical transcript JSON documents with tool_name values', () => { + const parsed = parseTranscriptJsonl( + JSON.stringify({ + schema_version: 'agentv.normalized_transcript.v1', + provider_id: 'codex', + target: 'codex', + transcript_summary: { + total_turns: 1, + tool_calls: { file_read: 1 }, + files_read: ['src/app.ts'], + files_modified: [], + shell_commands: [], + web_fetches: [], + errors: [], + thinking_blocks: 0, + }, + turns: [ + { + v: 1, + agent: 'codex', + type: 'assistant', + content: [ + { + type: 'tool_use', + id: 'call-read-1', + tool_name: 'file_read', + name: 'Read', + input: { file_path: 'src/app.ts' }, + result: { status: 'success', output: 'contents' }, + }, + ], + }, + ], + }), + ); + + expect(parsed.error).toBeUndefined(); + expect(parsed.entries).toHaveLength(1); + expect(parsed.entries[0].tool_calls?.[0]?.tool).toBe('file_read'); + }); + it('rejects malformed optional tool_calls fields before rendering', () => { const parsed = parseTranscriptJsonl( JSON.stringify({ @@ -54,7 +95,7 @@ describe('TranscriptTimeline', () => { it('finds canonical transcript and answer artifacts without selecting response.md', () => { expect(findTranscriptPath(structuredTranscriptFiles)).toBe( - 'final-json-answer__codex/transcript.jsonl', + 'final-json-answer__codex/transcript.json', ); expect(findAnswerPath(structuredTranscriptFiles)).toBe( 'final-json-answer__codex/outputs/answer.md', @@ -115,7 +156,7 @@ describe('TranscriptTimeline', () => { const html = renderToStaticMarkup( , @@ -126,7 +167,7 @@ describe('TranscriptTimeline', () => { expect(html).toContain('/tmp/agentv-fixture'); }); - it('renders final answer separately from prior assistant/tool context with normalized JSONL access', () => { + it('renders final answer separately from prior assistant/tool context with normalized JSON access', () => { const html = renderStructuredTranscript(); expect(html).toContain('Final answer'); @@ -137,8 +178,8 @@ describe('TranscriptTimeline', () => { expect(html).toContain('Arguments'); expect(html).toContain('Result'); expect(html).toContain('success'); - expect(html).toContain('Open normalized JSONL'); - expect(html).toContain('Download normalized JSONL'); + expect(html).toContain('Open normalized JSON'); + expect(html).toContain('Download normalized JSON'); expect(html).toContain('{"answer":42,"source":"src/app.ts"}'); }); }); diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index 32571a674..64a77c52f 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -106,6 +106,7 @@ export interface EvalCaseTrial { duration_ms?: number; total_tool_calls?: number; tool_calls?: Record; + transcript_summary?: TranscriptSummary; metrics_path?: string; timing_path?: string; grading_path?: string; @@ -248,6 +249,7 @@ export interface EvalResult { metrics_path?: string; transcript_path?: string; transcript_raw_path?: string; + transcript_summary?: TranscriptSummary; output_path?: string; answer_path?: string; } @@ -301,6 +303,17 @@ export interface EvalDetailResponse { eval: EvalResult; } +export interface TranscriptSummary { + total_turns: number; + tool_calls: Record; + files_read: string[]; + files_modified: string[]; + shell_commands: string[]; + web_fetches: string[]; + errors: Array>; + thinking_blocks: number; +} + export type TranscriptArtifactStatus = 'ok' | 'missing' | 'dangling' | 'unsupported'; export interface TranscriptArtifactResponse { diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index dc881d48b..1a6415611 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -137,7 +137,7 @@ my-results/ grading.json metrics.json timing.json - transcript.jsonl + transcript.json transcript-raw.jsonl outputs/answer.md outputs/file_changes.diff # when workspace changes are captured @@ -460,7 +460,7 @@ See the [Import tool docs](/docs/tools/import/) for all providers and options. Each result row's `result_dir` is an allocated folder under the timestamped run bundle, usually with a readable test-id prefix plus a short hash suffix. It can -include `transcript.jsonl`, `transcript-raw.jsonl`, `grading.json`, +include `transcript.json`, `transcript-raw.jsonl`, `grading.json`, `timing.json`, `metrics.json`, and generated outputs under `outputs/`. The run root does not contain target, model, or `cases/` folders, and it does not contain a mixed transcript artifact; use each index row's `transcript_path` to find the @@ -489,10 +489,10 @@ AgentV does not persist a public `trace.json` sidecar in run bundles. Use `external_trace` metadata for link-out correlation when another observability system already owns spans. -`transcript.jsonl` is the canonical AgentV transcript/timeline artifact. -It uses provider-neutral `agentv.transcript.v1` rows with stable top-level fields -for message order, role/content, tool calls and paired results, timing, token -usage, cost, source metadata, capture state, and trace pointers. +`transcript.json` is the canonical AgentV transcript/timeline artifact. +It uses provider-neutral `agentv.normalized_transcript.v1` data with stable +fields for message order, role/content, canonical `tool_name` values, paired +tool results, and `transcript_summary`. Provider-native payloads can appear only inside opaque nested fields such as `metadata`, `source.metadata`, tool `input`, or tool `output`. diff --git a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx index f6f132412..77acc974f 100644 --- a/apps/web/src/content/docs/docs/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/reference/result-artifacts.mdx @@ -40,7 +40,7 @@ The default local layout is: grading.json metrics.json timing.json - transcript.jsonl + transcript.json transcript-raw.jsonl outputs/ answer.md @@ -50,7 +50,7 @@ The default local layout is: grading.json metrics.json timing.json - transcript.jsonl + transcript.json transcript-raw.jsonl outputs/ answer.md @@ -91,7 +91,7 @@ reserved for rebuildable local state and are skipped by run discovery. | `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. | | `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and code graders still receive the same full diff through `file_changes`. | | `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. | -| `transcript.jsonl` | AgentV-normalized transcript/timeline rows. | Portable human review, replay, transcript-aware graders, and tool-trajectory analysis. | +| `transcript.json` | AgentV-normalized transcript/timeline document with canonical `tool_name` values and `transcript_summary`. | Portable human review, transcript-aware graders, and tool-trajectory analysis. | | `transcript-raw.jsonl` | Native provider or harness evidence when available. | Parser debugging, forensic review, and preserving source bytes without making provider schemas public AgentV fields. | | `test/` | Generated test bundle for the exact eval slice and target settings that produced a row. | Audit, external review, and rerun workflows that should not depend on a mutable source checkout. | | `artifact_pointers` | Offload indirection for large detached payload bytes. | Finding payloads published outside the primary metadata/control-plane branch, such as transcript bytes on `agentv/artifacts/v1`. | @@ -136,8 +136,18 @@ Example row: "grading_path": "refund-eligibility--4f9a7c2d1b6e/run-1/grading.json", "metrics_path": "refund-eligibility--4f9a7c2d1b6e/run-1/metrics.json", "timing_path": "refund-eligibility--4f9a7c2d1b6e/run-1/timing.json", - "transcript_path": "refund-eligibility--4f9a7c2d1b6e/run-1/transcript.jsonl", + "transcript_path": "refund-eligibility--4f9a7c2d1b6e/run-1/transcript.json", "transcript_raw_path": "refund-eligibility--4f9a7c2d1b6e/run-1/transcript-raw.jsonl", + "transcript_summary": { + "total_turns": 4, + "tool_calls": { "file_read": 2, "shell": 1, "unknown": 0 }, + "files_read": ["src/refunds.ts"], + "files_modified": ["src/refunds.ts"], + "shell_commands": ["bun test refunds.test.ts"], + "web_fetches": [], + "errors": [], + "thinking_blocks": 1 + }, "output_path": "refund-eligibility--4f9a7c2d1b6e/run-1/outputs/answer.md", "answer_path": "refund-eligibility--4f9a7c2d1b6e/run-1/outputs/answer.md", "file_changes_path": "refund-eligibility--4f9a7c2d1b6e/run-1/outputs/file_changes.diff", diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index baa7ebbaf..834b0680c 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -149,7 +149,7 @@ row keys. Rows without `schema_version`, `capture`, or `trace` from older AgentV transcript exports remain replayable. New eval run artifacts write the v1 shape. -For eval run artifacts, `transcript.jsonl` is the portable message/event +For eval run artifacts, `transcript.json` is the portable message/event projection. AgentV does not persist a public `trace.json` run sidecar, and the transcript is not a provider-native session dump. Provider-native session or stream logs, when captured during a new eval run, are preserved in diff --git a/apps/web/src/content/docs/docs/tools/results.mdx b/apps/web/src/content/docs/docs/tools/results.mdx index dc1a28075..b5fbd43fd 100644 --- a/apps/web/src/content/docs/docs/tools/results.mdx +++ b/apps/web/src/content/docs/docs/tools/results.mdx @@ -130,7 +130,7 @@ token/cost usage. Every case uses aggregate `summary.json`, then stores execution artifact details under `run-N/`. Each `run-N/` contains a compact per-attempt manifest `result.json`, `grading.json`, `metrics.json`, `timing.json`, -`transcript.jsonl`, `transcript-raw.jsonl`, `outputs/answer.md`, and +`transcript.json`, `transcript-raw.jsonl`, `outputs/answer.md`, and `outputs/file_changes.diff` when workspace changes were captured. The `result.json` file carries AgentV `execution_status` and `verdict` fields plus `grading_path`, `metrics_path`, transcript, output, and `file_changes_path` @@ -139,8 +139,9 @@ dimension; stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from folder names. `transcript-raw.jsonl` preserves native provider or harness transcript bytes -when they are available, while `transcript.jsonl` is the normalized -conversation transcript with joined `tool_use.result` blocks. AgentV does not +when they are available, while `transcript.json` is the normalized +conversation transcript with canonical `tool_name` values, joined +`tool_use.result` blocks, and a precomputed `transcript_summary`. AgentV does not persist a public `trace.json` sidecar in run bundles; external observability systems can be linked through safe `external_trace` metadata when available. `summary.json` remains the run-level aggregate summary. `index.jsonl` is the @@ -172,7 +173,7 @@ Vercel `@vercel/agent-eval` `results.o11y` maps into AgentV like this: | `toolCalls` | `metrics.tool_call_events`, `metrics.tool_calls`, and `metrics.tool_call_counts` | `metrics.json`; compact counts can also appear in `summary.json.run_summary[*].tool_calls` | | `totalToolCalls` | `metrics.total_tool_calls` | `metrics.json` | | `webFetches` | `metrics.web_fetches` | `metrics.json` | -| `totalTurns` | `metrics.total_turns` | `metrics.json`; conversational rows remain in `transcript.jsonl` | +| `totalTurns` | `metrics.total_turns` | `metrics.json`; conversational turns remain in `transcript.json` | | `errors` | `metrics.errors` | `metrics.json` | | `thinkingBlocks` | `metrics.reasoning_blocks` and `thinking_blocks` | `metrics.json` | @@ -182,11 +183,11 @@ Agent Skills eval artifacts map into AgentV like this: |----------------------|--------------|-------------------| | Authored `evals/evals.json` cases | AgentV eval cases and test bundle paths | Eval source plus optional `test_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` | | Per-case answer | Generated target output artifact | `run-N/outputs/answer.md` | -| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.jsonl`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json` | +| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `run-N/transcript.json`, `run-N/transcript-raw.jsonl`, `run-N/metrics.json` | | Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `run-N/timing.json` | | Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `run-N/grading.json`; summary fields can reference the same trace/result facts | | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` | -| Transcript/log outlier analysis | Normalized transcript, raw evidence, metrics, and optional external trace link | `transcript.jsonl` for portable review; `transcript-raw.jsonl` for native evidence; `metrics.json` for behavior summaries; `external_trace` for link-out correlation | +| Transcript/log outlier analysis | Normalized transcript, raw evidence, metrics, and optional external trace link | `transcript.json` for portable review; `transcript-raw.jsonl` for native evidence; `metrics.json` for behavior summaries; `external_trace` for link-out correlation | | Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles | ### Vendor-neutral projection bundle diff --git a/packages/core/src/evaluation/metrics.ts b/packages/core/src/evaluation/metrics.ts index cdb6585f1..481616ee5 100644 --- a/packages/core/src/evaluation/metrics.ts +++ b/packages/core/src/evaluation/metrics.ts @@ -4,7 +4,7 @@ * This is a derived per-case executor metrics projection over `EvaluationResult` * and the internal trace envelope. It aligns with AgentV's case-local `metrics.json` * while carrying compact executor observability fields. It is not the - * canonical trace store; portable transcript detail stays in `transcript.jsonl`, and + * canonical trace store; portable transcript detail stays in `transcript.json`, and * duration/token/cost usage stays in `timing.json`. */ diff --git a/packages/core/src/evaluation/result-artifact-contract.ts b/packages/core/src/evaluation/result-artifact-contract.ts index 41ce99347..cf5d3e310 100644 --- a/packages/core/src/evaluation/result-artifact-contract.ts +++ b/packages/core/src/evaluation/result-artifact-contract.ts @@ -26,13 +26,14 @@ export const AGENTV_RESULTS_REFS = { oplog: AGENTV_RESULTS_OPLOG_REF, } as const; -export const CANONICAL_TRANSCRIPT_ARTIFACT_PATH = 'transcript.jsonl' as const; +export const CANONICAL_TRANSCRIPT_ARTIFACT_PATH = 'transcript.json' as const; export const CANONICAL_METRICS_ARTIFACT_PATH = 'metrics.json' as const; export const CANONICAL_FILE_CHANGES_ARTIFACT_PATH = 'outputs/file_changes.diff' as const; -export const TRANSCRIPT_SCHEMA_VERSION = 'agentv.transcript.v1' as const; +export const TRANSCRIPT_SCHEMA_VERSION = 'agentv.normalized_transcript.v1' as const; export const METRICS_SCHEMA_VERSION = 'agentv.metrics.v1' as const; -export const TRANSCRIPT_JSONL_MEDIA_TYPE = 'application/x-ndjson' as const; +export const TRANSCRIPT_JSON_MEDIA_TYPE = + 'application/vnd.agentv.normalized-transcript.v1+json' as const; export const METRICS_JSON_MEDIA_TYPE = 'application/vnd.agentv.metrics.v1+json' as const; export type AgentVResultsRefName = (typeof AGENTV_RESULTS_REFS)[keyof typeof AGENTV_RESULTS_REFS]; @@ -70,13 +71,13 @@ export interface ResultArtifactPointerWire { export type TranscriptArtifactPointer = ResultArtifactPointer & { readonly schemaVersion: typeof TRANSCRIPT_SCHEMA_VERSION; - readonly mediaType: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly mediaType: typeof TRANSCRIPT_JSON_MEDIA_TYPE; readonly family: 'transcripts'; }; export type TranscriptArtifactPointerWire = ResultArtifactPointerWire & { readonly schema_version: typeof TRANSCRIPT_SCHEMA_VERSION; - readonly media_type: typeof TRANSCRIPT_JSONL_MEDIA_TYPE; + readonly media_type: typeof TRANSCRIPT_JSON_MEDIA_TYPE; readonly family: 'transcripts'; }; diff --git a/packages/core/src/evaluation/result-row-schema.ts b/packages/core/src/evaluation/result-row-schema.ts index 382c2372b..5179601d2 100644 --- a/packages/core/src/evaluation/result-row-schema.ts +++ b/packages/core/src/evaluation/result-row-schema.ts @@ -48,6 +48,7 @@ const RESULT_ROW_ALIASES = { tracePath: 'trace_path', transcriptPath: 'transcript_path', transcriptRawPath: 'transcript_raw_path', + transcriptSummary: 'transcript_summary', workspacePath: 'workspace_path', } as const; diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index b59c80d80..fe46c845c 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -13,7 +13,7 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { - traceEnvelopeToNormalizedTranscriptJsonLines, + traceEnvelopeToNormalizedTranscriptJson, traceEnvelopeToTranscriptJsonLines, } from '../import/types.js'; import { parseEvaluationResultBoundary, toCamelCaseDeep } from './case-conversion.js'; @@ -49,6 +49,7 @@ import { traceEnvelopeToTranscriptMessages, } from './trace-envelope.js'; import { type TokenUsage, type TraceSummary, buildTraceFromMessages } from './trace.js'; +import { type TranscriptSummaryWire, buildTranscriptSummary } from './transcript-summary.js'; import type { EvalTest, EvaluationResult, @@ -501,6 +502,7 @@ export interface IndexArtifactEntry { readonly answer_path?: string; readonly transcript_path?: string; readonly transcript_raw_path?: string; + readonly transcript_summary?: TranscriptSummaryWire; readonly metrics_path?: string; readonly file_changes_path?: string; readonly artifact_pointers?: ResultArtifactPointersWire; @@ -557,6 +559,7 @@ export interface AgentVRunResultArtifact { readonly file_changes_path?: string; readonly transcript_path?: string; readonly transcript_raw_path?: string; + readonly transcript_summary?: TranscriptSummaryWire; readonly o11y: { readonly total_turns: number; readonly tool_calls: Record; @@ -935,6 +938,20 @@ function toFilePathList(entries: readonly unknown[]): readonly string[] { .filter((entry): entry is string => entry !== undefined); } +function resultTranscriptProviderId(result: EvaluationResult): string | undefined { + const provider = result.trace.metadata?.provider; + return typeof provider === 'string' && provider.trim().length > 0 ? provider : result.target; +} + +function buildResultTranscriptSummary(result: EvaluationResult): TranscriptSummaryWire { + return buildTranscriptSummary({ + messages: result.trace.messages ?? [], + providerId: resultTranscriptProviderId(result), + fileChanges: result.fileChanges, + error: result.error, + }); +} + function buildAgentVRunResultArtifact(params: { readonly trial: TrialResult; readonly result: EvaluationResult; @@ -960,6 +977,9 @@ function buildAgentVRunResultArtifact(params: { file_changes_path: fileChangesPath, transcript_path: params.hasTranscript ? `./${CANONICAL_TRANSCRIPT_ARTIFACT_PATH}` : undefined, transcript_raw_path: params.hasTranscript ? './transcript-raw.jsonl' : undefined, + transcript_summary: params.hasTranscript + ? buildResultTranscriptSummary(params.result) + : undefined, o11y: { total_turns: metrics.total_turns, tool_calls: metrics.tool_calls, @@ -1063,7 +1083,7 @@ async function writeTrialRunArtifacts(params: { await writeFile(fileChangesPath, result.fileChanges, 'utf8'); } if (transcriptPath && transcriptRawPath) { - await writeNormalizedTranscriptJsonl(transcriptPath, envelope); + await writeNormalizedTranscriptJson(transcriptPath, envelope, result); await writeRawTranscriptJsonl(transcriptRawPath, result, envelope); } const metricsArtifact = await writeMetricsArtifact({ @@ -1767,6 +1787,7 @@ export function buildIndexArtifactEntry( transcript_raw_path: options.transcriptRawPath ? toRelativeArtifactPath(options.outputDir, options.transcriptRawPath) : undefined, + transcript_summary: options.transcriptPath ? buildResultTranscriptSummary(result) : undefined, metrics_path: options.metricsPath ? toRelativeArtifactPath(options.outputDir, options.metricsPath) : undefined, @@ -1855,6 +1876,8 @@ export function buildResultIndexArtifact( isSingleRun && hasTranscript ? path.posix.join(singleRunDir, 'transcript-raw.jsonl') : undefined, + transcript_summary: + isSingleRun && hasTranscript ? buildResultTranscriptSummary(result) : undefined, artifact_pointers: options?.artifactPointers, runtime_source: options?.runtimeSource, ...extraIndexFields, @@ -1877,14 +1900,16 @@ function hasTranscriptProjection(result: EvaluationResult, envelope: TraceEnvelo return result.output.length > 0 || traceEnvelopeToTranscriptMessages(envelope).length > 0; } -async function writeNormalizedTranscriptJsonl( +async function writeNormalizedTranscriptJson( filePath: string, envelope: TraceEnvelope, + result: EvaluationResult, ): Promise { - const lines = traceEnvelopeToNormalizedTranscriptJsonLines(envelope); - const content = - lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join('\n')}\n` : ''; - await writeFile(filePath, content, 'utf8'); + const transcript = traceEnvelopeToNormalizedTranscriptJson(envelope, { + fileChanges: result.fileChanges, + error: result.error, + }); + await writeFile(filePath, `${JSON.stringify(transcript, null, 2)}\n`, 'utf8'); } async function writeGeneratedRawTranscriptJsonl( diff --git a/packages/core/src/evaluation/trace-envelope.ts b/packages/core/src/evaluation/trace-envelope.ts index a22a3efca..c016663c0 100644 --- a/packages/core/src/evaluation/trace-envelope.ts +++ b/packages/core/src/evaluation/trace-envelope.ts @@ -8,11 +8,11 @@ * `gen_ai.operation.name` and `openinference.span.kind` are copied exactly and * never case-converted. * - * Derived views such as Provider `Message[]`, `transcript.jsonl`, + * Derived views such as Provider `Message[]`, `transcript.json`, * `TraceSummary`, compact tool trajectories, replay provider responses, and - * OTLP JSON export bodies must project from this artifact. Transcript JSONL - * uses AgentV transcript events on the root span so compatibility rows can - * include input/system turns without changing replay's assistant-only view. + * OTLP JSON export bodies must project from this artifact. Transcript + * projections use AgentV transcript events on the root span so compatibility + * rows can include input/system turns without changing replay's assistant-only view. * Do not introduce a second canonical graph for those compatibility/read * models. * diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 48c3c1120..8c9720c00 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -729,7 +729,7 @@ function toTraceError(error: TraceError | string): TraceError { /** * Build the result-local trace read model for an evaluation case from provider * messages and execution metrics. This is the projection used by result JSONL, - * code-grader stdin, `outputs/answer.md`, and `transcript.jsonl`. + * code-grader stdin, `outputs/answer.md`, and `transcript.json`. */ export function buildTraceFromMessages(options: BuildTraceOptions = {}): Trace { const messages = buildTraceMessages(options.input, options.output); diff --git a/packages/core/src/evaluation/transcript-summary.ts b/packages/core/src/evaluation/transcript-summary.ts new file mode 100644 index 000000000..582996c1a --- /dev/null +++ b/packages/core/src/evaluation/transcript-summary.ts @@ -0,0 +1,383 @@ +import { normalizeToolCall } from './providers/normalize-tool-call.js'; +import { + KNOWN_PROVIDERS, + type Message, + type ProviderKind, + type ToolCall, +} from './providers/types.js'; + +export const CANONICAL_TRANSCRIPT_TOOL_NAMES = [ + 'file_read', + 'file_write', + 'file_edit', + 'shell', + 'web_fetch', + 'web_search', + 'glob', + 'grep', + 'list_dir', + 'agent_task', + 'unknown', +] as const; + +export type CanonicalTranscriptToolName = (typeof CANONICAL_TRANSCRIPT_TOOL_NAMES)[number]; + +export interface TranscriptSummaryErrorWire { + readonly message: string; + readonly tool_call_id?: string; + readonly tool_name?: CanonicalTranscriptToolName; +} + +export interface TranscriptSummaryWire { + readonly total_turns: number; + readonly tool_calls: Record; + readonly files_read: readonly string[]; + readonly files_modified: readonly string[]; + readonly shell_commands: readonly string[]; + readonly web_fetches: readonly string[]; + readonly errors: readonly TranscriptSummaryErrorWire[]; + readonly thinking_blocks: number; +} + +const PROVIDER_ALIASES: Readonly> = { + codex: 'codex', + 'codex-cli': 'codex', + 'codex-sdk': 'codex', + copilot: 'copilot-sdk', + 'copilot-cli': 'copilot-cli', + 'copilot-sdk': 'copilot-sdk', + 'copilot-log': 'copilot-log', + pi: 'pi-cli', + 'pi-cli': 'pi-cli', + 'pi-coding-agent': 'pi-coding-agent', + claude: 'claude', + 'claude-cli': 'claude-cli', + 'claude-sdk': 'claude-sdk', + vscode: 'vscode', + 'vscode-insiders': 'vscode-insiders', +}; + +const LEGACY_TOOL_NAME_MAP: Readonly> = { + Read: 'file_read', + Write: 'file_write', + Edit: 'file_edit', + Bash: 'shell', + Skill: 'agent_task', +}; + +const FILE_PATH_KEYS = new Set([ + 'file', + 'filename', + 'filepath', + 'file_path', + 'path', + 'targetfile', + 'targetpath', + 'relativepath', + '_extractedpath', +]); + +const COMMAND_KEYS = new Set([ + 'cmd', + 'command', + 'script', + 'shellcommand', + 'extractedcommand', + '_extractedcommand', +]); + +const URL_KEYS = new Set(['url', 'uri', 'href', 'extractedurl', '_extractedurl']); + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function normalizedKey(key: string): string { + return key.replace(/[^A-Za-z0-9_]+/g, '').toLowerCase(); +} + +function normalizeProviderId(providerId: string | undefined): ProviderKind | undefined { + if (!providerId) { + return undefined; + } + const normalized = providerId.trim().toLowerCase(); + if (!normalized) { + return undefined; + } + if ((KNOWN_PROVIDERS as readonly string[]).includes(normalized)) { + return normalized as ProviderKind; + } + return PROVIDER_ALIASES[normalized]; +} + +function genericCanonicalToolName(toolName: string): CanonicalTranscriptToolName { + const normalized = normalizedKey(toolName); + if ( + normalized === 'file_read' || + normalized === 'read' || + normalized === 'readfile' || + normalized === 'read_file' || + normalized.includes('fileread') || + normalized === 'viewfile' + ) { + return 'file_read'; + } + if ( + normalized === 'file_write' || + normalized === 'write' || + normalized === 'writefile' || + normalized === 'write_file' || + normalized.includes('filewrite') || + normalized === 'createfile' + ) { + return 'file_write'; + } + if ( + normalized === 'file_edit' || + normalized === 'edit' || + normalized === 'editfile' || + normalized === 'edit_file' || + normalized.includes('fileedit') || + normalized.includes('filechange') || + normalized.includes('applypatch') || + normalized.includes('replaceinfile') + ) { + return 'file_edit'; + } + if ( + normalized === 'bash' || + normalized === 'shell' || + normalized.includes('shell') || + normalized.includes('terminal') || + normalized.includes('commandexecution') || + normalized.includes('execcommand') + ) { + return 'shell'; + } + if ( + normalized === 'web_search' || + normalized.includes('websearch') || + normalized === 'searchweb' + ) { + return 'web_search'; + } + if ( + normalized === 'web_fetch' || + normalized.includes('webfetch') || + normalized.includes('fetchurl') || + normalized === 'fetch' || + normalized === 'fetchdoc' || + normalized === 'httpget' + ) { + return 'web_fetch'; + } + if (normalized === 'glob' || normalized.includes('glob')) { + return 'glob'; + } + if (normalized === 'grep' || normalized.includes('grep') || normalized.includes('ripgrep')) { + return 'grep'; + } + if ( + normalized === 'list_dir' || + normalized.includes('listdir') || + normalized.includes('lsdir') || + normalized === 'ls' + ) { + return 'list_dir'; + } + if ( + normalized === 'skill' || + normalized.includes('agenttask') || + normalized.includes('subagent') || + normalized.startsWith('mcp') + ) { + return 'agent_task'; + } + return 'unknown'; +} + +export function canonicalTranscriptToolName( + toolName: string | undefined, + providerId?: string, +): CanonicalTranscriptToolName { + if (!toolName) { + return 'unknown'; + } + const providerKind = normalizeProviderId(providerId); + const providerNormalized = providerKind + ? normalizeToolCall(providerKind, { tool: toolName }) + : undefined; + const routedName = providerNormalized?.tool ?? toolName; + return LEGACY_TOOL_NAME_MAP[routedName] ?? genericCanonicalToolName(routedName); +} + +function emptyToolCallCounts(): Record { + return Object.fromEntries( + CANONICAL_TRANSCRIPT_TOOL_NAMES.map((toolName) => [toolName, 0]), + ) as Record; +} + +function stringValuesByKey(value: unknown, keys: ReadonlySet, maxDepth = 6): string[] { + const values = new Set(); + function visit(entry: unknown, depth: number): void { + if (depth > maxDepth) { + return; + } + if (Array.isArray(entry)) { + for (const item of entry) { + visit(item, depth + 1); + } + return; + } + if (!isRecord(entry)) { + return; + } + for (const [key, nested] of Object.entries(entry)) { + if (keys.has(normalizedKey(key))) { + if (typeof nested === 'string' && nested.trim().length > 0) { + values.add(nested.trim()); + } else if (Array.isArray(nested)) { + for (const item of nested) { + if (typeof item === 'string' && item.trim().length > 0) { + values.add(item.trim()); + } + } + } + } + visit(nested, depth + 1); + } + } + visit(value, 0); + return [...values]; +} + +function firstStringByKey(value: unknown, keys: ReadonlySet): string | undefined { + return stringValuesByKey(value, keys)[0]; +} + +function parseModifiedPathsFromDiff(fileChanges: string | undefined): string[] { + if (!fileChanges) { + return []; + } + const paths = new Set(); + const lines = fileChanges.split('\n'); + for (let index = 0; index < lines.length - 1; index++) { + const oldLine = lines[index]; + const newLine = lines[index + 1]; + if (!oldLine.startsWith('--- a/') || !newLine?.startsWith('+++ b/')) { + continue; + } + const filePath = newLine.slice('+++ b/'.length).trim(); + if (filePath && filePath !== '/dev/null') { + paths.add(filePath); + } + } + return [...paths]; +} + +function collectThinkingBlocks(messages: readonly Message[]): number { + let count = 0; + for (const message of messages) { + const content = message.content; + if (!Array.isArray(content)) { + continue; + } + for (const block of content) { + const entry = block as unknown; + if (!isRecord(entry)) { + continue; + } + if (entry.type === 'thinking' || entry.type === 'reasoning') { + count += 1; + } + } + } + return count; +} + +function errorFromToolCall( + toolCall: ToolCall, + toolName: CanonicalTranscriptToolName, +): TranscriptSummaryErrorWire | undefined { + if ( + toolCall.status !== 'error' && + toolCall.status !== 'timeout' && + toolCall.status !== 'cancelled' + ) { + return undefined; + } + const output = toolCall.output; + const message = + typeof output === 'string' + ? output + : isRecord(output) && typeof output.message === 'string' + ? output.message + : `Tool ${toolCall.tool} ${toolCall.status}`; + return { + message, + ...(toolCall.id ? { tool_call_id: toolCall.id } : {}), + tool_name: toolName, + }; +} + +export function buildTranscriptSummary(params: { + readonly messages: readonly Message[]; + readonly providerId?: string; + readonly fileChanges?: string; + readonly error?: string; +}): TranscriptSummaryWire { + const toolCalls = emptyToolCallCounts(); + const filesRead = new Set(); + const filesModified = new Set(parseModifiedPathsFromDiff(params.fileChanges)); + const shellCommands: string[] = []; + const webFetches: string[] = []; + const errors: TranscriptSummaryErrorWire[] = []; + + for (const message of params.messages) { + for (const toolCall of message.toolCalls ?? []) { + const toolName = canonicalTranscriptToolName(toolCall.tool, params.providerId); + toolCalls[toolName] += 1; + if (toolName === 'file_read') { + for (const filePath of stringValuesByKey(toolCall.input, FILE_PATH_KEYS)) { + filesRead.add(filePath); + } + } else if (toolName === 'file_write' || toolName === 'file_edit') { + for (const filePath of stringValuesByKey(toolCall.input, FILE_PATH_KEYS)) { + filesModified.add(filePath); + } + } else if (toolName === 'shell') { + const command = firstStringByKey(toolCall.input, COMMAND_KEYS); + if (command) { + shellCommands.push(command); + } + } else if (toolName === 'web_fetch') { + const url = firstStringByKey(toolCall.input, URL_KEYS); + if (url) { + webFetches.push(url); + } + } + + const toolError = errorFromToolCall(toolCall, toolName); + if (toolError) { + errors.push(toolError); + } + } + } + + if (params.error) { + errors.unshift({ message: params.error }); + } + + return { + total_turns: params.messages.filter((message) => + ['system', 'user', 'assistant'].includes(message.role), + ).length, + tool_calls: toolCalls, + files_read: [...filesRead], + files_modified: [...filesModified], + shell_commands: shellCommands, + web_fetches: webFetches, + errors, + thinking_blocks: collectThinkingBlocks(params.messages), + }; +} diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts index ccc174f7b..989b7d969 100644 --- a/packages/core/src/import/types.ts +++ b/packages/core/src/import/types.ts @@ -17,6 +17,7 @@ import { readFile } from 'node:fs/promises'; import type { Message, ProviderTokenUsage, ToolCall } from '../evaluation/providers/types.js'; +import { TRANSCRIPT_SCHEMA_VERSION } from '../evaluation/result-artifact-contract.js'; import { EXECUTION_TRACE_SCHEMA_VERSION, type TraceEnvelope, @@ -25,6 +26,12 @@ import { traceEnvelopeToTranscriptMessages, } from '../evaluation/trace-envelope.js'; import { type Trace, buildTraceFromMessages } from '../evaluation/trace.js'; +import { + type CanonicalTranscriptToolName, + type TranscriptSummaryWire, + buildTranscriptSummary, + canonicalTranscriptToolName, +} from '../evaluation/transcript-summary.js'; export const TRANSCRIPT_ROW_SCHEMA_VERSION = 'agentv.transcript.v1' as const; @@ -152,6 +159,7 @@ export type NormalizedTranscriptContentBlock = | { readonly type: 'tool_use'; readonly id: string; + readonly tool_name: CanonicalTranscriptToolName; readonly name: string; readonly input: unknown; readonly result?: { @@ -187,6 +195,14 @@ export interface NormalizedTranscriptJsonLine { readonly raw_refs?: readonly NormalizedTranscriptRawRef[]; } +export interface NormalizedTranscriptJson { + readonly schema_version: typeof TRANSCRIPT_SCHEMA_VERSION; + readonly provider_id: string; + readonly target: string; + readonly transcript_summary: TranscriptSummaryWire; + readonly turns: readonly NormalizedTranscriptJsonLine[]; +} + /** * Grouped replayable transcript reconstructed from per-message rows. */ @@ -515,10 +531,12 @@ function normalizedToolBlock( toolCall: ToolCall, messageIndex: number, toolIndex: number, + providerId: string | undefined, ): NormalizedTranscriptContentBlock { return dropUndefined({ type: 'tool_use', id: toolCall.id ?? `tool_${messageIndex + 1}_${toolIndex + 1}`, + tool_name: canonicalTranscriptToolName(toolCall.tool, providerId), name: toolCall.tool, input: toolCall.input ?? {}, result: normalizedToolResult(toolCall), @@ -540,6 +558,7 @@ function normalizedImageMetadata( function normalizedContentBlocks( message: Message, messageIndex: number, + providerId: string | undefined, ): NormalizedTranscriptContentBlock[] { const blocks: NormalizedTranscriptContentBlock[] = []; const content = message.content; @@ -580,7 +599,7 @@ function normalizedContentBlocks( } for (const [toolIndex, toolCall] of (message.toolCalls ?? []).entries()) { - blocks.push(normalizedToolBlock(toolCall, messageIndex, toolIndex)); + blocks.push(normalizedToolBlock(toolCall, messageIndex, toolIndex, providerId)); } return blocks; @@ -612,6 +631,7 @@ function applyToolResultToPriorTurn( turns: NormalizedTranscriptJsonLine[], message: Message, messageIndex: number, + providerId: string | undefined, ): boolean { const name = message.name; for (let turnIndex = turns.length - 1; turnIndex >= 0; turnIndex -= 1) { @@ -650,6 +670,7 @@ function applyToolResultToPriorTurn( { type: 'tool_use', id: normalizedTurnId(message) ?? `tool_${messageIndex + 1}`, + tool_name: canonicalTranscriptToolName(name, providerId), name: name ?? 'tool', input: {}, result: { @@ -671,12 +692,13 @@ export function traceEnvelopeToNormalizedTranscriptJsonLines( const summary = traceEnvelopeToTraceSummary(envelope); const source = sourceFromEnvelope(envelope, summary); const agent = source.provider ?? envelope.eval.target ?? 'agentv'; + const providerId = source.provider ?? envelope.eval.target; const model = modelFromSource(source); const turns: NormalizedTranscriptJsonLine[] = []; messages.forEach((message, index) => { if (message.role === 'tool' || message.role === 'function') { - applyToolResultToPriorTurn(turns, message, index); + applyToolResultToPriorTurn(turns, message, index, providerId); return; } @@ -685,7 +707,7 @@ export function traceEnvelopeToNormalizedTranscriptJsonLines( return; } - const content = normalizedContentBlocks(message, index); + const content = normalizedContentBlocks(message, index, providerId); if (content.length === 0) { return; } @@ -708,6 +730,28 @@ export function traceEnvelopeToNormalizedTranscriptJsonLines( return turns; } +export function traceEnvelopeToNormalizedTranscriptJson( + envelope: TraceEnvelope, + options?: { fileChanges?: string; error?: string }, +): NormalizedTranscriptJson { + const messages = traceEnvelopeToTranscriptMessages(envelope); + const summary = traceEnvelopeToTraceSummary(envelope); + const source = sourceFromEnvelope(envelope, summary); + const providerId = source.provider ?? envelope.eval.target ?? 'agentv'; + return { + schema_version: TRANSCRIPT_SCHEMA_VERSION, + provider_id: providerId, + target: envelope.eval.target, + transcript_summary: buildTranscriptSummary({ + messages, + providerId, + fileChanges: options?.fileChanges, + error: options?.error, + }), + turns: traceEnvelopeToNormalizedTranscriptJsonLines(envelope), + }; +} + export function traceEnvelopeToTranscriptJsonLines( envelope: TraceEnvelope, options?: { testId?: string; target?: string }, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index b9a12327b..a7c3c5833 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -3,6 +3,7 @@ export * from './evaluation/types.js'; export * from './evaluation/trace.js'; export * from './evaluation/trace-envelope.js'; export * from './evaluation/metrics.js'; +export * from './evaluation/transcript-summary.js'; export * from './evaluation/dashboard-trace-read-model.js'; export * from './evaluation/trace-normalization.js'; export * from './evaluation/external-trace.js'; diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 73c544135..4acf92dd1 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -776,13 +776,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(readFileSync(path.join(runDir, 'transcript-raw.jsonl'), 'utf8')).toBe( '{"event":"provider-native"}\n', ); - expect(readdirSync(runDir)).toContain('transcript.jsonl'); + expect(readdirSync(runDir)).toContain('transcript.json'); expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.json'); expect(indexRows[0]?.raw_provider_log_path).toBeUndefined(); expect(indexRows[0]?.trace_path).toBeUndefined(); - expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/run-1/transcript.jsonl`); + expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/run-1/transcript.json`); expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/run-1/transcript-raw.jsonl`); expect(existsSync(rawLogPath)).toBe(false); }); From 1f602d05962bd42ac1f691f07954996799bc60f6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Jul 2026 16:24:19 +0200 Subject: [PATCH 2/2] fix(eval): project repeat transcript summaries --- apps/cli/src/commands/results/serve.ts | 5 ++ .../commands/eval/artifact-writer.test.ts | 26 ++++++- apps/cli/test/commands/results/serve.test.ts | 75 +++++++++++++++++++ packages/core/src/evaluation/run-artifacts.ts | 9 +++ 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index f62779226..b21e4fe4d 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1158,10 +1158,14 @@ function buildRepeatTrialReadModels( const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.json'); const transcriptRawPath = caseTrialArtifactPath(resultDir, runPath, 'transcript-raw.jsonl'); const answerPath = caseTrialArtifactPath(resultDir, runPath, 'outputs/answer.md'); + const resultPath = caseTrialArtifactPath(resultDir, runPath, 'result.json'); + const runResult = readArtifactJsonObject(baseDir, resultPath); const metrics = readArtifactJsonObject(baseDir, metricsPath); const timing = readArtifactJsonObject(baseDir, timingPath); const toolCalls = objectField(metrics, 'tool_calls'); const tokenUsage = objectField(timing, 'token_usage'); + const transcriptSummary = + objectField(trial, 'transcript_summary') ?? objectField(runResult, 'transcript_summary'); return { ...trial, @@ -1179,6 +1183,7 @@ function buildRepeatTrialReadModels( total_tool_calls: numberField(metrics, 'total_tool_calls'), }), ...(toolCalls && { tool_calls: toolCalls }), + ...(transcriptSummary && { transcript_summary: transcriptSummary }), ...(metricsPath && { metrics_path: metricsPath }), ...(timingPath && { timing_path: timingPath }), ...(gradingPath && { grading_path: gradingPath }), diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index f2bbdec95..790c810a3 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -746,6 +746,28 @@ describe('buildIndexArtifactEntry', () => { error: 'model drift', cost_usd: 0.25, execution_status: 'quality_failure', + transcript_summary: { + total_turns: 1, + tool_calls: { + file_read: 0, + file_write: 0, + file_edit: 0, + shell: 0, + web_fetch: 0, + web_search: 0, + glob: 0, + grep: 0, + list_dir: 0, + agent_task: 0, + unknown: 0, + }, + files_read: [], + files_modified: [], + shell_commands: [], + web_fetches: [], + errors: [{ message: 'model drift' }], + thinking_blocks: 0, + }, }, ], }); @@ -1159,7 +1181,7 @@ describe('writeArtifactsFromResults', () => { const [indexEntry] = await readIndexLines(paths.indexPath); const repeatRowDir = expectRowDir(indexEntry, 'repeat-case'); - expect(indexEntry?.trials).toEqual([ + expect(indexEntry?.trials).toMatchObject([ { attempt: 0, run_path: 'run-1', score: 0.25, verdict: 'fail' }, { attempt: 1, run_path: 'run-2', score: 1, verdict: 'pass' }, ]); @@ -1253,6 +1275,7 @@ describe('writeArtifactsFromResults', () => { }, }); expect(runOneResult).not.toHaveProperty('status'); + expect(indexEntry?.trials?.[0]?.transcript_summary).toEqual(runOneResult.transcript_summary); const runTwoAnswer = await readFile( path.join(paths.testArtifactDir, repeatRowDir, 'run-2', 'outputs', 'answer.md'), @@ -1278,6 +1301,7 @@ describe('writeArtifactsFromResults', () => { }, }); expect(runTwoResult).not.toHaveProperty('status'); + expect(indexEntry?.trials?.[1]?.transcript_summary).toEqual(runTwoResult.transcript_summary); }); it('handles empty results array', async () => { diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 55fc01cc7..8121fac19 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -2435,6 +2435,81 @@ describe('serve app', () => { expect(data.source_label).toBe(filename); }); + it('projects repeat trial transcript summaries from manifest rows and run result sidecars', async () => { + const runsDir = localResultsExperimentDir(tempDir); + const filename = '2026-03-25T10-05-00-000Z'; + const runDir = path.join(runsDir, filename); + const resultDir = 'demo/repeat-case'; + const firstSummary = { + total_turns: 1, + tool_calls: { shell: 1 }, + files_read: [], + files_modified: [], + shell_commands: ['bun test'], + web_fetches: [], + errors: [], + thinking_blocks: 0, + }; + const secondSummary = { + total_turns: 2, + tool_calls: { file_read: 1 }, + files_read: ['src/index.ts'], + files_modified: [], + shell_commands: [], + web_fetches: [], + errors: [], + thinking_blocks: 1, + }; + + mkdirSync(path.join(runDir, resultDir, 'run-1'), { recursive: true }); + mkdirSync(path.join(runDir, resultDir, 'run-2'), { recursive: true }); + writeFileSync( + path.join(runDir, resultDir, 'run-1', 'result.json'), + `${JSON.stringify({ transcript_summary: firstSummary })}\n`, + ); + writeFileSync( + path.join(runDir, resultDir, 'run-2', 'result.json'), + `${JSON.stringify({ transcript_summary: secondSummary })}\n`, + ); + writeFileSync( + path.join(runDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + test_id: 'repeat-case', + result_dir: resultDir, + trials: [ + { + attempt: 0, + run_path: 'run-1', + score: 0.25, + verdict: 'fail', + transcript_summary: firstSummary, + }, + { attempt: 1, run_path: 'run-2', score: 1, verdict: 'pass' }, + ], + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request(`/api/runs/${filename}`); + expect(res.status).toBe(200); + const data = (await res.json()) as { + results: Array<{ + trials?: Array<{ + transcript_path?: string; + transcript_summary?: Record; + }>; + }>; + }; + + expect(data.results[0]?.trials?.[0]?.transcript_summary).toEqual(firstSummary); + expect(data.results[0]?.trials?.[1]?.transcript_summary).toEqual(secondSummary); + expect(data.results[0]?.trials?.map((trial) => trial.transcript_path)).toEqual([ + `${resultDir}/run-1/transcript.json`, + `${resultDir}/run-2/transcript.json`, + ]); + }); + it('loads historical runs without test bundle metadata', async () => { const runId = writeLocalRunArtifact( tempDir, diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index fe46c845c..f8a6fc941 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -362,6 +362,7 @@ export type TrialResultArtifact = { readonly execution_status?: string; readonly failure_stage?: string; readonly failure_reason_code?: string; + readonly transcript_summary?: TranscriptSummaryWire; }; export type TrialAggregationArtifact = @@ -737,6 +738,13 @@ function hasPersistedTrialRuns(result: EvaluationResult): boolean { return (result.trials ?? []).some((trial) => trial.result !== undefined); } +function toTrialTranscriptSummary(trial: TrialResult): TranscriptSummaryWire | undefined { + const result = trial.result; + return result && resultHasExecutionTraceTranscript(result) + ? buildResultTranscriptSummary(result) + : undefined; +} + function toTrialArtifacts( trials: readonly TrialResult[] | undefined, ): readonly TrialResultArtifact[] | undefined { @@ -754,6 +762,7 @@ function toTrialArtifacts( execution_status: trial.executionStatus, failure_stage: trial.failureStage, failure_reason_code: trial.failureReasonCode, + transcript_summary: toTrialTranscriptSummary(trial), })); }