Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ interface EvaluatorScore {
readonly assertions: readonly { text: string; passed: boolean; evidence?: string }[];
}

/**
 * Expands a raw grader assertion into the shape written under
 * `assertion_results` in grading.json.
 *
 * @param assertion - Assertion text, pass flag, and optional evidence.
 * @returns The assertion with `evidence` defaulted to an empty string, plus a
 *   numeric `score` (1 when passed, 0 otherwise) and a `verdict` string
 *   (`'pass'` or `'fail'`).
 */
function toAssertionResult(assertion: { text: string; passed: boolean; evidence?: string }) {
  const { text, passed } = assertion;
  // `??` (not a destructuring default) so that a runtime `null` also becomes ''.
  const evidence = assertion.evidence ?? '';

  if (passed) {
    return { text, passed, evidence, score: 1, verdict: 'pass' };
  }
  return { text, passed, evidence, score: 0, verdict: 'fail' };
}

export const evalBenchCommand = command({
name: 'bench',
description: 'Merge grader scores and produce benchmark artifacts',
Expand Down Expand Up @@ -130,14 +140,18 @@ export const evalBenchCommand = command({

// Write grading.json
const grading = {
assertions: allAssertions,
score: Math.round(weightedScore * 1000) / 1000,
verdict: weightedScore >= DEFAULT_THRESHOLD ? 'pass' : 'fail',
assertion_results: allAssertions.map(toAssertionResult),
summary: { passed, failed, total: allAssertions.length, pass_rate: passRate },
graders: evaluators.map((e) => ({
name: e.name,
type: e.type,
score: e.score,
verdict: e.score >= DEFAULT_THRESHOLD ? 'pass' : 'fail',
reasoning: '',
weight: e.weight,
assertion_results: e.assertions.map(toAssertionResult),
})),
};
await writeFile(
Expand Down
101 changes: 78 additions & 23 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,74 @@ export interface ManifestHydrationOptions {
readonly hydrateTranscriptTrace?: boolean;
}

// Element type of the (non-nullish) `scores` array on an EvaluationResult.
type HydratedScore = NonNullable<EvaluationResult['scores']>[number];

/**
 * Normalizes an untyped array of assertion entries (as read from a grading
 * artifact) into `{ text, passed, evidence }` records.
 *
 * Each field is coerced defensively: `text` is stringified (missing → `''`),
 * `passed` is converted with `Boolean`, and `evidence` is kept only when it is
 * a string. Entries that are not objects — including `null` and `undefined`,
 * which would otherwise throw on property access — are treated as empty
 * records and therefore map to a blank, non-passing assertion.
 *
 * @param value - Candidate assertions array; anything else is rejected.
 * @returns The normalized assertions, or `undefined` when `value` is not an array.
 */
function mapGradingAssertions(
  value: unknown,
): NonNullable<EvaluationResult['assertions']> | undefined {
  if (!Array.isArray(value)) {
    return undefined;
  }
  return value.map((assertion: unknown) => {
    // Guard against null/undefined entries in hand-edited or truncated JSON.
    const record: Record<string, unknown> =
      typeof assertion === 'object' && assertion !== null
        ? (assertion as Record<string, unknown>)
        : {};
    return {
      text: String(record.text ?? ''),
      passed: Boolean(record.passed),
      evidence: typeof record.evidence === 'string' ? record.evidence : undefined,
    };
  });
}

/**
 * Extracts the assertions list from a grading record.
 *
 * `assertion_results` is used when it is an array; otherwise the value of
 * `assertions` is used as the fallback source. The chosen value is normalized
 * by `mapGradingAssertions`.
 *
 * @param record - Untyped grading (or grader) object.
 * @returns Normalized assertions, or `undefined` when neither key holds an array.
 */
function readGradingAssertionResults(
  record: Record<string, unknown>,
): NonNullable<EvaluationResult['assertions']> | undefined {
  const { assertion_results: preferred, assertions: fallback } = record;
  const source = Array.isArray(preferred) ? preferred : fallback;
  return mapGradingAssertions(source);
}

/**
 * Finds the nested score list on a grading record.
 *
 * The keys `scores`, `graders`, and `evaluators` are probed in that order and
 * the first one whose value is an array is returned as-is.
 *
 * @param record - Untyped grader/evaluator object.
 * @returns The first array found under those keys, or `undefined` if none is an array.
 */
function readNestedGradingScores(record: Record<string, unknown>): unknown {
  for (const key of ['scores', 'graders', 'evaluators']) {
    const candidate = record[key];
    if (Array.isArray(candidate)) {
      return candidate;
    }
  }
  return undefined;
}

/**
 * Converts one untyped grader/evaluator record into a `HydratedScore`.
 *
 * Field handling:
 * - `name` / `type` are stringified, defaulting to `''`.
 * - `score` falls back to `0` and `weight` to `undefined` when not numeric.
 * - `verdict` is kept only when it is exactly `'pass'`, `'fail'`, or `'skip'`.
 * - `details` is kept only when it is a non-null, non-array object.
 * - `assertions` defaults to an empty array when none can be read.
 * - `scores` is built recursively from whichever nested list the record carries.
 *
 * @param evaluator - Untyped grader/evaluator object.
 * @returns The hydrated score entry.
 */
function mapGradingEvaluator(evaluator: Record<string, unknown>): HydratedScore {
  const rawVerdict = evaluator.verdict;
  let verdict: 'pass' | 'fail' | 'skip' | undefined;
  if (rawVerdict === 'pass' || rawVerdict === 'fail' || rawVerdict === 'skip') {
    verdict = rawVerdict;
  }

  const rawDetails = evaluator.details;
  const hasObjectDetails =
    typeof rawDetails === 'object' && rawDetails !== null && !Array.isArray(rawDetails);
  const details = hasObjectDetails ? (rawDetails as HydratedScore['details']) : undefined;

  const rawScore = evaluator.score;
  const rawWeight = evaluator.weight;

  return {
    name: String(evaluator.name ?? ''),
    type: String(evaluator.type ?? '') as HydratedScore['type'],
    score: typeof rawScore === 'number' ? rawScore : 0,
    assertions: readGradingAssertionResults(evaluator) ?? [],
    scores: mapGradingEvaluators(readNestedGradingScores(evaluator)),
    weight: typeof rawWeight === 'number' ? rawWeight : undefined,
    verdict,
    details,
  };
}

/**
 * Hydrates an untyped list of grader/evaluator records.
 *
 * Entries that are not objects — notably `null` and `undefined`, which would
 * otherwise throw a TypeError on the first property read — are passed on as an
 * empty record, so they hydrate to a blank entry instead of aborting the whole
 * result.
 *
 * @param value - Candidate evaluators array; anything else is rejected.
 * @returns The hydrated scores, or `undefined` when `value` is not an array.
 */
function mapGradingEvaluators(value: unknown): EvaluationResult['scores'] | undefined {
  if (!Array.isArray(value)) {
    return undefined;
  }
  return value.map((evaluator: unknown) =>
    mapGradingEvaluator(
      typeof evaluator === 'object' && evaluator !== null
        ? (evaluator as Record<string, unknown>)
        : {},
    ),
  );
}

function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] {
return content
.split(/\r?\n/)
Expand Down Expand Up @@ -229,6 +297,14 @@ function hydrateManifestRecord(
const grading = readOptionalJson<GradingArtifact>(baseDir, record.grading_path);
const timing = readOptionalJson<TimingArtifact>(baseDir, record.timing_path);
const testId = record.test_id ?? 'unknown';
const gradingAssertions = grading
? readGradingAssertionResults(grading as unknown as Record<string, unknown>)
: undefined;
const gradingScores = mapGradingEvaluators(
grading?.graders ??
(grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined)
?.evaluators,
);

return {
timestamp: record.timestamp,
Expand All @@ -240,36 +316,15 @@ function hydrateManifestRecord(
score: record.score,
executionStatus: record.execution_status,
error: record.error,
assertions: grading?.assertions.map((assertion) => ({
assertions: gradingAssertions?.map((assertion) => ({
text: assertion.text,
passed: assertion.passed,
evidence: assertion.evidence,
})),
scores:
// `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts.
// TODO: remove `evaluators` fallback once old run directories are no longer in use.
(
grading?.graders ??
(grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined)
?.evaluators
)?.map((evaluator) => ({
name: evaluator.name,
type: evaluator.type,
score: evaluator.score,
assertions: Array.isArray(evaluator.assertions)
? evaluator.assertions.map((assertion) => ({
text: String((assertion as Record<string, unknown>).text ?? ''),
passed: Boolean((assertion as Record<string, unknown>).passed),
evidence:
typeof (assertion as Record<string, unknown>).evidence === 'string'
? String((assertion as Record<string, unknown>).evidence)
: undefined,
}))
: undefined,
weight: typeof evaluator.weight === 'number' ? evaluator.weight : undefined,
verdict: typeof evaluator.verdict === 'string' ? evaluator.verdict : undefined,
details: evaluator.details,
})) ?? (record.scores as EvaluationResult['scores']),
gradingScores ?? (record.scores as EvaluationResult['scores']),
tokenUsage: timing?.token_usage
? {
input: timing.token_usage.input,
Expand Down
11 changes: 9 additions & 2 deletions apps/cli/src/commands/results/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,17 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
} else {
try {
const grading = JSON.parse(readFileSync(gradingPath, 'utf8'));
if (!grading.assertions || !Array.isArray(grading.assertions)) {
if (Array.isArray(grading.assertion_results)) {
// Current grading sidecar contract.
} else if (Array.isArray(grading.assertions)) {
diagnostics.push({
severity: 'warning',
message: `${testId}: grading.json uses legacy 'assertions' array; rewrite the run to emit 'assertion_results'`,
});
} else {
diagnostics.push({
severity: 'error',
message: `${testId}: grading.json missing 'assertions' array`,
message: `${testId}: grading.json missing 'assertion_results' array`,
});
}
if (!grading.summary) {
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/test/commands/eval/aggregate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ describe('writePerTestArtifacts', () => {
const grading1 = JSON.parse(
readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'),
);
expect(grading1.assertions).toHaveLength(1);
expect(grading1.assertion_results).toHaveLength(1);

const timing1 = JSON.parse(
readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'),
Expand All @@ -301,7 +301,7 @@ describe('writePerTestArtifacts', () => {
const grading2 = JSON.parse(
readFileSync(rowRunPath(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'),
);
expect(grading2.assertions).toHaveLength(1);
expect(grading2.assertion_results).toHaveLength(1);
});

it('writes outputs/answer.md for results with output', async () => {
Expand Down
Loading
Loading