diff --git a/Makefile.toml b/Makefile.toml index d35f6b74..e9982276 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -392,23 +392,26 @@ args = [ # Real-world job benchmark smoke -# | task | type | cwd | -# | -------------------------------- | --------- | --- | -# | real-world-job-smoke | composite | | -# | real-world-job-smoke-json | command | | -# | real-world-job-smoke-report | command | | -# | real-world-memory | composite | | -# | real-world-memory-json | command | | -# | real-world-memory-report | command | | -# | real-world-memory-evolution | composite | | -# | real-world-memory-evolution-json | command | | -# | real-world-memory-evolution-report | command | | -# | real-world-job-operator-ux | composite | | -# | real-world-job-operator-ux-json | command | | -# | real-world-job-operator-ux-report | command | | -# | real-world-memory-retrieval | composite | | -# | real-world-memory-retrieval-json | command | | -# | real-world-memory-retrieval-report | command | | +# | task | type | cwd | +# | -------------------------------------- | --------- | --- | +# | real-world-job-smoke | composite | | +# | real-world-job-smoke-json | command | | +# | real-world-job-smoke-report | command | | +# | real-world-memory | composite | | +# | real-world-memory-json | command | | +# | real-world-memory-report | command | | +# | real-world-memory-evolution | composite | | +# | real-world-memory-evolution-json | command | | +# | real-world-memory-evolution-report | command | | +# | real-world-memory-consolidation | composite | | +# | real-world-memory-consolidation-json | command | | +# | real-world-memory-consolidation-report | command | | +# | real-world-job-operator-ux | composite | | +# | real-world-job-operator-ux-json | command | | +# | real-world-job-operator-ux-report | command | | +# | real-world-memory-retrieval | composite | | +# | real-world-memory-retrieval-json | command | | +# | real-world-memory-retrieval-report | command | | [tasks.real-world-job-smoke] workspace = false @@ -475,7 +478,7 @@ args = [ "--out", "tmp/real-world-memory/real-world-memory-report.json", "--run-id", - "real-world-memory-trust-resume-personalization", + "real-world-memory", "--adapter-id", "elf_real_world_memory_fixture", "--adapter-name", @@ -649,6 +652,55 @@ args = [ "tmp/real-world-memory/retrieval-report.md", ] +[tasks.real-world-memory-consolidation] +workspace = false +dependencies = [ + "real-world-memory-consolidation-report", +] + +[tasks.real-world-memory-consolidation-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/consolidation", + "--out", + "tmp/real-world-memory/consolidation/report.json", + "--run-id", + "real-world-memory-consolidation", + "--adapter-id", + "fixture_consolidation", + "--adapter-name", + "ELF consolidation fixture", +] + +[tasks.real-world-memory-consolidation-report] +workspace = false +dependencies = [ + "real-world-memory-consolidation-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/consolidation/report.json", + "--out", + "tmp/real-world-memory/consolidation/report.md", +] + # Meta # | task | type | cwd | diff --git a/README.md b/README.md index cae2d70b..c636f041 100644 --- a/README.md +++ b/README.md @@ -161,8 +161,10 @@ Detailed evidence and interpretation: - [Single-User Production Runbook](docs/guide/single_user_production.md) - Future benchmark contract: [Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md). - This contract defines job-level suites for agent work, but no system win is claimed - under it until a runner encodes and reports those suites. + This contract defines job-level suites for agent work. Checked-in fixture runners now + cover a smoke work-resume slice and proposal-only consolidation cases through + `cargo make real-world-job-smoke` and `cargo make real-world-memory-consolidation`, + but those reports are fixture-level evidence and not live external-adapter wins. Quick comparison snapshot (objective/high-level). This table compares capability coverage, not overall project quality. diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json new file mode 100644 index 00000000..e24e82a9 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json @@ -0,0 +1,284 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-contradiction-report-discard-001", + "suite": "consolidation", + "title": "Discard a stale contradiction proposal instead of mutating source truth", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-consolidation-source-rewrite", + "kind": "message", + "text": "Old session draft: consolidation may update source notes directly after it finds duplicates.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "stale-consolidation-source-rewrite" + } + }, + "created_at": "2026-06-01T08:00:00Z" + }, + { + "evidence_id": "current-consolidation-source-rule", + "kind": "decision", + "text": "Current rule: consolidation output is derived and reviewable; it must never destructively rewrite authoritative source notes, events, docs, traces, graph facts, or search traces.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "current-consolidation-source-rule" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "unsupported-applied-draft", + "kind": "message", + "text": "Unsupported draft: the consolidation worker already applied source note edits in production.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "unsupported-applied-draft" + } + }, + "created_at": "2026-06-09T08:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Create a contradiction report proposal and discard the stale source-rewrite synthesis. The report flags one unsupported applied-worker claim while keeping source mutation count at zero.", + "claims": [ + { + "claim_id": "contradiction_report_created", + "text": "Create a contradiction report proposal for the source-rewrite conflict.", + "evidence_ids": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "confidence": "high" + }, + { + "claim_id": "contradiction_report_discarded", + "text": "Discard the stale source-rewrite synthesis rather than applying it.", + "evidence_ids": [ + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-contradiction-report-discard", + "proposal_kind": "contradiction_report", + "source_refs": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "expected_source_refs": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "usefulness_score": 0.9, + "min_usefulness_score": 0.8, + "expected_review_action": "discard", + "actual_review_action": "discard", + "source_mutations": [], + "unsupported_claim_count": 1, + "diff": { + "summary": "Reject a stale source-rewrite synthesis and preserve it as a contradiction report.", + "before": {}, + "after": { + "target": "derived_contradiction_report", + "review_state": "rejected", + "unsupported_claims": [ + "The fixture has no evidence that a consolidation worker applied source note edits in production." + ], + "contradiction": "Older source-rewrite draft conflicts with the current proposal-only consolidation rule." + } + } + } + ], + "executable_gaps": [ + { + "primitive": "live_consolidation_worker_generation", + "follow_up_issue": "[ELF vNext P1] Implement reviewable consolidation worker and proposal review flow", + "reason": "This fixture scores checked-in proposal payloads; it does not execute scheduled LLM generation.", + "blocks_fixture_pass": false + } + ] + } + } + }, + "timeline": [ + { + "event_id": "contradiction-old-draft", + "ts": "2026-06-01T08:00:00Z", + "actor": "agent", + "action": "recorded_old_draft", + "evidence_ids": [ + "stale-consolidation-source-rewrite" + ], + "summary": "An old draft suggested source note rewrites." + }, + { + "event_id": "contradiction-current-rule", + "ts": "2026-06-09T08:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "current-consolidation-source-rule" + ], + "summary": "The current rule prohibits destructive source mutation." + }, + { + "event_id": "contradiction-unsupported-draft", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "flagged_unsupported_claim", + "evidence_ids": [ + "unsupported-applied-draft" + ], + "summary": "A draft claimed production application without source evidence." + } + ], + "prompt": { + "role": "user", + "content": "Consolidate these sessions, but discard stale or unsupported synthesis and report contradictions.", + "job_mode": "debug", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "contradiction_report_created", + "text": "Create a contradiction report proposal for the source-rewrite conflict." + }, + { + "claim_id": "contradiction_report_discarded", + "text": "Discard the stale source-rewrite synthesis rather than applying it." + } + ], + "must_not_include": [ + "source note edits in production are confirmed", + "apply the source-rewrite synthesis" + ], + "evidence_links": { + "contradiction_report_created": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "contradiction_report_discarded": [ + "current-consolidation-source-rule", + "unsupported-applied-draft" + ] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-consolidation-source-rule", + "claim_id": "contradiction_report_discarded", + "requirement": "cite", + "quote": "must never destructively rewrite authoritative source notes" + }, + { + "evidence_id": "unsupported-applied-draft", + "claim_id": "contradiction_report_created", + "requirement": "explain", + "quote": "already applied source note edits in production" + } + ], + "negative_traps": [ + { + "trap_id": "stale-source-rewrite-as-current", + "type": "stale_fact", + "evidence_ids": [ + "stale-consolidation-source-rewrite" + ], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates a contradiction report and discards stale synthesis." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Explains the conflict from current rule and unsupported draft evidence." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Contradiction report is useful for reviewer inspection." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes stale, current, and unsupported sources." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is discard." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json new file mode 100644 index 00000000..5af09e1d --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json @@ -0,0 +1,242 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-preference-candidate-defer-001", + "suite": "consolidation", + "title": "Defer a preference candidate until reviewer confirmation", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "session-preference-1", + "kind": "message", + "text": "Session 1: The operator asked for checkpoint updates to stay concise and evidence-focused.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "preference_candidate_defer", + "evidence_id": "session-preference-1" + } + }, + "created_at": "2026-06-05T09:00:00Z" + }, + { + "evidence_id": "session-preference-2", + "kind": "message", + "text": "Session 2: The operator repeated that durable reports should stay short unless validation evidence requires detail.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "preference_candidate_defer", + "evidence_id": "session-preference-2" + } + }, + "created_at": "2026-06-06T09:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Propose a preference candidate for concise, evidence-focused updates, but defer application because the fixture does not include explicit reviewer confirmation.", + "claims": [ + { + "claim_id": "preference_candidate_created", + "text": "Create a preference candidate for concise, evidence-focused updates.", + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "confidence": "medium" + }, + { + "claim_id": "preference_candidate_deferred", + "text": "Defer the preference candidate until reviewer confirmation.", + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "confidence": "medium" + } + ], + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-preference-candidate-defer", + "proposal_kind": "preference_candidate", + "source_refs": [ + "session-preference-1", + "session-preference-2" + ], + "expected_source_refs": [ + "session-preference-1", + "session-preference-2" + ], + "usefulness_score": 0.86, + "min_usefulness_score": 0.75, + "expected_review_action": "defer", + "actual_review_action": "defer", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Stage a preference candidate for review without applying it.", + "before": {}, + "after": { + "target": "derived_preference_candidate", + "text": "Preference candidate: The operator prefers concise, evidence-focused updates." + } + } + } + ], + "executable_gaps": [ + { + "primitive": "live_consolidation_worker_generation", + "follow_up_issue": "[ELF vNext P1] Implement reviewable consolidation worker and proposal review flow", + "reason": "This fixture scores checked-in proposal payloads; it does not execute scheduled LLM generation.", + "blocks_fixture_pass": false + } + ] + } + } + }, + "timeline": [ + { + "event_id": "preference-session-1", + "ts": "2026-06-05T09:00:00Z", + "actor": "user", + "action": "stated_preference", + "evidence_ids": [ + "session-preference-1" + ], + "summary": "The first session asked for concise, evidence-focused checkpoints." + }, + { + "event_id": "preference-session-2", + "ts": "2026-06-06T09:00:00Z", + "actor": "user", + "action": "restated_preference", + "evidence_ids": [ + "session-preference-2" + ], + "summary": "The second session restated a short-report preference with an evidence caveat." + } + ], + "prompt": { + "role": "user", + "content": "Review these sessions and propose any durable preference candidate, but do not apply it without review.", + "job_mode": "personalize", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "preference_candidate_created", + "text": "Create a preference candidate for concise, evidence-focused updates." + }, + { + "claim_id": "preference_candidate_deferred", + "text": "Defer the preference candidate until reviewer confirmation." + } + ], + "must_not_include": [ + "Preference applied", + "rewrite existing profile notes" + ], + "evidence_links": { + "preference_candidate_created": [ + "session-preference-1", + "session-preference-2" + ], + "preference_candidate_deferred": [ + "session-preference-1", + "session-preference-2" + ] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "session-preference-1", + "claim_id": "preference_candidate_created", + "requirement": "cite", + "quote": "checkpoint updates to stay concise and evidence-focused" + }, + { + "evidence_id": "session-preference-2", + "claim_id": "preference_candidate_deferred", + "requirement": "use", + "quote": "stay short unless validation evidence requires detail" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates but does not apply the preference candidate." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both preference statements." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Preference proposal is useful enough to stage." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes both sessions." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is defer, not apply." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json new file mode 100644 index 00000000..7bb750d3 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json @@ -0,0 +1,266 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-project-summary-apply-001", + "suite": "consolidation", + "title": "Create a reviewable project summary proposal without source mutation", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "session-project-summary-1", + "kind": "message", + "text": "Session 1: ELF keeps Postgres as source of truth and Qdrant as a rebuildable derived index for candidate retrieval.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "session-project-summary-1" + } + }, + "created_at": "2026-06-02T10:00:00Z" + }, + { + "evidence_id": "session-project-summary-2", + "kind": "message", + "text": "Session 2: The production adoption gate passed with bounded caveats; no private production corpus pass is claimed until an operator-owned manifest is supplied.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "session-project-summary-2" + } + }, + "created_at": "2026-06-09T10:00:00Z" + }, + { + "evidence_id": "project-summary-source-mutation-trap", + "kind": "message", + "text": "Bad draft: consolidate by rewriting the original project notes in place.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "project-summary-source-mutation-trap" + } + }, + "created_at": "2026-06-09T10:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Create a derived project summary proposal and keep the source notes unchanged. The proposal cites the source-of-truth and adoption-caveat sessions.", + "claims": [ + { + "claim_id": "project_summary_proposal_created", + "text": "Create a derived project summary proposal from the source-of-truth and adoption-caveat sessions.", + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "confidence": "high" + }, + { + "claim_id": "project_summary_no_source_mutation", + "text": "The consolidation output is proposal-only and does not rewrite authoritative notes.", + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-project-summary-apply", + "proposal_kind": "project_summary", + "source_refs": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "expected_source_refs": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "usefulness_score": 0.93, + "min_usefulness_score": 0.8, + "expected_review_action": "apply", + "actual_review_action": "apply", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Create one derived project summary for reviewer approval.", + "before": {}, + "after": { + "target": "derived_project_summary", + "summary": "ELF keeps Postgres authoritative, Qdrant rebuildable, and production adoption bounded by the missing private-corpus manifest." + } + } + } + ], + "executable_gaps": [ + { + "primitive": "live_consolidation_worker_generation", + "follow_up_issue": "[ELF vNext P1] Implement reviewable consolidation worker and proposal review flow", + "reason": "This fixture scores checked-in proposal payloads; it does not execute scheduled LLM generation.", + "blocks_fixture_pass": false + } + ] + } + } + }, + "timeline": [ + { + "event_id": "project-summary-session-1", + "ts": "2026-06-02T10:00:00Z", + "actor": "agent", + "action": "recorded_source_boundary", + "evidence_ids": [ + "session-project-summary-1" + ], + "summary": "The first session recorded ELF source-of-truth and rebuildable-index boundaries." + }, + { + "event_id": "project-summary-session-2", + "ts": "2026-06-09T10:00:00Z", + "actor": "agent", + "action": "recorded_adoption_caveat", + "evidence_ids": [ + "session-project-summary-2" + ], + "summary": "The later session recorded the bounded production adoption caveat." + } + ], + "prompt": { + "role": "user", + "content": "Review the recent sessions and propose a project summary only if it preserves source-truth notes.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "avoid_repeating_completed_work" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "project_summary_proposal_created", + "text": "Create a derived project summary proposal from the source-of-truth and adoption-caveat sessions." + }, + { + "claim_id": "project_summary_no_source_mutation", + "text": "The consolidation output is proposal-only and does not rewrite authoritative notes." + } + ], + "must_not_include": [ + "rewrite the original project notes in place", + "private production corpus pass is claimed" + ], + "evidence_links": { + "project_summary_proposal_created": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "project_summary_no_source_mutation": [ + "session-project-summary-1", + "session-project-summary-2" + ] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "session-project-summary-1", + "claim_id": "project_summary_proposal_created", + "requirement": "cite", + "quote": "Postgres as source of truth and Qdrant as a rebuildable derived index" + }, + { + "evidence_id": "session-project-summary-2", + "claim_id": "project_summary_proposal_created", + "requirement": "use", + "quote": "no private production corpus pass is claimed" + } + ], + "negative_traps": [ + { + "trap_id": "project-summary-source-rewrite", + "type": "unsafe_action", + "evidence_ids": [ + "project-summary-source-mutation-trap" + ], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates a derived project summary proposal with the right caveat." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both source sessions and avoids the rewrite trap." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Proposal usefulness meets the fixture threshold." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Proposal lineage includes every expected source ref." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The useful project summary is applied." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json new file mode 100644 index 00000000..20b73944 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json @@ -0,0 +1,244 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-weekly-decision-summary-apply-001", + "suite": "consolidation", + "title": "Apply a weekly decision summary proposal with complete lineage", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "weekly-decision-typed-failures", + "kind": "decision", + "text": "Monday decision: benchmark reports must preserve typed failure states instead of flattening blocked, incomplete, wrong_result, and unsupported_claim into one pass/fail label.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "weekly_decision_summary_apply", + "evidence_id": "weekly-decision-typed-failures" + } + }, + "created_at": "2026-06-01T12:00:00Z" + }, + { + "evidence_id": "weekly-decision-private-caveat", + "kind": "decision", + "text": "Friday decision: production adoption is acceptable for personal use with bounded caveats, but private-corpus proof remains unclaimed until a private manifest exists.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "weekly_decision_summary_apply", + "evidence_id": "weekly-decision-private-caveat" + } + }, + "created_at": "2026-06-05T12:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Apply a weekly decision summary proposal covering typed failure states and the bounded production-adoption caveat. Keep it derived and source-linked.", + "claims": [ + { + "claim_id": "weekly_summary_proposal_created", + "text": "Create a weekly decision summary proposal for typed failure states and bounded adoption caveats.", + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "confidence": "high" + }, + { + "claim_id": "weekly_summary_review_apply", + "text": "Apply the weekly summary as a derived decision summary after review.", + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-weekly-decision-summary-apply", + "proposal_kind": "weekly_decision_summary", + "source_refs": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "expected_source_refs": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "usefulness_score": 0.91, + "min_usefulness_score": 0.8, + "expected_review_action": "apply", + "actual_review_action": "apply", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Create a derived weekly decision summary.", + "before": {}, + "after": { + "target": "derived_weekly_decision_summary", + "decisions": [ + "Preserve typed failure states in benchmark reports.", + "Keep the production adoption claim bounded until private-corpus proof exists." + ] + } + } + } + ], + "executable_gaps": [ + { + "primitive": "live_consolidation_worker_generation", + "follow_up_issue": "[ELF vNext P1] Implement reviewable consolidation worker and proposal review flow", + "reason": "This fixture scores checked-in proposal payloads; it does not execute scheduled LLM generation.", + "blocks_fixture_pass": false + } + ] + } + } + }, + "timeline": [ + { + "event_id": "weekly-decision-monday", + "ts": "2026-06-01T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "weekly-decision-typed-failures" + ], + "summary": "The week started with a typed-failure reporting decision." + }, + { + "event_id": "weekly-decision-friday", + "ts": "2026-06-05T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "weekly-decision-private-caveat" + ], + "summary": "The week ended with a bounded production-adoption decision." + } + ], + "prompt": { + "role": "user", + "content": "Summarize this week's durable decisions as a reviewable consolidation proposal.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "weekly_summary_proposal_created", + "text": "Create a weekly decision summary proposal for typed failure states and bounded adoption caveats." + }, + { + "claim_id": "weekly_summary_review_apply", + "text": "Apply the weekly summary as a derived decision summary after review." + } + ], + "must_not_include": [ + "private-corpus proof passed", + "collapse typed failures into a pass/fail label" + ], + "evidence_links": { + "weekly_summary_proposal_created": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "weekly_summary_review_apply": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "weekly-decision-typed-failures", + "claim_id": "weekly_summary_proposal_created", + "requirement": "cite", + "quote": "preserve typed failure states" + }, + { + "evidence_id": "weekly-decision-private-caveat", + "claim_id": "weekly_summary_proposal_created", + "requirement": "use", + "quote": "private-corpus proof remains unclaimed" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Includes both weekly decisions and their correct review action." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both decision sources." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Weekly summary is useful enough to apply." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes both decision sources." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is apply." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index d87202b7..42e6c496 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -25,6 +25,15 @@ const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; const DEFAULT_ADAPTER_ID: &str = "fixture_smoke"; const DEFAULT_ADAPTER_NAME: &str = "ELF fixture smoke"; const NOT_ENCODED_REASON: &str = "No checked-in real_world_job fixture is encoded for this suite."; +const FORBIDDEN_SOURCE_MUTATION_KEYS: [&str; 7] = [ + "delete_source", + "delete_sources", + "source_delete", + "source_mutation", + "source_mutations", + "source_note_updates", + "overwrite_source", +]; const SUITES: &[&str] = &[ "trust_source_of_truth", "work_resume", @@ -333,6 +342,7 @@ struct AllowedUncertainty { struct AdapterResponse { adapter_id: Option, answer: ProducedAnswer, + consolidation: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] @@ -361,6 +371,51 @@ struct ProducedClaim { confidence: Option, } +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationFixture { + #[serde(default)] + proposals: Vec, + #[serde(default)] + executable_gaps: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationProposalFixture { + proposal_id: String, + proposal_kind: String, + #[serde(default)] + source_refs: Vec, + #[serde(default)] + expected_source_refs: Vec, + usefulness_score: f64, + min_usefulness_score: f64, + expected_review_action: ConsolidationReviewAction, + actual_review_action: ConsolidationReviewAction, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_count: usize, + #[serde(default)] + diff: Value, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ConsolidationReviewAction { + Apply, + Discard, + Defer, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationExecutableGap { + primitive: String, + follow_up_issue: String, + reason: String, + #[serde(default)] + blocks_fixture_pass: bool, +} + #[derive(Clone, Debug, Deserialize, Serialize)] struct CostReport { #[serde(skip_serializing_if = "Option::is_none")] @@ -565,6 +620,19 @@ struct ReportSummary { trace_incomplete_count: usize, #[serde(default)] operator_ux_gap_count: usize, + #[serde(default)] + consolidation: ConsolidationSummaryReport, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +struct ConsolidationSummaryReport { + proposal_count: usize, + proposal_usefulness: Option, + lineage_completeness: Option, + review_action_correctness: Option, + source_mutation_count: usize, + proposal_unsupported_claim_count: usize, + executable_gap_count: usize, } #[derive(Debug, Deserialize, Serialize)] @@ -645,6 +713,8 @@ struct JobReport { operator_debug: Option, #[serde(skip_serializing_if = "Option::is_none")] evolution: Option, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, } #[derive(Debug, Deserialize, Serialize)] @@ -673,6 +743,40 @@ struct RetrievalQualityReport { trap_context_count: usize, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationJobReport { + proposal_count: usize, + proposal_usefulness: Option, + lineage_completeness: Option, + review_action_correctness: Option, + source_mutation_count: usize, + proposal_unsupported_claim_count: usize, + executable_gaps: Vec, + proposals: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationProposalReport { + proposal_id: String, + proposal_kind: String, + usefulness_score: f64, + min_usefulness_score: f64, + lineage_completeness: f64, + expected_review_action: ConsolidationReviewAction, + actual_review_action: ConsolidationReviewAction, + review_action_correct: bool, + source_mutation_count: usize, + unsupported_claim_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationExecutableGapReport { + primitive: String, + follow_up_issue: String, + reason: String, + blocks_fixture_pass: bool, +} + #[derive(Clone, Debug, Deserialize, Serialize)] struct UnsupportedClaimReport { suite_id: String, @@ -732,6 +836,7 @@ struct JobScoring { dimension_scores: Vec, reason: String, evolution: Option, + consolidation: Option, } #[derive(Debug, Default)] @@ -749,6 +854,11 @@ struct FailureCounts { conflict_detection_missing: usize, update_rationale_missing: usize, latency_violations: usize, + proposal_usefulness_failures: usize, + lineage_failures: usize, + review_action_failures: usize, + source_mutations: usize, + blocking_executable_gaps: usize, } #[derive(Debug, Default)] @@ -865,6 +975,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_prompt(job, path)?; validate_expected_answer(job, path)?; validate_required_evidence(job, path)?; + validate_consolidation_fixture(job, path)?; validate_scoring_rubric(job, path)?; validate_allowed_uncertainty(job, path)?; validate_operator_debug(job, path)?; @@ -1056,6 +1167,80 @@ fn validate_required_evidence(job: &RealWorldJob, path: &Path) -> Result<()> { Ok(()) } +fn validate_consolidation_fixture(job: &RealWorldJob, path: &Path) -> Result<()> { + let consolidation = + job.corpus.adapter_response.as_ref().and_then(|response| response.consolidation.as_ref()); + + if job.suite == "consolidation" && consolidation.is_none() { + return Err(eyre::eyre!( + "{} consolidation jobs must provide adapter_response.consolidation.", + path.display() + )); + } + + let Some(consolidation) = consolidation else { + return Ok(()); + }; + + if consolidation.proposals.is_empty() && consolidation.executable_gaps.is_empty() { + return Err(eyre::eyre!( + "{} consolidation fixture must provide proposals or executable_gaps.", + path.display() + )); + } + + for proposal in &consolidation.proposals { + validate_consolidation_proposal(proposal, path)?; + } + for gap in &consolidation.executable_gaps { + if gap.primitive.trim().is_empty() + || gap.follow_up_issue.trim().is_empty() + || gap.reason.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete consolidation executable gap.", + path.display() + )); + } + } + + Ok(()) +} + +fn validate_consolidation_proposal( + proposal: &ConsolidationProposalFixture, + path: &Path, +) -> Result<()> { + if proposal.proposal_id.trim().is_empty() + || proposal.proposal_kind.trim().is_empty() + || proposal.source_refs.is_empty() + || proposal.expected_source_refs.is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete consolidation proposal fixture.", + path.display() + )); + } + if !proposal.usefulness_score.is_finite() + || !proposal.min_usefulness_score.is_finite() + || !(0.0..=1.0).contains(&proposal.usefulness_score) + || !(0.0..=1.0).contains(&proposal.min_usefulness_score) + { + return Err(eyre::eyre!( + "{} has invalid consolidation proposal usefulness scores.", + path.display() + )); + } + if !proposal.diff.is_null() && !proposal.diff.is_object() { + return Err(eyre::eyre!( + "{} consolidation proposal diff must be a JSON object when present.", + path.display() + )); + } + + Ok(()) +} + fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) { return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); @@ -1458,6 +1643,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { let answer = produced_answer(job); let produced_evidence = produced_evidence_ids(answer); let trap_ids_used = trap_ids_used(job, &produced_evidence); + let consolidation = consolidation_job_report(job); if let Some(status) = job.encoding.status { let evolution = evolution_job_report(job, answer, &trap_ids_used, 0); @@ -1476,6 +1662,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { .clone() .unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()), evolution, + consolidation, }; } @@ -1506,6 +1693,11 @@ fn score_job(job: &RealWorldJob) -> JobScoring { conflict_detection_missing, update_rationale_missing, latency_violations, + proposal_usefulness_failures: proposal_usefulness_failures(consolidation.as_ref()), + lineage_failures: lineage_failures(consolidation.as_ref()), + review_action_failures: review_action_failures(consolidation.as_ref()), + source_mutations: consolidation.as_ref().map_or(0, |report| report.source_mutation_count), + blocking_executable_gaps: blocking_executable_gaps(consolidation.as_ref()), }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); @@ -1518,12 +1710,17 @@ fn score_job(job: &RealWorldJob) -> JobScoring { + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + counts.conflict_detection_missing - + counts.update_rationale_missing; + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures; let status = job_status( normalized_score, job.scoring_rubric.pass_threshold, wrong_result_count, unsupported_claims.len(), + counts.source_mutations, + counts.blocking_executable_gaps, ); let reason = job_reason(status, &counts, normalized_score); @@ -1542,6 +1739,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { dimension_scores, reason, evolution, + consolidation, } } @@ -1849,6 +2047,20 @@ fn hard_fail_hits( hits.push("missing required refusal".to_string()); } + if let Some(consolidation) = consolidation_job_report(job) { + if consolidation.source_mutation_count > 0 { + hits.push( + "source mutation count must remain zero for proposal-only consolidation cases" + .to_string(), + ); + } + if consolidation.executable_gaps.iter().any(|gap| gap.blocks_fixture_pass) { + hits.push( + "missing consolidation primitive requires a precise follow-up issue".to_string(), + ); + } + } + hits } @@ -1881,14 +2093,24 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) counts.missing_claims > 0 || counts.forbidden_claims > 0 || counts.operator_debug_repair_unclear > 0 - || counts.conflict_detection_missing > 0, - "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0, + || counts.conflict_detection_missing > 0 + || counts.proposal_usefulness_failures > 0 + || counts.review_action_failures > 0, + "evidence_grounding" => + counts.missing_evidence > 0 + || counts.unsupported_claims > 0 + || counts.lineage_failures > 0, "trap_avoidance" => counts.trap_uses > 0, "uncertainty_handling" => counts.unsupported_claims > 0, "lifecycle_behavior" => counts.stale_answers > 0 || counts.conflict_detection_missing > 0 - || counts.update_rationale_missing > 0, + || counts.update_rationale_missing > 0 + || counts.source_mutations > 0, + "source_immutability" => counts.source_mutations > 0, + "proposal_usefulness" => counts.proposal_usefulness_failures > 0, + "lineage_completeness" => counts.lineage_failures > 0, + "review_action_correctness" => counts.review_action_failures > 0, "debuggability" => counts.missing_claims > 0 || counts.unsupported_claims > 0 @@ -1939,9 +2161,15 @@ fn job_status( pass_threshold: f64, wrong_result_count: usize, unsupported_claim_count: usize, + source_mutation_count: usize, + blocking_executable_gap_count: usize, ) -> TypedStatus { if unsupported_claim_count > 0 { TypedStatus::UnsupportedClaim + } else if source_mutation_count > 0 { + TypedStatus::LifecycleFail + } else if blocking_executable_gap_count > 0 { + TypedStatus::Blocked } else if wrong_result_count > 0 { TypedStatus::WrongResult } else if normalized_score >= pass_threshold { @@ -1966,7 +2194,10 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + counts.conflict_detection_missing - + counts.update_rationale_missing, + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures, counts.latency_violations ), TypedStatus::WrongResult => format!( @@ -1980,9 +2211,20 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.operator_debug_trace_gaps + counts.operator_debug_repair_unclear + counts.conflict_detection_missing - + counts.update_rationale_missing, + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures, counts.latency_violations ), + TypedStatus::LifecycleFail => format!( + "Job produced {} source mutation(s) and normalized_score {normalized_score:.3}.", + counts.source_mutations + ), + TypedStatus::Blocked => format!( + "Job has {} blocking executable gap(s) and normalized_score {normalized_score:.3}.", + counts.blocking_executable_gaps + ), _ => "Job did not reach a runnable scoring state.".to_string(), } } @@ -2041,6 +2283,122 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { qdrant_rebuild_case: metrics.qdrant_rebuild_case, operator_debug: job.operator_debug.clone(), evolution: scoring.evolution, + consolidation: scoring.consolidation, + } +} + +fn consolidation_job_report(job: &RealWorldJob) -> Option { + let fixture = job.corpus.adapter_response.as_ref()?.consolidation.as_ref()?; + let proposals = fixture.proposals.iter().map(consolidation_proposal_report).collect::>(); + let executable_gaps = fixture + .executable_gaps + .iter() + .map(|gap| ConsolidationExecutableGapReport { + primitive: gap.primitive.clone(), + follow_up_issue: gap.follow_up_issue.clone(), + reason: gap.reason.clone(), + blocks_fixture_pass: gap.blocks_fixture_pass, + }) + .collect::>(); + let proposal_count = proposals.len(); + let source_mutation_count = + proposals.iter().map(|proposal| proposal.source_mutation_count).sum(); + let proposal_unsupported_claim_count = + proposals.iter().map(|proposal| proposal.unsupported_claim_count).sum(); + + Some(ConsolidationJobReport { + proposal_count, + proposal_usefulness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.usefulness_score), + ), + lineage_completeness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.lineage_completeness), + ), + review_action_correctness: mean_proposal_metric( + proposals.iter().map(|proposal| if proposal.review_action_correct { 1.0 } else { 0.0 }), + ), + source_mutation_count, + proposal_unsupported_claim_count, + executable_gaps, + proposals, + }) +} + +fn consolidation_proposal_report( + proposal: &ConsolidationProposalFixture, +) -> ConsolidationProposalReport { + ConsolidationProposalReport { + proposal_id: proposal.proposal_id.clone(), + proposal_kind: proposal.proposal_kind.clone(), + usefulness_score: round3(proposal.usefulness_score), + min_usefulness_score: round3(proposal.min_usefulness_score), + lineage_completeness: round3(lineage_completeness(proposal)), + expected_review_action: proposal.expected_review_action, + actual_review_action: proposal.actual_review_action, + review_action_correct: proposal.expected_review_action == proposal.actual_review_action, + source_mutation_count: proposal.source_mutations.len() + + forbidden_diff_key_count(&proposal.diff), + unsupported_claim_count: proposal.unsupported_claim_count, + } +} + +fn lineage_completeness(proposal: &ConsolidationProposalFixture) -> f64 { + let expected = proposal.expected_source_refs.iter().collect::>(); + let actual = proposal.source_refs.iter().collect::>(); + let matched = expected.iter().filter(|source_ref| actual.contains(**source_ref)).count(); + + matched as f64 / expected.len() as f64 +} + +fn forbidden_diff_key_count(value: &Value) -> usize { + match value { + Value::Object(map) => map + .iter() + .map(|(key, nested)| { + usize::from(FORBIDDEN_SOURCE_MUTATION_KEYS.contains(&key.as_str())) + + forbidden_diff_key_count(nested) + }) + .sum(), + Value::Array(items) => items.iter().map(forbidden_diff_key_count).sum(), + _ => 0, + } +} + +fn proposal_usefulness_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report + .proposals + .iter() + .filter(|proposal| proposal.usefulness_score < proposal.min_usefulness_score) + .count() + }) +} + +fn lineage_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.proposals.iter().filter(|proposal| proposal.lineage_completeness < 1.0).count() + }) +} + +fn review_action_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.proposals.iter().filter(|proposal| !proposal.review_action_correct).count() + }) +} + +fn blocking_executable_gaps(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.executable_gaps.iter().filter(|gap| gap.blocks_fixture_pass).count() + }) +} + +fn mean_proposal_metric(values: impl Iterator) -> Option { + let values = values.collect::>(); + + if values.is_empty() { + None + } else { + Some(round3(values.iter().sum::() / values.len() as f64)) } } @@ -2388,6 +2746,7 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { .filter_map(|job| job.operator_debug.as_ref()) .map(|debug| debug.ux_gaps.len()) .sum(), + consolidation: consolidation_summary(jobs), ..ReportSummary::default() }; @@ -2462,6 +2821,39 @@ fn ratio_or(numerator: usize, denominator: usize, empty_value: f64) -> f64 { if denominator == 0 { empty_value } else { round3(numerator as f64 / denominator as f64) } } +fn consolidation_summary(jobs: &[JobReport]) -> ConsolidationSummaryReport { + let reports = jobs.iter().filter_map(|job| job.consolidation.as_ref()).collect::>(); + + if reports.is_empty() { + return ConsolidationSummaryReport::default(); + } + + let proposals = reports.iter().flat_map(|report| report.proposals.iter()).collect::>(); + let executable_gap_count = reports.iter().map(|report| report.executable_gaps.len()).sum(); + + ConsolidationSummaryReport { + proposal_count: proposals.len(), + proposal_usefulness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.usefulness_score), + ), + lineage_completeness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.lineage_completeness), + ), + review_action_correctness: mean_proposal_metric( + proposals.iter().map(|proposal| if proposal.review_action_correct { 1.0 } else { 0.0 }), + ), + source_mutation_count: proposals + .iter() + .map(|proposal| proposal.source_mutation_count) + .sum(), + proposal_unsupported_claim_count: proposals + .iter() + .map(|proposal| proposal.unsupported_claim_count) + .sum(), + executable_gap_count, + } +} + fn mean_score(jobs: &[JobReport]) -> f64 { if jobs.is_empty() { return 0.0; @@ -2524,7 +2916,7 @@ fn adapter_report(args: &RunArgs) -> AdapterReport { behavior: "offline_fixture_response".to_string(), storage: TypedStatus::NotEncoded, runtime: TypedStatus::NotEncoded, - notes: "Smoke runner scores checked-in fixture responses; it does not exercise a live external adapter.".to_string(), + notes: "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter.".to_string(), } } @@ -2590,6 +2982,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_operator_debugging(&mut out, report); render_markdown_evolution(&mut out, report); render_markdown_trace_explainability(&mut out, report); + render_markdown_consolidation(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); render_markdown_semantics(&mut out, report); @@ -2640,7 +3033,7 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat "Read this when: You need a durable smoke report for real-world agent memory job fixtures.\n", ); out.push_str(&format!("Inputs: `{}`.\n", md_inline(report_path))); - out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_job/`, `apps/elf-eval/fixtures/real_world_memory/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); + out.push_str("Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); out.push_str( "Verification: Compare this Markdown summary with the source JSON before committing.\n\n", ); @@ -2682,6 +3075,32 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat "- Temporal validity not encoded: `{}`\n", report.summary.temporal_validity_not_encoded_count )); + + render_markdown_quality_summary(out, report); + + out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); + out.push_str(&format!( + "- Mean latency: `{}`\n", + optional_f64(report.summary.mean_latency_ms, " ms") + )); + out.push_str(&format!("- Cost: `{}`\n", cost_display(report.summary.total_cost.as_ref()))); + out.push_str(&format!( + "- Operator-debug jobs: `{}`\n", + report.summary.operator_debug_job_count + )); + out.push_str(&format!("- Raw SQL needed: `{}`\n", report.summary.raw_sql_needed_count)); + out.push_str(&format!( + "- Trace-incomplete debug jobs: `{}`\n", + report.summary.trace_incomplete_count + )); + out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); + out.push_str(&format!( + "- Private corpus redaction: `{}`\n\n", + md_inline(report.private_corpus_redaction.policy.as_str()) + )); +} + +fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { out.push_str(&format!( "- Evidence coverage: `{}/{}` (`{:.3}`)\n", report.summary.evidence_covered_count, @@ -2728,25 +3147,9 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat report.summary.trace_explainability_count, report.summary.wrong_result_stage_attribution_count )); - out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); out.push_str(&format!( - "- Mean latency: `{}`\n", - optional_f64(report.summary.mean_latency_ms, " ms") - )); - out.push_str(&format!("- Cost: `{}`\n", cost_display(report.summary.total_cost.as_ref()))); - out.push_str(&format!( - "- Operator-debug jobs: `{}`\n", - report.summary.operator_debug_job_count - )); - out.push_str(&format!("- Raw SQL needed: `{}`\n", report.summary.raw_sql_needed_count)); - out.push_str(&format!( - "- Trace-incomplete debug jobs: `{}`\n", - report.summary.trace_incomplete_count - )); - out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); - out.push_str(&format!( - "- Private corpus redaction: `{}`\n\n", - md_inline(report.private_corpus_redaction.policy.as_str()) + "- Consolidation source mutation count: `{}`\n", + report.summary.consolidation.source_mutation_count )); } @@ -2982,6 +3385,72 @@ fn render_markdown_trace_explainability(out: &mut String, report: &RealWorldRepo out.push('\n'); } +fn render_markdown_consolidation(out: &mut String, report: &RealWorldReport) { + if report.summary.consolidation.proposal_count == 0 { + return; + } + + out.push_str("## Consolidation\n\n"); + out.push_str("| Job | Proposals | Usefulness | Lineage | Review Actions | Source Mutations | Proposal Unsupported Claims | Executable Gaps |\n"); + out.push_str("| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n"); + + for job in &report.jobs { + let Some(consolidation) = &job.consolidation else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | `{}` | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + consolidation.proposal_count, + optional_f64(consolidation.proposal_usefulness, ""), + optional_f64(consolidation.lineage_completeness, ""), + optional_f64(consolidation.review_action_correctness, ""), + consolidation.source_mutation_count, + consolidation.proposal_unsupported_claim_count, + consolidation.executable_gaps.len() + )); + } + + out.push_str( + "\nSource mutation count must remain `0` for proposal-only consolidation cases.\n\n", + ); + + render_markdown_consolidation_gaps(out, report); +} + +fn render_markdown_consolidation_gaps(out: &mut String, report: &RealWorldReport) { + let gaps = report + .jobs + .iter() + .filter_map(|job| job.consolidation.as_ref().map(|consolidation| (job, consolidation))) + .flat_map(|(job, consolidation)| { + consolidation.executable_gaps.iter().map(move |gap| (job.job_id.as_str(), gap)) + }) + .collect::>(); + + if gaps.is_empty() { + return; + } + + out.push_str("### Executable Gaps\n\n"); + out.push_str("| Job | Primitive | Follow-Up Issue | Blocks Fixture Pass | Reason |\n"); + out.push_str("| --- | --- | --- | --- | --- |\n"); + + for (job_id, gap) in gaps { + out.push_str(&format!( + "| {} | {} | {} | `{}` | {} |\n", + md_cell(job_id), + md_cell(gap.primitive.as_str()), + md_cell(gap.follow_up_issue.as_str()), + gap.blocks_fixture_pass, + md_cell(gap.reason.as_str()) + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 3b09e622..9f6b7217 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -44,6 +44,10 @@ fn retrieval_fixture_dir() -> PathBuf { .join("retrieval") } +fn consolidation_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("consolidation") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -146,7 +150,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(21)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(25)); Ok(()) } @@ -186,6 +190,72 @@ fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<() Ok(()) } +#[test] +fn consolidation_fixtures_report_reviewable_proposal_metrics() -> Result<()> { + let report = run_json_report_from(consolidation_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/consolidation/executable_gap_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/lineage_completeness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/consolidation/review_action_correctness").and_then(Value::as_f64), + Some(1.0) + ); + + let jobs = array_at(&report, "/jobs")?; + let project_summary = + find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; + let contradiction = + find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!( + project_summary + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("apply") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("discard") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let consolidation_suite = find_by_field(suites, "/suite_id", "consolidation")?; + + assert_eq!(consolidation_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} + #[test] fn generated_json_report_renders_markdown() -> Result<()> { let report = run_json_report()?; @@ -229,19 +299,19 @@ fn generated_json_report_renders_markdown() -> Result<()> { fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(21)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(19)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(25)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(23)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(3)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.912) + Some(0.929) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.028) + Some(0.022) ); assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); @@ -271,12 +341,12 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(41) + Some(49) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(38)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.927)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.927)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.927)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(46)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.939)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.939)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.939)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) @@ -285,6 +355,20 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), Some(1) ); + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); let suites = array_at(&report, "/suites")?; @@ -294,6 +378,7 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { "retrieval", "capture_integration", "personalization", + "consolidation", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -596,3 +681,40 @@ fn memory_evolution_report_renders_markdown_counters() -> Result<()> { Ok(()) } + +#[test] +fn consolidation_report_renders_markdown_metrics_and_gaps() -> Result<()> { + let report = run_json_report_from(consolidation_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-consolidation-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("## Consolidation")); + assert!(markdown.contains("Source Mutations")); + assert!(markdown.contains("live_consolidation_worker_generation")); + assert!(markdown.contains("[ELF vNext P1] Implement reviewable consolidation worker")); + + Ok(()) +} diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index ff0d52d4..31294eee 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -356,6 +356,24 @@ selection, minimal sufficient context, and stage-level wrong-result explainabili It is still an offline fixture report; qmd and OpenViking remain reference systems unless an adapter actually runs and records typed evidence. +To run the checked-in proposal-only consolidation fixtures: + +```sh +cargo make real-world-memory-consolidation +``` + +Artifacts: + +```text +tmp/real-world-memory/consolidation/report.json +tmp/real-world-memory/consolidation/report.md +``` + +The consolidation fixtures live under +`apps/elf-eval/fixtures/real_world_memory/consolidation/`. They score reviewable +proposal payloads, source lineage, review action outcomes, executable gaps, and source +mutation count. They do not claim live scheduled consolidation-worker generation. + ## Clean Up ```sh diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 8fff2a76..16f63169 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -225,6 +225,26 @@ encoded UX gaps. Checked-in evidence snapshot: `docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md`. +The same `real-world-memory` target also includes the current consolidation fixtures +under the same fixture root. + +Current checked-in consolidation increment: + +```sh +cargo make real-world-memory-consolidation +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/consolidation/`, writes +`tmp/real-world-memory/consolidation/report.json`, and renders +`tmp/real-world-memory/consolidation/report.md`. The consolidation report includes +proposal usefulness, lineage completeness, review action correctness, proposal +unsupported-claim count, executable gap count, and source mutation count. Source +mutation count must remain `0` for proposal-only cases. + +These fixtures encode proposal expectations only. They do not claim that a live +scheduled consolidation worker generated the proposals; the report records that missing +primitive as an executable gap with a follow-up issue title. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index dafc1df0..9cad1941 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -398,6 +398,20 @@ conflict detection counts, update rationale availability, and temporal-validity `not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` until the runner can evaluate current-only versus historical relation facts. +Consolidation suite reports MUST also include: + +- proposal usefulness score, or `null` when the job has no proposal payloads; +- lineage completeness score over expected source refs; +- review action correctness for `apply`, `discard`, and `defer` outcomes; +- proposal unsupported-claim count for contradiction/staleness reports; +- source mutation count. + +For proposal-only consolidation jobs, source mutation count MUST be `0`. If the runner +cannot execute a live consolidation primitive, the report MUST include an executable +gap with a precise follow-up issue or issue title. A proposal-only fixture MAY still +pass when it verifies checked-in proposal payloads and the gap explicitly says that no +live worker generation claim is being made. + ## Claim Rules - A project MAY claim a suite pass only for suites with encoded jobs and a published