From 56299143957a0eb2a021505cfff56f9c9c27dd86 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 9 Jun 2026 23:29:02 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add knowledge compilation real-world memory fixtures","authority":"XY-848"} --- Makefile.toml | 57 ++ .../knowledge/entity_concept_issue_pages.json | 372 ++++++++++++ .../pages/concept_derived_knowledge_pages.md | 27 + .../knowledge/pages/entity_qdrant_rebuild.md | 26 + .../pages/issue_xy848_knowledge_pages.md | 24 + .../pages/project_elf_benchmark_suite.md | 36 ++ .../knowledge/project_page_rebuild.json | 311 ++++++++++ .../src/bin/real_world_job_benchmark.rs | 549 ++++++++++++++++-- .../tests/real_world_job_benchmark.rs | 145 ++++- docs/guide/benchmarking/index.md | 7 +- .../benchmarking/live_baseline_benchmark.md | 19 + .../real_world_agent_memory_benchmark.md | 15 + .../real_world_agent_memory_benchmark_v1.md | 62 ++ 13 files changed, 1603 insertions(+), 47 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md create mode 100644 apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json diff --git a/Makefile.toml b/Makefile.toml index e9982276..03373f46 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -702,6 +702,63 @@ args = [ ] +# Real-world memory knowledge benchmark +# | task | type | cwd | +# | ------------------------------ | --------- | --- | +# | real-world-memory-knowledge | composite | | +# | real-world-memory-knowledge-json | command | | +# | 
real-world-memory-knowledge-report | command | | + +[tasks.real-world-memory-knowledge] +workspace = false +dependencies = [ + "real-world-memory-knowledge-report", +] + +[tasks.real-world-memory-knowledge-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/knowledge", + "--out", + "tmp/real-world-memory/knowledge-report.json", + "--run-id", + "real-world-memory-knowledge", + "--adapter-id", + "fixture_knowledge", + "--adapter-name", + "ELF knowledge fixture", +] + +[tasks.real-world-memory-knowledge-report] +workspace = false +dependencies = [ + "real-world-memory-knowledge-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/knowledge-report.json", + "--out", + "tmp/real-world-memory/knowledge-report.md", +] + + # Meta # | task | type | cwd | # | ------ | --------- | --- | diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json b/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json new file mode 100644 index 00000000..f65f78e2 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json @@ -0,0 +1,372 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "knowledge-entity-concept-002", + "suite": "knowledge_compilation", + "title": "Compile entity, concept, and issue timeline pages with stale lint", + "corpus": { + "corpus_id": "real-world-memory-knowledge-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "qdrant-rebuild-entity", + "kind": "note", + "text": "Entity fact: Qdrant is a derived rebuildable index for ELF candidate retrieval; Postgres vectors are the source used to rebuild it.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": 
"real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "qdrant-rebuild-entity" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "derived-pages-concept", + "kind": "decision", + "text": "Concept fact: Derived knowledge pages compile current truth, history, backlinks, and lint findings from source notes and events.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "derived-pages-concept" + } + }, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "xy848-current-timeline", + "kind": "issue", + "text": "Current issue timeline: XY-848 adds knowledge compilation benchmark cases and keeps generated pages pointer-backed benchmark artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "xy848-current-timeline" + } + }, + "created_at": "2026-06-09T02:10:00Z" + }, + { + "evidence_id": "old-qdrant-authoritative-trap", + "kind": "note", + "text": "Stale fact: Qdrant became the authoritative source for compiled knowledge pages.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "old-qdrant-authoritative-trap" + } + }, + "created_at": "2026-06-08T02:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_knowledge", + "answer": { + "content": "Generated entity, concept, and issue timeline pages cite Qdrant rebuild evidence, derived-page concept evidence, and the current XY-848 timeline; stale Qdrant-authoritative text is linted, and one rebuild explains allowed ordering variance.", + "claims": [ + { + "claim_id": "qdrant_rebuild_entity", + "text": "The Qdrant entity page states that Qdrant is derived and rebuildable from Postgres-held 
vectors.", + "evidence_ids": ["qdrant-rebuild-entity"], + "confidence": "high" + }, + { + "claim_id": "derived_pages_concept", + "text": "The derived-pages concept page compiles current truth, history, backlinks, and lint findings from source notes and events.", + "evidence_ids": ["derived-pages-concept"], + "confidence": "high" + }, + { + "claim_id": "issue_timeline_current", + "text": "The XY-848 issue timeline page records that generated pages are pointer-backed benchmark artifacts.", + "evidence_ids": ["xy848-current-timeline"], + "confidence": "high" + } + ], + "evidence_ids": [ + "qdrant-rebuild-entity", + "derived-pages-concept", + "xy848-current-timeline" + ], + "pages": [ + { + "page_id": "entity:qdrant-rebuild", + "page_type": "entity", + "title": "Qdrant Rebuild Entity Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md", + "sections": [ + { + "section_id": "current-truth", + "heading": "Current Truth", + "role": "current_truth", + "content": "Qdrant is derived and rebuildable; Postgres vectors remain the source used for rebuild.", + "evidence_ids": ["qdrant-rebuild-entity"], + "timeline_event_ids": ["qdrant-current-fact"] + }, + { + "section_id": "history", + "heading": "History", + "role": "history", + "content": "The stale claim that Qdrant became authoritative is recorded only as lint evidence.", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "timeline_event_ids": ["qdrant-stale-fact"] + } + ], + "backlinks": [ + "project:elf-benchmark-suite", + "concept:derived-knowledge-pages" + ], + "lint_findings": [ + { + "finding_id": "lint-old-qdrant-authoritative", + "finding_type": "stale_claim", + "severity": "error", + "text": "The old Qdrant-authoritative claim conflicts with the current derived-index evidence.", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "trap_id": "old-qdrant-authoritative" + } + ], + "rebuild": { + "first_hash": 
"blake3:2ac0d7d7e03088fe3171e41c19f3ea1097b07b1d7ddc891f9aa81311d476e001", + "second_hash": "blake3:2ac0d7d7e03088fe3171e41c19f3ea1097b07b1d7ddc891f9aa81311d476e001", + "deterministic": true, + "allowed_variance": [] + } + }, + { + "page_id": "concept:derived-knowledge-pages", + "page_type": "concept", + "title": "Derived Knowledge Pages Concept Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md", + "sections": [ + { + "section_id": "compiled-truth", + "heading": "Compiled Truth", + "role": "current_truth", + "content": "Derived knowledge pages compile current truth, history, backlinks, and lint findings from source notes and events.", + "evidence_ids": ["derived-pages-concept"], + "timeline_event_ids": ["derived-pages-concept-recorded"] + }, + { + "section_id": "backlinks", + "heading": "Backlinks", + "role": "backlinks", + "content": "The concept links to the Qdrant rebuild entity and the XY-848 issue timeline.", + "evidence_ids": ["derived-pages-concept", "xy848-current-timeline"], + "timeline_event_ids": ["xy848-current-scope"] + } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "issue:xy848-knowledge-pages" + ], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:498016f1d39a6a0a5241b0c640c30f0720eb9dbdd73b167fdce95b4387d9699a", + "second_hash": "blake3:498016f1d39a6a0a5241b0c640c30f0720eb9dbdd73b167fdce95b4387d9699b", + "deterministic": false, + "allowed_variance": [ + "Backlink order may differ before canonical sort is applied; fixture report records the variance and still compares normalized page sections." 
+ ] + } + }, + { + "page_id": "issue:xy848-knowledge-pages", + "page_type": "issue_timeline", + "title": "XY-848 Knowledge Pages Issue Timeline", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md", + "sections": [ + { + "section_id": "current-state", + "heading": "Current State", + "role": "current_truth", + "content": "XY-848 adds knowledge compilation benchmark cases and marks generated pages as pointer-backed benchmark artifacts.", + "evidence_ids": ["xy848-current-timeline"], + "timeline_event_ids": ["xy848-current-scope"] + }, + { + "section_id": "linked-pages", + "heading": "Linked Pages", + "role": "backlinks", + "content": "The issue timeline links to the Qdrant rebuild entity and derived-knowledge-pages concept pages.", + "evidence_ids": ["qdrant-rebuild-entity", "derived-pages-concept"], + "timeline_event_ids": ["qdrant-current-fact", "derived-pages-concept-recorded"] + } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "concept:derived-knowledge-pages" + ], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:fed9c4af9f53e787fcb91a4900b6137d728a72b60629ca049a6da57260be682d", + "second_hash": "blake3:fed9c4af9f53e787fcb91a4900b6137d728a72b60629ca049a6da57260be682d", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 3.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "qdrant-stale-fact", + "ts": "2026-06-08T02:00:00Z", + "actor": "agent", + "action": "recorded_stale_fact", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "summary": "A stale note incorrectly said Qdrant became authoritative." + }, + { + "event_id": "qdrant-current-fact", + "ts": "2026-06-09T02:00:00Z", + "actor": "agent", + "action": "recorded_current_fact", + "evidence_ids": ["qdrant-rebuild-entity"], + "summary": "The current Qdrant fact says it is derived and rebuildable from Postgres-held vectors." 
+ }, + { + "event_id": "derived-pages-concept-recorded", + "ts": "2026-06-09T02:05:00Z", + "actor": "agent", + "action": "recorded_concept", + "evidence_ids": ["derived-pages-concept"], + "summary": "Derived pages compile current truth, history, backlinks, and lint findings from source notes and events." + }, + { + "event_id": "xy848-current-scope", + "ts": "2026-06-09T02:10:00Z", + "actor": "operator", + "action": "recorded_issue_scope", + "evidence_ids": ["xy848-current-timeline"], + "summary": "XY-848 keeps generated knowledge pages as pointer-backed benchmark artifacts." + } + ], + "prompt": { + "role": "user", + "content": "Compile entity, concept, and issue timeline pages for the knowledge suite and identify stale claims plus rebuild variance.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "lint_stale_claims", + "include_backlinks", + "explain_allowed_rebuild_variance" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "qdrant_rebuild_entity", + "text": "The Qdrant entity page states that Qdrant is derived and rebuildable from Postgres-held vectors." + }, + { + "claim_id": "derived_pages_concept", + "text": "The derived-pages concept page compiles current truth, history, backlinks, and lint findings from source notes and events." + }, + { + "claim_id": "issue_timeline_current", + "text": "The XY-848 issue timeline page records that generated pages are pointer-backed benchmark artifacts." + } + ], + "must_not_include": [ + "Qdrant became the authoritative source for compiled knowledge pages." 
+ ], + "evidence_links": { + "qdrant_rebuild_entity": ["qdrant-rebuild-entity"], + "derived_pages_concept": ["derived-pages-concept"], + "issue_timeline_current": ["xy848-current-timeline"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "qdrant-rebuild-entity", + "claim_id": "qdrant_rebuild_entity", + "requirement": "cite", + "quote": "Qdrant is a derived rebuildable index" + }, + { + "evidence_id": "derived-pages-concept", + "claim_id": "derived_pages_concept", + "requirement": "cite", + "quote": "current truth, history, backlinks, and lint findings" + }, + { + "evidence_id": "xy848-current-timeline", + "claim_id": "issue_timeline_current", + "requirement": "use", + "quote": "pointer-backed benchmark artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "old-qdrant-authoritative", + "type": "stale_fact", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current entity, concept, and issue timeline truth." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every page section traces to source notes or timeline events." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Stale Qdrant-authoritative claim is detected as lint evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Pages include backlinks and useful current-truth/history surfaces." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Rebuild records are deterministic enough or explain allowed variance." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "cite_partial_evidence" + }, + "tags": [ + "synthetic", + "knowledge", + "no_live_claim", + "benchmark_artifact" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md new file mode 100644 index 00000000..88fb9fc4 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md @@ -0,0 +1,27 @@ +# Derived Knowledge Pages Concept Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Compiled Truth + +Derived knowledge pages compile current truth, history, backlinks, and lint findings +from source notes and events. + +Sources: `derived-pages-concept`, `derived-pages-concept-recorded`. + +## Backlinks + +The concept links to the Qdrant rebuild entity and the XY-848 issue timeline. + +Sources: `derived-pages-concept`, `xy848-current-timeline`, `xy848-current-scope`. + +Backlinks: + +- `entity:qdrant-rebuild` +- `issue:xy848-knowledge-pages` + +## Rebuild Note + +Allowed variance: backlink order may differ before canonical sort is applied; the +fixture report records the variance and compares normalized page sections. 
diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md new file mode 100644 index 00000000..d2b28c05 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md @@ -0,0 +1,26 @@ +# Qdrant Rebuild Entity Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current Truth + +Qdrant is derived and rebuildable; Postgres vectors remain the source used for rebuild. + +Sources: `qdrant-rebuild-entity`, `qdrant-current-fact`. + +## History + +The stale claim that Qdrant became authoritative is recorded only as lint evidence. + +Sources: `old-qdrant-authoritative-trap`, `qdrant-stale-fact`. + +## Lint + +- `lint-old-qdrant-authoritative`: stale claim; the old Qdrant-authoritative claim + conflicts with the current derived-index evidence. + +## Backlinks + +- `project:elf-benchmark-suite` +- `concept:derived-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md new file mode 100644 index 00000000..ac665951 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md @@ -0,0 +1,24 @@ +# XY-848 Knowledge Pages Issue Timeline + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current State + +XY-848 adds knowledge compilation benchmark cases and marks generated pages as +pointer-backed benchmark artifacts. + +Sources: `xy848-current-timeline`, `xy848-current-scope`. + +## Linked Pages + +The issue timeline links to the Qdrant rebuild entity and derived-knowledge-pages +concept pages. 
+ +Sources: `qdrant-rebuild-entity`, `derived-pages-concept`, +`qdrant-current-fact`, `derived-pages-concept-recorded`. + +Backlinks: + +- `entity:qdrant-rebuild` +- `concept:derived-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md new file mode 100644 index 00000000..de6d403c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md @@ -0,0 +1,36 @@ +# ELF Benchmark Suite Knowledge Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current Truth + +Generated knowledge pages remain derived benchmark artifacts and source notes stay +authoritative. + +Sources: `elf-knowledge-current-truth`, `knowledge-current-truth-recorded`. + +## History + +The suite borrows llm-wiki lint, gbrain compiled_truth plus timeline, and graphify +report ideas without copying their source-of-truth assumptions. + +Sources: `elf-knowledge-history`, `knowledge-patterns-selected`. + +## XY-848 Timeline + +XY-848 requires project pages, entity/concept pages, issue timelines, current truth +plus history, stale linting, backlinks, and rebuild determinism. + +Sources: `xy848-issue-timeline`, `xy848-scope-recorded`. + +## Private Corpus Summary + +Unsupported: the fixture does not contain private production corpus evidence for a +private-corpus knowledge-page quality claim. 
+ +## Backlinks + +- `entity:qdrant-rebuild` +- `concept:derived-knowledge-pages` +- `issue:xy848-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json b/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json new file mode 100644 index 00000000..de6fd359 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json @@ -0,0 +1,311 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "knowledge-project-page-001", + "suite": "knowledge_compilation", + "title": "Compile a pointer-backed project page with current truth and history", + "corpus": { + "corpus_id": "real-world-memory-knowledge-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-knowledge-current-truth", + "kind": "note", + "text": "Current truth: The ELF knowledge benchmark must keep generated pages derived from notes and source refs; source notes stay authoritative and generated pages are not production truth.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "elf-knowledge-current-truth" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "elf-knowledge-history", + "kind": "decision", + "text": "History: The knowledge compilation suite follows llm-wiki query-save-lint, gbrain compiled_truth plus timeline, and graphify graph report patterns while preserving ELF provenance boundaries.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "elf-knowledge-history" + } + }, + "created_at": "2026-06-09T01:05:00Z" + }, + { + "evidence_id": "xy848-issue-timeline", + "kind": "issue", + "text": "Issue timeline: XY-848 asks for project pages, entity/concept pages, issue timelines, current truth plus history, stale-claim linting, 
backlinks, and rebuild determinism.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "xy848-issue-timeline" + } + }, + "created_at": "2026-06-09T01:10:00Z" + }, + { + "evidence_id": "old-authoritative-page-trap", + "kind": "compiled_page", + "text": "Stale claim: Generated knowledge pages are authoritative production truth and can replace source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "old-authoritative-page-trap" + } + }, + "created_at": "2026-06-08T01:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_knowledge", + "answer": { + "content": "Generated benchmark page `project_elf_benchmark_suite.md` keeps ELF source notes authoritative, cites current truth and history, links the XY-848 issue timeline, flags one unsupported summary, and rebuilds deterministically.", + "claims": [ + { + "claim_id": "derived_not_authoritative", + "text": "Generated knowledge pages remain derived benchmark artifacts, not authoritative production truth.", + "evidence_ids": ["elf-knowledge-current-truth"], + "confidence": "high" + }, + { + "claim_id": "reference_patterns", + "text": "The page shape uses llm-wiki lint, gbrain compiled truth plus timeline, and graphify report patterns while preserving ELF provenance.", + "evidence_ids": ["elf-knowledge-history"], + "confidence": "high" + }, + { + "claim_id": "rebuild_deterministic", + "text": "The project page rebuild produced the same page hash in two fixture rebuild passes.", + "evidence_ids": ["xy848-issue-timeline"], + "confidence": "high" + } + ], + "evidence_ids": [ + "elf-knowledge-current-truth", + "elf-knowledge-history", + "xy848-issue-timeline" + ], + "pages": [ + { + "page_id": "project:elf-benchmark-suite", + "page_type": "project", + "title": "ELF Benchmark Suite Knowledge 
Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md", + "sections": [ + { + "section_id": "current-truth", + "heading": "Current Truth", + "role": "current_truth", + "content": "Generated knowledge pages remain derived benchmark artifacts and source notes stay authoritative.", + "evidence_ids": ["elf-knowledge-current-truth"], + "timeline_event_ids": ["knowledge-current-truth-recorded"] + }, + { + "section_id": "history", + "heading": "History", + "role": "history", + "content": "The suite borrows llm-wiki lint, gbrain compiled_truth plus timeline, and graphify report ideas without copying their source-of-truth assumptions.", + "evidence_ids": ["elf-knowledge-history"], + "timeline_event_ids": ["knowledge-patterns-selected"] + }, + { + "section_id": "issue-timeline", + "heading": "XY-848 Timeline", + "role": "timeline", + "content": "XY-848 requires project pages, entity/concept pages, issue timelines, current truth plus history, stale linting, backlinks, and rebuild determinism.", + "evidence_ids": ["xy848-issue-timeline"], + "timeline_event_ids": ["xy848-scope-recorded"] + }, + { + "section_id": "unsupported-private-summary", + "heading": "Private Corpus Summary", + "role": "summary", + "content": "The fixture does not contain private production corpus evidence for a private-corpus knowledge-page quality claim.", + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": "No private production corpus item is present in this synthetic benchmark fixture." 
+ } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "concept:derived-knowledge-pages", + "issue:xy848-knowledge-pages" + ], + "lint_findings": [ + { + "finding_id": "lint-old-authoritative-page-trap", + "finding_type": "stale_claim", + "severity": "error", + "text": "The stale authoritative-page claim conflicts with current source-of-truth evidence.", + "evidence_ids": ["old-authoritative-page-trap"], + "trap_id": "old-authoritative-page" + } + ], + "rebuild": { + "first_hash": "blake3:93b78a1d6e8e0f7a5c761b0c3c1e311adf3a5c0f8e0f3999d5e6f4012c4a8481", + "second_hash": "blake3:93b78a1d6e8e0f7a5c761b0c3c1e311adf3a5c0f8e0f3999d5e6f4012c4a8481", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "knowledge-current-truth-recorded", + "ts": "2026-06-09T01:00:00Z", + "actor": "agent", + "action": "recorded_current_truth", + "evidence_ids": ["elf-knowledge-current-truth"], + "summary": "Current truth says generated pages are derived and source notes stay authoritative." + }, + { + "event_id": "knowledge-patterns-selected", + "ts": "2026-06-09T01:05:00Z", + "actor": "agent", + "action": "selected_reference_patterns", + "evidence_ids": ["elf-knowledge-history"], + "summary": "The suite uses llm-wiki, gbrain, and graphify as reference patterns." + }, + { + "event_id": "xy848-scope-recorded", + "ts": "2026-06-09T01:10:00Z", + "actor": "operator", + "action": "recorded_issue_scope", + "evidence_ids": ["xy848-issue-timeline"], + "summary": "XY-848 defines the required knowledge page benchmark dimensions." 
+ } + ], + "prompt": { + "role": "user", + "content": "Compile a project knowledge page for the ELF benchmark suite and report whether every section is cited or flagged unsupported.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "derived_pages_not_authoritative", + "flag_unsupported_sections", + "report_rebuild_determinism" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "derived_not_authoritative", + "text": "Generated knowledge pages remain derived benchmark artifacts, not authoritative production truth." + }, + { + "claim_id": "reference_patterns", + "text": "The page shape uses llm-wiki lint, gbrain compiled truth plus timeline, and graphify report patterns while preserving ELF provenance." + }, + { + "claim_id": "rebuild_deterministic", + "text": "The project page rebuild produced the same page hash in two fixture rebuild passes." + } + ], + "must_not_include": [ + "Generated knowledge pages are authoritative production truth.", + "The fixture proves private-corpus knowledge-page quality." 
+ ], + "evidence_links": { + "derived_not_authoritative": ["elf-knowledge-current-truth"], + "reference_patterns": ["elf-knowledge-history"], + "rebuild_deterministic": ["xy848-issue-timeline"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-knowledge-current-truth", + "claim_id": "derived_not_authoritative", + "requirement": "cite", + "quote": "source notes stay authoritative" + }, + { + "evidence_id": "elf-knowledge-history", + "claim_id": "reference_patterns", + "requirement": "cite", + "quote": "llm-wiki query-save-lint, gbrain compiled_truth plus timeline, and graphify graph report patterns" + }, + { + "evidence_id": "xy848-issue-timeline", + "claim_id": "rebuild_deterministic", + "requirement": "use", + "quote": "rebuild determinism" + } + ], + "negative_traps": [ + { + "trap_id": "old-authoritative-page", + "type": "stale_fact", + "evidence_ids": ["old-authoritative-page-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current derived-page truth and reference pattern rationale." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every generated page section cites source notes/events or is flagged unsupported." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Stale authoritative-page claim is linted and not used as current truth." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Compiled page includes current truth, history, issue timeline, and backlinks." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Rebuild record is deterministic enough for regression comparison." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "cite_partial_evidence" + }, + "tags": [ + "synthetic", + "knowledge", + "no_live_claim", + "benchmark_artifact" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 42e6c496..f5a5fee6 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -352,6 +352,8 @@ struct ProducedAnswer { claims: Vec, #[serde(default)] evidence_ids: Vec, + #[serde(default)] + pages: Vec, #[serde(skip_serializing_if = "Option::is_none")] latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -371,6 +373,58 @@ struct ProducedClaim { confidence: Option, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageArtifact { + page_id: String, + page_type: String, + title: String, + #[serde(skip_serializing_if = "Option::is_none")] + path: Option, + #[serde(default)] + sections: Vec, + #[serde(default)] + backlinks: Vec, + #[serde(default)] + lint_findings: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + rebuild: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageSection { + section_id: String, + heading: String, + role: String, + content: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(default)] + timeline_event_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + unsupported_reason: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageLintFinding { + finding_id: String, + finding_type: String, + severity: String, + text: String, + #[serde(default)] + evidence_ids: Vec, + 
#[serde(skip_serializing_if = "Option::is_none")] + trap_id: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageRebuild { + first_hash: String, + second_hash: String, + deterministic: bool, + #[serde(default)] + allowed_variance: Vec, +} + #[derive(Clone, Debug, Deserialize)] struct ConsolidationFixture { #[serde(default)] @@ -622,6 +676,8 @@ struct ReportSummary { operator_ux_gap_count: usize, #[serde(default)] consolidation: ConsolidationSummaryReport, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, } #[derive(Debug, Default, Deserialize, Serialize)] @@ -635,6 +691,23 @@ struct ConsolidationSummaryReport { executable_gap_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct KnowledgeSummary { + job_count: usize, + page_count: usize, + section_count: usize, + backlink_count: usize, + pages_with_backlinks: usize, + citation_coverage: f64, + stale_claim_detection: f64, + rebuild_determinism: f64, + backlink_coverage: f64, + page_usefulness: f64, + unsupported_summary_count: usize, + untraced_section_count: usize, + allowed_variance_count: usize, +} + #[derive(Debug, Deserialize, Serialize)] struct SuiteReport { suite_id: String, @@ -682,6 +755,8 @@ struct JobReport { latency_ms: Option, cost: Option, trace_explainability: Option, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -787,6 +862,29 @@ struct UnsupportedClaimReport { evidence_ids: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct KnowledgeJobMetrics { + page_count: usize, + section_count: usize, + traced_section_count: usize, + flagged_unsupported_section_count: usize, + untraced_section_count: usize, + unsupported_summary_count: usize, + backlink_count: usize, + pages_with_backlinks: usize, + stale_trap_count: usize, + stale_traps_detected: usize, + rebuild_page_count: usize, + deterministic_rebuild_count: usize, + 
rebuild_failure_count: usize, + allowed_variance_count: usize, + citation_coverage: f64, + stale_claim_detection: f64, + rebuild_determinism: f64, + backlink_coverage: f64, + page_usefulness: f64, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct EvolutionSummary { stale_answer_count: usize, @@ -832,6 +930,7 @@ struct JobScoring { hard_fail_hits: Vec, unsupported_claims: Vec, wrong_result_count: usize, + knowledge: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -859,6 +958,10 @@ struct FailureCounts { review_action_failures: usize, source_mutations: usize, blocking_executable_gaps: usize, + untraced_page_sections: usize, + missed_stale_findings: usize, + rebuild_failures: usize, + page_usefulness_failures: usize, } #[derive(Debug, Default)] @@ -976,6 +1079,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_expected_answer(job, path)?; validate_required_evidence(job, path)?; validate_consolidation_fixture(job, path)?; + validate_adapter_response(job, path)?; validate_scoring_rubric(job, path)?; validate_allowed_uncertainty(job, path)?; validate_operator_debug(job, path)?; @@ -1241,6 +1345,93 @@ fn validate_consolidation_proposal( Ok(()) } +fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(adapter_response) = &job.corpus.adapter_response else { + return Ok(()); + }; + let evidence_ids = corpus_evidence_ids(job); + let event_ids = timeline_event_ids(job); + + for page in &adapter_response.answer.pages { + validate_page_artifact(page, path, &evidence_ids, &event_ids)?; + } + + Ok(()) +} + +fn validate_page_artifact( + page: &DerivedPageArtifact, + path: &Path, + evidence_ids: &BTreeSet, + event_ids: &BTreeSet, +) -> Result<()> { + if page.page_id.trim().is_empty() + || page.page_type.trim().is_empty() + || page.title.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete derived page.", path.display())); + } + + for section in &page.sections { + if 
section.section_id.trim().is_empty() + || section.heading.trim().is_empty() + || section.role.trim().is_empty() + || section.content.trim().is_empty() + { + return Err(eyre::eyre!( + "{} page {} has an incomplete section.", + path.display(), + page.page_id + )); + } + + for evidence_id in &section.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for event_id in &section.timeline_event_ids { + ensure_known_event(path, event_ids, event_id)?; + } + } + for backlink in &page.backlinks { + if backlink.trim().is_empty() { + return Err(eyre::eyre!( + "{} page {} has an empty backlink.", + path.display(), + page.page_id + )); + } + } + for finding in &page.lint_findings { + if finding.finding_id.trim().is_empty() + || finding.finding_type.trim().is_empty() + || finding.severity.trim().is_empty() + || finding.text.trim().is_empty() + { + return Err(eyre::eyre!( + "{} page {} has an incomplete lint finding.", + path.display(), + page.page_id + )); + } + + for evidence_id in &finding.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + } + + if let Some(rebuild) = &page.rebuild + && (rebuild.first_hash.trim().is_empty() || rebuild.second_hash.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} page {} has an incomplete rebuild record.", + path.display(), + page.page_id + )); + } + + Ok(()) +} + fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) { return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); @@ -1595,6 +1786,22 @@ fn corpus_text_by_id(job: &RealWorldJob) -> BTreeMap<&str, &str> { .collect() } +fn timeline_event_ids(job: &RealWorldJob) -> BTreeSet { + job.timeline.iter().map(|event| event.event_id.clone()).collect() +} + +fn ensure_known_event(path: &Path, known: &BTreeSet, event_id: &str) -> Result<()> { + if !known.contains(event_id) { + return Err(eyre::eyre!( + "{} references unknown timeline event id {}.",
+ path.display(), + event_id + )); + } + + Ok(()) +} + fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result { if jobs.is_empty() { return Err(eyre::eyre!("At least one real_world_job fixture is required.")); @@ -1654,6 +1861,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { hard_fail_hits: Vec::new(), unsupported_claims: Vec::new(), wrong_result_count: 0, + knowledge: None, trap_ids_used, dimension_scores: declared_not_encoded_dimension_scores(job), reason: job @@ -1669,7 +1877,11 @@ fn score_job(job: &RealWorldJob) -> JobScoring { let missing_claims = missing_required_claims(job, answer); let forbidden_claims = forbidden_claim_hits(job, answer); let missing_evidence = missing_required_evidence(job, &produced_evidence); + let knowledge = knowledge_metrics(job, answer); let mut unsupported_claims = unsupported_claims(job, answer); + + unsupported_claims.extend(unsupported_page_claims(answer)); + let operator_counts = operator_debug_failure_counts(job); let latency_violations = latency_violations(job, answer); let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used); @@ -1698,6 +1910,12 @@ fn score_job(job: &RealWorldJob) -> JobScoring { review_action_failures: review_action_failures(consolidation.as_ref()), source_mutations: consolidation.as_ref().map_or(0, |report| report.source_mutation_count), blocking_executable_gaps: blocking_executable_gaps(consolidation.as_ref()), + untraced_page_sections: knowledge + .as_ref() + .map_or(0, |metrics| metrics.untraced_section_count), + missed_stale_findings: knowledge.as_ref().map_or(0, missed_stale_finding_count), + rebuild_failures: knowledge.as_ref().map_or(0, |metrics| metrics.rebuild_failure_count), + page_usefulness_failures: knowledge.as_ref().map_or(0, page_usefulness_failure_count), }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); @@ -1713,7 +1931,11 @@ fn score_job(job: &RealWorldJob) -> JobScoring { + 
counts.update_rationale_missing + counts.proposal_usefulness_failures + counts.lineage_failures - + counts.review_action_failures; + + counts.review_action_failures + + counts.untraced_page_sections + + counts.missed_stale_findings + + counts.rebuild_failures + + counts.page_usefulness_failures; let status = job_status( normalized_score, job.scoring_rubric.pass_threshold, @@ -1735,6 +1957,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { hard_fail_hits, unsupported_claims, wrong_result_count, + knowledge, trap_ids_used, dimension_scores, reason, @@ -1789,6 +2012,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { content: String::new(), claims: Vec::new(), evidence_ids: Vec::new(), + pages: Vec::new(), latency_ms: None, cost: None, trace_explainability: None, @@ -2024,6 +2248,145 @@ fn unsupported_claim_report(claim: &ProducedClaim, reason: &str) -> UnsupportedC } } +fn unsupported_page_claims(answer: &ProducedAnswer) -> Vec { + answer + .pages + .iter() + .flat_map(|page| { + page.sections.iter().filter_map(|section| { + if section_is_traced(section) || section_is_flagged_unsupported(section) { + return None; + } + + Some(UnsupportedClaimReport { + suite_id: String::new(), + job_id: String::new(), + claim_id: Some(format!("{}:{}", page.page_id, section.section_id)), + claim_text: bounded_text(section.content.as_str(), 240), + reason: + "derived page section has no source evidence and is not flagged unsupported" + .to_string(), + evidence_ids: section.evidence_ids.clone(), + }) + }) + }) + .collect() +} + +fn knowledge_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> Option { + if answer.pages.is_empty() { + return None; + } + + let mut metrics = KnowledgeJobMetrics { + page_count: answer.pages.len(), + stale_trap_count: stale_traps(job).len(), + ..KnowledgeJobMetrics::default() + }; + + for page in &answer.pages { + accumulate_page_metrics(page, &mut metrics); + } + + metrics.stale_traps_detected = stale_traps(job) + .iter() + 
.filter(|trap| page_artifacts_detect_stale_trap(&answer.pages, trap)) + .count(); + metrics.citation_coverage = ratio(metrics.traced_section_count, metrics.section_count); + metrics.stale_claim_detection = + ratio_or_full(metrics.stale_traps_detected, metrics.stale_trap_count); + metrics.rebuild_determinism = ratio(metrics.deterministic_rebuild_count, metrics.page_count); + metrics.backlink_coverage = ratio(metrics.pages_with_backlinks, metrics.page_count); + metrics.page_usefulness = round3( + (metrics.citation_coverage + + metrics.stale_claim_detection + + metrics.rebuild_determinism + + metrics.backlink_coverage) + / 4.0, + ); + + Some(metrics) +} + +fn stale_traps(job: &RealWorldJob) -> Vec<&NegativeTrap> { + job.negative_traps + .iter() + .filter(|trap| trap.trap_type == "stale_fact" && trap.failure_if_used) + .collect() +} + +fn accumulate_page_metrics(page: &DerivedPageArtifact, metrics: &mut KnowledgeJobMetrics) { + if !page.backlinks.is_empty() { + metrics.pages_with_backlinks += 1; + } + + metrics.backlink_count += page.backlinks.len(); + + for section in &page.sections { + metrics.section_count += 1; + + if section_is_traced(section) { + metrics.traced_section_count += 1; + } else if section_is_flagged_unsupported(section) { + metrics.flagged_unsupported_section_count += 1; + + if section.role == "summary" { + metrics.unsupported_summary_count += 1; + } + } else { + metrics.untraced_section_count += 1; + } + } + + if let Some(rebuild) = &page.rebuild { + if !rebuild.allowed_variance.is_empty() { + metrics.allowed_variance_count += 1; + } + if rebuild_is_acceptable(rebuild) { + metrics.deterministic_rebuild_count += 1; + } else { + metrics.rebuild_failure_count += 1; + } + } else { + metrics.rebuild_failure_count += 1; + } + + metrics.rebuild_page_count += 1; +} + +fn section_is_traced(section: &DerivedPageSection) -> bool { + !section.evidence_ids.is_empty() || !section.timeline_event_ids.is_empty() +} + +fn section_is_flagged_unsupported(section: 
&DerivedPageSection) -> bool { + section.unsupported_reason.as_ref().is_some_and(|reason| !reason.trim().is_empty()) +} + +fn rebuild_is_acceptable(rebuild: &DerivedPageRebuild) -> bool { + (rebuild.deterministic && rebuild.first_hash == rebuild.second_hash) + || !rebuild.allowed_variance.is_empty() +} + +fn page_artifacts_detect_stale_trap(pages: &[DerivedPageArtifact], trap: &NegativeTrap) -> bool { + pages.iter().any(|page| { + page.lint_findings.iter().any(|finding| { + finding.trap_id.as_deref() == Some(trap.trap_id.as_str()) + || finding + .evidence_ids + .iter() + .any(|evidence_id| trap.evidence_ids.contains(evidence_id)) + }) + }) +} + +fn missed_stale_finding_count(metrics: &KnowledgeJobMetrics) -> usize { + metrics.stale_trap_count.saturating_sub(metrics.stale_traps_detected) +} + +fn page_usefulness_failure_count(metrics: &KnowledgeJobMetrics) -> usize { + if metrics.page_usefulness < 0.8 { 1 } else { 0 } +} + fn hard_fail_hits( job: &RealWorldJob, unsupported_claims: &[UnsupportedClaimReport], @@ -2095,18 +2458,21 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.operator_debug_repair_unclear > 0 || counts.conflict_detection_missing > 0 || counts.proposal_usefulness_failures > 0 - || counts.review_action_failures > 0, + || counts.review_action_failures > 0 + || counts.page_usefulness_failures > 0, "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0 - || counts.lineage_failures > 0, - "trap_avoidance" => counts.trap_uses > 0, + || counts.lineage_failures > 0 + || counts.untraced_page_sections > 0, + "trap_avoidance" => counts.trap_uses > 0 || counts.missed_stale_findings > 0, "uncertainty_handling" => counts.unsupported_claims > 0, "lifecycle_behavior" => counts.stale_answers > 0 || counts.conflict_detection_missing > 0 || counts.update_rationale_missing > 0 - || counts.source_mutations > 0, + || counts.source_mutations > 0 + || counts.rebuild_failures > 0, "source_immutability" 
=> counts.source_mutations > 0, "proposal_usefulness" => counts.proposal_usefulness_failures > 0, "lineage_completeness" => counts.lineage_failures > 0, @@ -2180,42 +2546,17 @@ fn job_status( } fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64) -> String { + let wrong_result_signal_count = wrong_result_signal_count(counts); + match status { TypedStatus::Pass => format!("Job passed with normalized_score {normalized_score:.3}."), TypedStatus::UnsupportedClaim => format!( "Job produced {} unsupported claim(s), {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", - counts.unsupported_claims, - counts.missing_claims - + counts.forbidden_claims - + counts.missing_evidence - + counts.trap_uses - + counts.operator_debug_missing - + counts.operator_debug_raw_sql - + counts.operator_debug_trace_gaps - + counts.operator_debug_repair_unclear - + counts.conflict_detection_missing - + counts.update_rationale_missing - + counts.proposal_usefulness_failures - + counts.lineage_failures - + counts.review_action_failures, - counts.latency_violations + counts.unsupported_claims, wrong_result_signal_count, counts.latency_violations ), TypedStatus::WrongResult => format!( "Job produced {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", - counts.missing_claims - + counts.forbidden_claims - + counts.missing_evidence - + counts.trap_uses - + counts.operator_debug_missing - + counts.operator_debug_raw_sql - + counts.operator_debug_trace_gaps - + counts.operator_debug_repair_unclear - + counts.conflict_detection_missing - + counts.update_rationale_missing - + counts.proposal_usefulness_failures - + counts.lineage_failures - + counts.review_action_failures, - counts.latency_violations + wrong_result_signal_count, counts.latency_violations ), TypedStatus::LifecycleFail => format!( "Job produced {} source mutation(s) and normalized_score {normalized_score:.3}.", @@ -2229,6 
+2570,26 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 } } +fn wrong_result_signal_count(counts: &FailureCounts) -> usize { + counts.missing_claims + + counts.forbidden_claims + + counts.missing_evidence + + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures + + counts.untraced_page_sections + + counts.missed_stale_findings + + counts.rebuild_failures + + counts.page_usefulness_failures +} + fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { let answer = produced_answer(job); let metrics = job_metrics(job, answer); @@ -2266,6 +2627,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { latency_ms: answer.latency_ms, cost: answer.cost.clone(), trace_explainability: answer.trace_explainability.clone(), + knowledge: scoring.knowledge, trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -2747,6 +3109,7 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { .map(|debug| debug.ux_gaps.len()) .sum(), consolidation: consolidation_summary(jobs), + knowledge: knowledge_summary(jobs), ..ReportSummary::default() }; @@ -2821,6 +3184,10 @@ fn ratio_or(numerator: usize, denominator: usize, empty_value: f64) -> f64 { if denominator == 0 { empty_value } else { round3(numerator as f64 / denominator as f64) } } +fn ratio_or_full(numerator: usize, denominator: usize) -> f64 { + ratio_or(numerator, denominator, 1.0) +} + fn consolidation_summary(jobs: &[JobReport]) -> ConsolidationSummaryReport { let reports = jobs.iter().filter_map(|job| job.consolidation.as_ref()).collect::>(); @@ -2854,6 +3221,60 @@ fn consolidation_summary(jobs: 
&[JobReport]) -> ConsolidationSummaryReport { } } +fn knowledge_summary(jobs: &[JobReport]) -> Option { + let knowledge_jobs = jobs.iter().filter_map(|job| job.knowledge.as_ref()).collect::>(); + + if knowledge_jobs.is_empty() { + return None; + } + + let job_count = knowledge_jobs.len(); + let page_count = knowledge_jobs.iter().map(|metrics| metrics.page_count).sum::(); + let section_count = knowledge_jobs.iter().map(|metrics| metrics.section_count).sum::(); + let traced_section_count = + knowledge_jobs.iter().map(|metrics| metrics.traced_section_count).sum::(); + let stale_trap_count = + knowledge_jobs.iter().map(|metrics| metrics.stale_trap_count).sum::(); + let stale_traps_detected = + knowledge_jobs.iter().map(|metrics| metrics.stale_traps_detected).sum::(); + let deterministic_rebuild_count = + knowledge_jobs.iter().map(|metrics| metrics.deterministic_rebuild_count).sum::(); + let rebuild_page_count = + knowledge_jobs.iter().map(|metrics| metrics.rebuild_page_count).sum::(); + let backlink_count = knowledge_jobs.iter().map(|metrics| metrics.backlink_count).sum::(); + let pages_with_backlinks = + knowledge_jobs.iter().map(|metrics| metrics.pages_with_backlinks).sum::(); + let page_usefulness = round3( + knowledge_jobs.iter().map(|metrics| metrics.page_usefulness).sum::() + / job_count as f64, + ); + + Some(KnowledgeSummary { + job_count, + page_count, + section_count, + backlink_count, + pages_with_backlinks, + citation_coverage: ratio(traced_section_count, section_count), + stale_claim_detection: ratio_or_full(stale_traps_detected, stale_trap_count), + rebuild_determinism: ratio(deterministic_rebuild_count, rebuild_page_count), + backlink_coverage: ratio(pages_with_backlinks, page_count), + page_usefulness, + unsupported_summary_count: knowledge_jobs + .iter() + .map(|metrics| metrics.unsupported_summary_count) + .sum(), + untraced_section_count: knowledge_jobs + .iter() + .map(|metrics| metrics.untraced_section_count) + .sum(), + allowed_variance_count: 
knowledge_jobs + .iter() + .map(|metrics| metrics.allowed_variance_count) + .sum(), + }) +} + fn mean_score(jobs: &[JobReport]) -> f64 { if jobs.is_empty() { return 0.0; @@ -2983,6 +3404,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_evolution(&mut out, report); render_markdown_trace_explainability(&mut out, report); render_markdown_consolidation(&mut out, report); + render_markdown_knowledge(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); render_markdown_semantics(&mut out, report); @@ -3094,6 +3516,28 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat report.summary.trace_incomplete_count )); out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); + + if let Some(knowledge) = &report.summary.knowledge { + out.push_str(&format!( + "- Knowledge citation coverage: `{:.3}`\n", + knowledge.citation_coverage + )); + out.push_str(&format!( + "- Stale claim detection: `{:.3}`\n", + knowledge.stale_claim_detection + )); + out.push_str(&format!("- Rebuild determinism: `{:.3}`\n", knowledge.rebuild_determinism)); + out.push_str(&format!( + "- Backlinks: `{}` total, `{:.3}` page coverage\n", + knowledge.backlink_count, knowledge.backlink_coverage + )); + out.push_str(&format!("- Page usefulness: `{:.3}`\n", knowledge.page_usefulness)); + out.push_str(&format!( + "- Unsupported summary count: `{}`\n", + knowledge.unsupported_summary_count + )); + } + out.push_str(&format!( "- Private corpus redaction: `{}`\n\n", md_inline(report.private_corpus_redaction.policy.as_str()) @@ -3451,6 +3895,42 @@ fn render_markdown_consolidation_gaps(out: &mut String, report: &RealWorldReport out.push('\n'); } +fn render_markdown_knowledge(out: &mut String, report: &RealWorldReport) { + let knowledge_jobs = + report.jobs.iter().filter(|job| job.knowledge.is_some()).collect::>(); + + if 
knowledge_jobs.is_empty() { + return; + } + + out.push_str("## Knowledge Page Metrics\n\n"); + out.push_str("| Job | Pages | Sections | Citation Coverage | Stale Claim Detection | Rebuild Determinism | Page Usefulness | Backlinks | Unsupported Summaries | Untraced Sections | Allowed Variance |\n"); + out.push_str("| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n"); + + for job in knowledge_jobs { + let Some(knowledge) = &job.knowledge else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{:.3}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + knowledge.page_count, + knowledge.section_count, + knowledge.citation_coverage, + knowledge.stale_claim_detection, + knowledge.rebuild_determinism, + knowledge.page_usefulness, + knowledge.backlink_count, + knowledge.unsupported_summary_count, + knowledge.untraced_section_count, + knowledge.allowed_variance_count + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -3520,6 +4000,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { ); out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); + out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. 
Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); out.push_str("## Suites With `not_encoded` Status\n\n"); if report.not_encoded_suites.is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 9f6b7217..cc665cb4 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -48,6 +48,10 @@ fn consolidation_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("consolidation") } +fn knowledge_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("knowledge") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -150,7 +154,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(25)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(27)); Ok(()) } @@ -256,6 +260,77 @@ fn consolidation_fixtures_report_reviewable_proposal_metrics() -> Result<()> { Ok(()) } +#[test] +fn knowledge_fixtures_report_page_metrics() -> Result<()> { + let report = run_json_report_from(knowledge_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/knowledge/section_count").and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + 
report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.9) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/rebuild_determinism").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_count").and_then(Value::as_u64), + Some(9) + ); + assert_eq!( + report.pointer("/summary/knowledge/pages_with_backlinks").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.969) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/knowledge/allowed_variance_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let knowledge_suite = find_by_field(suites, "/suite_id", "knowledge_compilation")?; + + assert_eq!(knowledge_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); + + let jobs = array_at(&report, "/jobs")?; + let project_page_job = find_by_field(jobs, "/job_id", "knowledge-project-page-001")?; + + assert_eq!( + project_page_job.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + project_page_job.pointer("/knowledge/untraced_section_count").and_then(Value::as_u64), + Some(0) + ); + + Ok(()) +} + #[test] fn generated_json_report_renders_markdown() -> Result<()> { let report = run_json_report()?; @@ -295,23 +370,70 @@ fn generated_json_report_renders_markdown() -> Result<()> { Ok(()) } +#[test] +fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { + let 
report = run_json_report_from(knowledge_fixture_dir())?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-knowledge-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("knowledge-report.json"); + let markdown_path = temp_dir.join("knowledge-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Knowledge Page Metrics")); + assert!(markdown.contains("Knowledge citation coverage")); + assert!(markdown.contains("Backlinks: `9` total")); + assert!(markdown.contains("Unsupported summary count")); + assert!(markdown.contains("knowledge-project-page-001")); + assert!(markdown.contains("knowledge-entity-concept-002")); + + Ok(()) +} + +fn assert_root_knowledge_summary(report: &Value) { + assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.969) + ); +} + #[test] fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(25)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(23)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(27)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(25)); 
assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(3)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.929) + Some(0.938) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.022) + Some(0.02) ); assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); @@ -341,12 +463,12 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(49) + Some(55) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(46)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.939)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.939)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.939)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(52)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.945)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.945)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.945)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) @@ -370,6 +492,8 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { Some(1) ); + 
assert_root_knowledge_summary(&report); + let suites = array_at(&report, "/suites")?; for suite_id in [ @@ -379,6 +503,7 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { "capture_integration", "personalization", "consolidation", + "knowledge_compilation", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 2829e253..a0409e6d 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -38,7 +38,8 @@ cleanup, use `docs/guide/single_user_production.md`. operator-debugging UX report with trace/viewer links, raw-SQL avoidance, root-cause step counts, dropped-candidate visibility, and repair-action clarity. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world - agent memory benchmark contract, including suite taxonomy and typed report states. + agent memory benchmark contract, including suite taxonomy, typed report states, and + the knowledge-compilation fixture task. - `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution jobs for current facts, historical facts, stale traps, conflicts, update rationales, and temporal graph limitations. @@ -50,8 +51,8 @@ cleanup, use `docs/guide/single_user_production.md`. summaries and durable scripts. - Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; commit fixture schemas, smoke fixtures, runner code, and durable docs only. -- Keep generated real-world memory trust/personalization JSON and Markdown under - `tmp/real-world-memory/`; commit fixtures, runner code, and durable docs only. +- Keep generated real-world memory trust/personalization/knowledge JSON and Markdown + under `tmp/real-world-memory/`; commit fixtures, runner code, and durable docs only. - Link the newest decision-relevant report from README and this index. 
- When benchmark semantics change, update `live_baseline_benchmark.md` and the relevant spec before publishing a new result. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 31294eee..5d5f0387 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -374,6 +374,25 @@ The consolidation fixtures live under proposal payloads, source lineage, review action outcomes, executable gaps, and source mutation count. They do not claim live scheduled consolidation-worker generation. +To run the checked-in knowledge-compilation and page-rebuild fixtures: + +```sh +cargo make real-world-memory-knowledge +``` + +Artifacts: + +```text +tmp/real-world-memory/knowledge-report.json +tmp/real-world-memory/knowledge-report.md +``` + +The knowledge fixtures live under +`apps/elf-eval/fixtures/real_world_memory/knowledge/`. They score derived page +citation coverage, stale-claim linting, rebuild determinism, backlink coverage, page +usefulness, and explicitly flagged unsupported summaries. Generated pages are +benchmark artifacts, not source-truth replacements. + ## Clean Up ```sh diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 16f63169..305ec553 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -245,6 +245,21 @@ These fixtures encode proposal expectations only. They do not claim that a live scheduled consolidation worker generated the proposals; the report records that missing primitive as an executable gap with a follow-up issue title. 
+Current checked-in knowledge-compilation increment: + +```sh +cargo make real-world-memory-knowledge +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/knowledge/`, writes +`tmp/real-world-memory/knowledge-report.json`, and renders +`tmp/real-world-memory/knowledge-report.md`. The fixtures include synthetic project, +entity, concept, and issue-timeline page artifacts. Generated pages are benchmark +artifacts only: every section must cite source evidence or timeline events, or it must +be explicitly flagged unsupported. The report publishes citation coverage, stale claim +detection, rebuild determinism, aggregate backlink counts and page coverage, page +usefulness, unsupported summary count, and untraced section count. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 9cad1941..d1aefae9 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -191,6 +191,65 @@ An answer that states a required claim without any acceptable evidence link is a `unsupported_claim` unless the job's `allowed_uncertainty` explicitly permits an uncited low-confidence statement. +### Optional `adapter_response.answer.pages` + +Knowledge-compilation fixtures MAY include generated page artifacts in +`corpus.adapter_response.answer.pages[]`. These page artifacts are benchmark outputs, +not authoritative source truth. Any checked-in generated page fixture MUST be clearly +marked as a benchmark artifact. + +Each page entry MUST include the following fields, except those marked optional: + +- `page_id`: stable page identifier, such as `project:elf-benchmark-suite`. +- `page_type`: `project`, `entity`, `concept`, `issue_timeline`, or another + fixture-defined type.
+- `title`: human-readable page title. +- `path`: optional fixture path for a checked-in benchmark artifact page. +- `sections`: generated page sections. +- `backlinks`: zero or more page, entity, concept, issue, or evidence identifiers. +- `lint_findings`: zero or more stale, unsupported, or contradiction findings. +- `rebuild`: optional rebuild comparison record. + +Each `sections[]` entry MUST include the following fields, except those marked optional: + +- `section_id` +- `heading` +- `role`: examples include `current_truth`, `history`, `timeline`, `backlinks`, and + `summary`. +- `content`: bounded fixture text. +- `evidence_ids`: zero or more ids from `corpus.items[]`. +- `timeline_event_ids`: zero or more ids from `timeline[]`. +- `unsupported_reason`: optional reason why the section is intentionally unsupported. + +Every generated page section MUST trace back to at least one entry in `evidence_ids` or +`timeline_event_ids`, or it MUST include `unsupported_reason`. A section that lacks both +trace evidence and an unsupported flag is an `unsupported_claim`. A section with +`role = "summary"` and `unsupported_reason` is counted as an unsupported summary, but it +is not a hidden unsupported claim because the page explicitly marks the gap. + +Each `lint_findings[]` entry SHOULD include: + +- `finding_id` +- `finding_type`: for example `stale_claim`, `unsupported_section`, or + `contradiction`. +- `severity` +- `text` +- `evidence_ids` +- `trap_id`: optional link to `negative_traps[]`. + +Each `rebuild` record SHOULD include: + +- `first_hash` +- `second_hash` +- `deterministic`: true when repeat rebuilds produced byte-stable output. +- `allowed_variance`: explanations for accepted non-semantic variance. + +Knowledge-compilation reports SHOULD include citation coverage, stale claim detection, +rebuild determinism, page usefulness, backlink counts, unsupported summary count, and +untraced section count.
Rebuild results are acceptable only when repeated output is +deterministic enough for regression comparison or every allowed variance is explicitly +reported. + ### `negative_traps` Negative traps MUST be explicit so systems are tested against realistic memory failure @@ -387,6 +446,9 @@ Reports MUST include: stages, especially for wrong-result stage attribution; - per-suite typed status and score distribution; - unsupported claim list with claim text or a bounded redacted description; +- for encoded knowledge-compilation jobs with page artifacts: citation coverage, stale + claim detection, rebuild determinism, page usefulness, backlink counts, unsupported + summary count, and untraced section count; - explicit `not_encoded` suite list; - private-corpus redaction policy when private fixtures are used. - capture/integration coverage classes when any fixture declares `capture_behaviors`,