hack-ink · yvette-carlisle · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/Makefile.toml b/Makefile.toml
@@ -418,6 +418,7 @@ args = [
 # | real-world-memory-consolidation        | composite |     |
 # | real-world-memory-consolidation-json   | command   |     |
 # | real-world-memory-consolidation-report | command   |     |
+# | real-world-memory-live-consolidation   | command   |     |
 # | real-world-job-operator-ux             | composite |     |
 # | real-world-job-operator-ux-json        | command   |     |
 # | real-world-job-operator-ux-report      | command   |     |
@@ -830,6 +831,14 @@ args = [
 	"tmp/real-world-memory/consolidation/report.md",
 ]
 
+[tasks.real-world-memory-live-consolidation]
+workspace = false
+command = "bash"
+args = [
+	"-lc",
+	"docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_CONSOLIDATION_LIVE_REPORT_DIR -e ELF_CONSOLIDATION_LIVE_FIXTURES baseline-runner bash scripts/real-world-consolidation-live-adapter.sh",
+]
+
 [tasks.real-world-memory-core-archival]
 workspace = false
 dependencies = [

diff --git a/README.md b/README.md
@@ -181,6 +181,14 @@ provider-backed ELF evidence was required.
   evidence fields. qmd remains `wrong_result` on the same slice, but this is not a
   broad qmd, Graphiti/Zep, mem0/OpenMemory, Letta, hosted-memory, or private-corpus
   superiority claim.
+- Live consolidation proposal scoring after XY-934: `cargo make
+  real-world-memory-live-consolidation` runs the consolidation fixture slice through
+  `ElfService` consolidation run creation, worker proposal materialization, and
+  apply/defer/discard review audit transitions. ELF passes 4/4 live consolidation jobs
+  with complete lineage, one unsupported-claim flag preserved, and zero source
+  mutations. Managed dreaming and Always-On Memory Agent patterns remain product
+  references, not direct live competitors, because no contained runner emits comparable
+  artifacts.
 - Live operator-debugging slice after XY-932: `cargo make
   real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated
   `live_real_world` records for ELF and qmd over the operator-debugging fixtures.
@@ -255,6 +263,7 @@ Detailed evidence and interpretation:
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md)
 - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md)
+- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md)
 - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
 - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
@@ -335,6 +344,7 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md)
 - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md)
+- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md)
 - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
 - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
@@ -347,7 +357,8 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json)
 
 Latest real-world benchmark report: June 16, 2026. Latest external research refresh:
-June 11, 2026.
+June 11, 2026; June 16 adds live temporal reconciliation and live consolidation
+self-check evidence.
 
 ## Documentation
 

diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -175,6 +175,21 @@ fn capture_write_policy_live_markdown_path() -> Result<PathBuf> {
 		.join("2026-06-11-capture-write-policy-live-report.md"))
 }
 
+/// Builds the path of the live consolidation proposal scoring JSON report in `docs/research`.
+fn live_consolidation_proposal_scoring_report_path() -> Result<PathBuf> {
+	let research_dir = workspace_root()?.join("docs").join("research");
+
+	Ok(research_dir.join("2026-06-16-live-consolidation-proposal-scoring-report.json"))
+}
+
+/// Builds the path of the live consolidation proposal scoring markdown report in
+/// `docs/guide/benchmarking`, starting from the workspace root.
+fn live_consolidation_proposal_scoring_markdown_path() -> Result<PathBuf> {
+	let benchmarking_dir = workspace_root()?.join("docs").join("guide").join("benchmarking");
+
+	Ok(benchmarking_dir.join("2026-06-16-live-consolidation-proposal-scoring-report.md"))
+}
+
 fn temporal_history_competitor_gap_json_path() -> Result<PathBuf> {
 	Ok(workspace_root()?
 		.join("docs")
@@ -2021,6 +2036,124 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result<
 	Ok(())
 }
 
+/// Checks the live consolidation proposal scoring report JSON and markdown, plus the README,
+/// benchmarking index, benchmark guide, Makefile, live script, and live adapter source text.
+#[test]
+fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result<()> {
+	let workspace = workspace_root()?;
+	let report_json = fs::read_to_string(live_consolidation_proposal_scoring_report_path()?)?;
+	let report = serde_json::from_str::<Value>(&report_json)?;
+	let markdown = fs::read_to_string(live_consolidation_proposal_scoring_markdown_path()?)?;
+	let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?;
+	let readme = fs::read_to_string(readme_path()?)?;
+	let benchmarking_dir = workspace.join("docs").join("guide").join("benchmarking");
+	let benchmark_guide =
+		fs::read_to_string(benchmarking_dir.join("real_world_agent_memory_benchmark.md"))?;
+	let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?;
+	let live_script =
+		fs::read_to_string(workspace.join("scripts/real-world-consolidation-live-adapter.sh"))?;
+	let live_adapter =
+		fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?;
+
+	// Report identity and the ELF live suite status.
+	assert_eq!(
+		report.pointer("/schema").and_then(Value::as_str),
+		Some("elf.live_consolidation_proposal_scoring_report/v1")
+	);
+	assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-934"));
+	assert_eq!(
+		report
+			.pointer("/live_consolidation_results/elf_live_real_world/suite_status")
+			.and_then(Value::as_str),
+		Some("pass")
+	);
+
+	// ELF live counters; the pointer is attached so a failure names the field.
+	for (pointer, expected) in [
+		("/live_consolidation_results/elf_live_real_world/encoded_job_count", 4),
+		("/live_consolidation_results/elf_live_real_world/proposal_count", 4),
+		("/live_consolidation_results/elf_live_real_world/source_mutation_count", 0),
+		("/live_consolidation_results/elf_live_real_world/review_event_count", 6),
+	] {
+		assert_eq!(report.pointer(pointer).and_then(Value::as_u64), Some(expected), "{pointer}");
+	}
+
+	assert_eq!(
+		report
+			.pointer("/live_consolidation_results/qmd_live_real_world/suite_status")
+			.and_then(Value::as_str),
+		Some("not_encoded")
+	);
+
+	let jobs = array_at(&report, "/jobs")?;
+	let project_summary =
+		find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?;
+	let preference =
+		find_by_field(jobs, "/job_id", "consolidation-preference-candidate-defer-001")?;
+	let contradiction =
+		find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?;
+
+	// Per-job review outcomes for the apply, defer, and discard jobs.
+	assert_eq!(
+		project_summary.pointer("/final_review_state").and_then(Value::as_str),
+		Some("applied")
+	);
+	assert_eq!(project_summary.pointer("/review_event_count").and_then(Value::as_u64), Some(2));
+	assert_eq!(preference.pointer("/final_review_state").and_then(Value::as_str), Some("archived"));
+	assert_eq!(
+		contradiction.pointer("/final_review_state").and_then(Value::as_str),
+		Some("rejected")
+	);
+	assert_eq!(
+		contradiction.pointer("/unsupported_claim_flag_count").and_then(Value::as_u64),
+		Some(1)
+	);
+	assert_eq!(contradiction.pointer("/source_lineage_count").and_then(Value::as_u64), Some(3));
+
+	// Reference positions recorded for qmd and the two product-reference entries.
+	let positions = array_at(&report, "/reference_positions")?;
+
+	for (project, expected) in [
+		("qmd", "untested"),
+		("managed_dreaming_memory_systems", "product_reference"),
+		("always_on_memory_agent_patterns", "product_reference"),
+	] {
+		let entry = find_by_field(positions, "/project", project)?;
+
+		assert_eq!(
+			entry.pointer("/position").and_then(Value::as_str),
+			Some(expected),
+			"{project}"
+		);
+	}
+
+	// Required text in the docs, Makefile, and live script; the needle names a failure.
+	for (text, needle) in [
+		(&markdown, "ELF now has service-backed live consolidation proposal scoring"),
+		(&markdown, "This is not scheduled production consolidation"),
+		(&markdown, "Source mutations"),
+		(&markdown, "Do not mix knowledge-page rebuild/lint scoring"),
+		(&benchmarking_index, "2026-06-16-live-consolidation-proposal-scoring-report.md"),
+		(&readme, "Live Consolidation Proposal Scoring Report - June 16, 2026"),
+		(&readme, "real-world-memory-live-consolidation"),
+		(&benchmark_guide, "Current live consolidation increment"),
+		(&benchmark_guide, "tmp/real-world-memory/live-consolidation/summary.json"),
+		(&makefile, "[tasks.real-world-memory-live-consolidation]"),
+		(&makefile, "scripts/real-world-consolidation-live-adapter.sh"),
+		(&live_script, "elf.real_world_consolidation_live_adapter_sweep/v1"),
+		(&live_script, "real_world_live_adapter -- elf"),
+	] {
+		assert!(text.contains(needle), "{needle}");
+	}
+
+	// The live script must not contain a qmd adapter invocation.
+	assert!(!live_script.contains("real_world_live_adapter -- qmd"));
+	assert!(live_adapter.contains("fn materialize_elf_consolidation("));
+	assert!(live_adapter.contains("ConsolidationProposalReviewRequest"));
+
+	Ok(())
+}
+
 fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> {
 	let suites = array_at(adapter, "/suites")?;
 	let capabilities = array_at(adapter, "/capabilities")?;
@@ -3016,6 +3149,7 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul
 	let work_resume = find_by_field(scenarios, "/scenario_id", "work_resume")?;
 	let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?;
 	let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?;
+	let consolidation = find_by_field(scenarios, "/scenario_id", "consolidation")?;
 
 	assert!(
 		retrieval_debug
@@ -3051,6 +3185,20 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul
 			.and_then(Value::as_str)
 			.is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export"))
 	);
+	assert!(
+		consolidation
+			.pointer("/current_elf_evidence")
+			.and_then(Value::as_str)
+			.is_some_and(|claim| claim.contains("XY-934 adds live_real_world")
+				&& claim.contains("zero source mutations"))
+	);
+	assert!(
+		consolidation
+			.pointer("/current_competitor_evidence")
+			.and_then(Value::as_str)
+			.is_some_and(|claim| claim.contains("qmd remains not_encoded")
+				&& claim.contains("product references only"))
+	);
 
 	let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?;
 
@@ -3927,12 +4075,24 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 
 	assert_eq!(
 		consolidation.pointer("/comparison_judgment").and_then(Value::as_str),
-		Some("not_tested")
+		Some("improved")
 	);
 	assert_eq!(
 		consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64),
 		Some(1)
 	);
+	assert_eq!(consolidation.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4));
+	assert_eq!(
+		consolidation.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64),
+		Some(0)
+	);
+	assert!(
+		consolidation
+			.pointer("/post_stage_basis")
+			.and_then(Value::as_str)
+			.is_some_and(|basis| basis.contains("apply/defer/discard audit")
+				&& basis.contains("zero source mutations"))
+	);
 
 	let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?;
 
@@ -3948,6 +4108,7 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 	assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11));
 	assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?);
 	assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?);
+	assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?);
 	assert!(array_at(ledger, "/summary/regressed")?.is_empty());
 	assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?);
 	assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?);
@@ -3959,15 +4120,18 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 
 fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) {
 	assert!(
-		markdown.contains("`improved`: current-vs-historical correctness and preference evolution")
+		markdown
+			.contains("`improved`: current-vs-historical correctness, preference evolution, and")
+			&& markdown.contains("reviewable consolidation")
 	);
 	assert!(markdown.contains("`regressed`: none"));
 	assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs"));
 	assert!(markdown.contains("XY-905"));
 	assert!(
 		markdown
-			.contains("Do not claim this ledger fixes preference history against mem0/OpenMemory")
+			.contains("Do not claim this ledger proves preference history against mem0/OpenMemory")
 	);
+	assert!(markdown.contains("Reviewable consolidation now has ELF live service-backed"));
 }
 
 #[test]

diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md
@@ -112,7 +112,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 |
 | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 |
 | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 |
-| Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 |
+| Consolidation/proposal review | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF fixture consolidation passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors. | XY-934 |
 | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 |
 | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 |
 | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 |
@@ -131,7 +131,8 @@ results, or lifecycle failures into one aggregate leaderboard.
 | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. |
 | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. |
 | XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. |
-| XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. |
+| XY-926 | P1 | Partial live suites encoded | ELF live knowledge-page scoring is encoded; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. |
+| XY-934 | P1 | ELF live self-check encoded | Live consolidation proposal scoring is encoded for ELF with lineage, confidence/usefulness, unsupported-claim flags, and review-action audit; direct competitor runners remain untested or product-reference only. |
 | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. |
 | XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. |
 | XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. |