hack-ink · yvette-carlisle · Jun 10, 2026 · Jun 10, 2026
diff --git a/README.md b/README.md
@@ -143,6 +143,10 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and
   passed same-corpus retrieval but failed lifecycle/cold-start coverage. memsearch,
   mem0, OpenViking, and claude-mem remained `incomplete` or wrong-result typed states;
   those states are reported as limitations, not hidden as proof.
+- Real-world agent memory aggregate after the P1 benchmark batch: 38 fixture-backed
+  jobs across 11 suites, of which 35 pass, 1 is incomplete, and 2 are blocked, with
+  0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The 3 remaining
+  non-pass jobs are production-ops operator boundaries, not hidden benchmark wins.
 - The benchmark runner and report publisher are checked in and Docker-isolated:
   `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`,
   `cargo make baseline-production-private-addendum`,
@@ -157,19 +161,30 @@ Detailed evidence and interpretation:
 - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md)
 - [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md)
 - [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md)
+- [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [Single-User Production Runbook](docs/guide/single_user_production.md)
-- Future benchmark contract:
+- Benchmark contract:
   [Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md).
-  This contract defines job-level suites for agent work. Checked-in fixture runners now
-  cover a smoke work-resume slice and proposal-only consolidation cases through
-  `cargo make real-world-job-smoke` and `cargo make real-world-memory-consolidation`,
-  and `cargo make real-world-memory` now reports the first external adapter coverage
-  manifest for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and
-  OpenViking. Those real-world reports still distinguish fixture-backed and
-  live-baseline-only evidence from true live real-world adapter runs; no external
-  project has a live real-world suite win until an adapter actually executes
-  `real_world_job` prompts and scoring.
+  This contract defines job-level suites for agent work. `cargo make real-world-memory`
+  now reports fixture-backed ELF evidence plus the external adapter coverage manifest
+  for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and OpenViking.
+  The report still distinguishes fixture-backed and live-baseline-only evidence from
+  true live real-world adapter runs; no external project has a live real-world suite win
+  until an adapter actually executes `real_world_job` prompts and scoring.
+
+Evidence-backed position after the June 10 real-world report:
+
+- ELF is better evidenced than the tested alternatives on evidence-bound writes,
+  deterministic ingestion boundaries, Postgres source-of-truth plus rebuildable Qdrant
+  indexing, scoped service APIs, and fixture-backed provenance/resume/evolution checks.
+- ELF and qmd are both strong in the current encoded retrieval evidence: qmd remains
+  the local retrieval-debug baseline, while ELF has the stronger service and provenance
+  contract.
+- ELF is still behind or not yet proven on live real-world external adapters,
+  private-corpus production quality, credentialed production-ops gates, qmd-style local
+  debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, OpenViking-style
+  context trajectory, and hosted managed memory.
 
 Quick comparison snapshot (objective/high-level).
 This table compares capability coverage, not overall project quality.
@@ -222,7 +237,8 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Agent Memory Selection Research Run](docs/research/2026-06-08-agent-memory-selection.json)
 - [Real-World Benchmark Dimension Research Run](docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json)
 
-Latest external research refresh: June 9, 2026.
+Latest real-world benchmark report: June 10, 2026. Latest external research refresh:
+June 9, 2026.
 
 ## Documentation
 

diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json
@@ -20,21 +20,21 @@
       "evidence_class": "fixture_backed",
       "docker_default": true,
       "host_global_installs_required": false,
-      "overall_status": "wrong_result",
+      "overall_status": "incomplete",
       "setup": {
         "status": "pass",
         "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.",
         "command": "cargo make real-world-memory",
         "artifact": "tmp/real-world-memory/real-world-memory-report.json"
       },
       "run": {
-        "status": "wrong_result",
-        "evidence": "The current fixture set reports 27 jobs, 25 pass, 1 wrong_result, and 1 not_encoded.",
+        "status": "incomplete",
+        "evidence": "The current fixture set reports 38 jobs: 35 pass, 1 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.",
         "command": "cargo make real-world-memory",
         "artifact": "tmp/real-world-memory/real-world-memory-report.json"
       },
       "result": {
-        "status": "wrong_result",
+        "status": "incomplete",
         "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.",
         "artifact": "tmp/real-world-memory/real-world-memory-report.md"
       },
@@ -66,40 +66,50 @@
           "status": "pass",
           "evidence": "Checked-in work-resume fixtures are encoded and passing."
         },
+        {
+          "suite_id": "project_decisions",
+          "status": "pass",
+          "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats."
+        },
         {
           "suite_id": "retrieval",
           "status": "pass",
-          "evidence": "Checked-in retrieval fixtures are encoded; one deliberate operator-debug wrong-result case is reported under operator_debugging_ux."
+          "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context."
         },
         {
           "suite_id": "memory_evolution",
-          "status": "not_encoded",
-          "evidence": "The relation temporal-validity case is deliberately not_encoded until temporal graph validity is implemented."
+          "status": "pass",
+          "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts, and the relation temporal-validity case is encoded."
         },
         {
-          "suite_id": "operator_debugging_ux",
-          "status": "wrong_result",
-          "evidence": "The aggregate fixture set includes one deliberate wrong-result trace attribution case."
+          "suite_id": "consolidation",
+          "status": "pass",
+          "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation."
         },
         {
-          "suite_id": "capture_integration",
+          "suite_id": "knowledge_compilation",
           "status": "pass",
-          "evidence": "The redaction and capture-boundary fixture is encoded and passing."
+          "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics."
         },
         {
-          "suite_id": "personalization",
+          "suite_id": "operator_debugging_ux",
           "status": "pass",
-          "evidence": "The scoped preference fixture is encoded and passing."
+          "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL."
         },
         {
-          "suite_id": "consolidation",
+          "suite_id": "capture_integration",
           "status": "pass",
-          "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation."
+          "evidence": "The redaction and capture-boundary fixture is encoded and passing."
         },
         {
-          "suite_id": "knowledge_compilation",
+          "suite_id": "production_ops",
+          "status": "incomplete",
+          "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, and resource-envelope interpretation, plus typed incomplete and blocked operator boundaries."
+        },
+        {
+          "suite_id": "personalization",
           "status": "pass",
-          "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics."
+          "evidence": "The scoped preference fixture is encoded and passing."
         }
       ],
       "evidence": [
@@ -115,7 +125,8 @@
         }
       ],
       "notes": [
-        "This adapter record exists to keep ELF fixture results separate from live external adapter results."
+        "This adapter record exists to keep ELF fixture results separate from live external adapter results.",
+        "The remaining non-pass ELF fixture states are production-ops operator boundaries: a Docker local-embedding dependency, provider credentials, and an operator-owned private corpus manifest."
       ],
       "follow_up": {
         "title": "[ELF benchmark vNext] Replace fixture-only ELF answers with live real-world adapter execution where appropriate",

diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -224,7 +224,7 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()>
 		report
 			.pointer("/external_adapters/summary/overall_status_counts/wrong_result")
 			.and_then(Value::as_u64),
-		Some(4)
+		Some(3)
 	);
 	assert_eq!(
 		report
@@ -236,7 +236,7 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()>
 		report
 			.pointer("/external_adapters/summary/overall_status_counts/incomplete")
 			.and_then(Value::as_u64),
-		Some(1)
+		Some(2)
 	);
 	assert_eq!(
 		report
@@ -258,6 +258,7 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()>
 	let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?;
 
 	assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
+	assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("incomplete"));
 	assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass"));
 	assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded"));
 	assert_eq!(