From ecf78e731bc240a61237ead3e83a2fc57ad125f3 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 17 Jun 2026 10:01:32 +0800
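Subject: [PATCH 1/2] Add Dreaming competitor retest closeout

Record the XY-955 closeout retest of the Dreaming-readiness stages.

- Add the June 17 competitor-strength retest report as a JSON snapshot
  and a Markdown evidence document.
- Attach post-stage ELF and qmd live-adapter counts to the June 16
  stage ledger and drop its duplicated evidence_files entry.
- Reference the new report from README.md and the benchmarking index.
- Add a snapshot test that checks the report keeps broader superiority
  marked as not proven.
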
---
 README.md                                     |  18 +-
 ...06-16-dreaming-readiness-stage-ledger.json |  22 +-
 ...ing-competitor-strength-retest-report.json | 504 ++++++++++++++++++
 .../tests/real_world_job_benchmark.rs         | 278 +++++++++-
 ...6-06-16-dreaming-readiness-stage-ledger.md |  10 +-
 ...aming-competitor-strength-retest-report.md | 131 +++++
 docs/evidence/benchmarking/index.md           |   1 +
 7 files changed, 954 insertions(+), 10 deletions(-)
 create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json
 create mode 100644 docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md

diff --git a/README.md b/README.md
index 3628775b..aa49d41c 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,15 @@ provider-backed ELF evidence was required.
   does not create a managed-memory parity claim. The new `proactive_brief` fixture
   scores 5 jobs, with 4 pass and 1 blocked private-corpus case; it does not create
   Pulse or hosted managed-memory parity.
+- Dreaming competitor-strength closeout after XY-955: the June 17 retest keeps ELF
+  locally and partially stronger only. The aggregate fixture retest remains 53 pass
+  and 7 typed blockers, the representative graph/RAG slice remains typed non-pass,
+  first-generation OSS fixture coverage remains 4 pass and 2 blocked, and the fresh
+  full live-adapter rerun reports ELF at 40 pass/0 wrong_result versus qmd at 17
+  pass/13 wrong_result while preserving qmd's separate debug-ergonomics edge. This
+  rejects broad superiority claims and leaves qmd debug ergonomics,
+  OpenViking trajectory, Letta core/archive, graph/RAG quality, and XY-930
+  private/provider gates as follow-up work.
 - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit
   Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites
   through `cargo make real-world-memory-live-adapters`. Both keep the original
@@ -275,6 +284,7 @@ Detailed evidence and interpretation:
 - [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
 - [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
 - [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md)
+- [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md)
 - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md)
 - [Single-User Production Runbook](docs/runbook/single_user_production.md)
 - Benchmark contract:
@@ -358,6 +368,7 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
 - [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
 - [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md)
+- [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md)
 - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md)
 - [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md)
 - [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md)
@@ -369,9 +380,10 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md)
 - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md)
 
-Latest real-world benchmark report: June 16, 2026. Latest external research refresh:
-June 11, 2026; June 16 adds live temporal reconciliation, live consolidation
-self-check evidence, and fixture-backed scheduled-memory task scoring.
+Latest real-world benchmark report: June 17, 2026. Latest external research refresh:
+June 11, 2026; June 17 adds the Dreaming competitor-strength closeout retest and
+optimization queue after the June 16 temporal reconciliation, live consolidation
+self-check, proactive-brief, and scheduled-memory scoring evidence.
 
 ## Documentation
 
diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json
index cbd7c1ed..bd5116e6 100644
--- a/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json
+++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json
@@ -534,10 +534,11 @@
         }
       ],
       "evidence_files": [
-        "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
         "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
         "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md",
-        "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md"
+        "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md",
+        "docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md",
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json"
       ],
       "baseline_counts": {
         "pass": 22,
@@ -547,6 +548,23 @@
         "not_encoded": 11
       },
       "baseline_basis": "ELF full live real-world sweep: 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The not_encoded jobs are represented as not_tested for this stage gate while preserving the raw not_encoded count.",
+      "post_stage_counts": {
+        "pass": 40,
+        "wrong_result": 0,
+        "blocked": 7,
+        "not_tested": 0,
+        "not_encoded": 19,
+        "incomplete": 0
+      },
+      "post_stage_basis": "XY-955 closeout retest: aggregate fixture retest passes 53/60 with 7 typed blockers; representative graph/RAG remains 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked; first-generation OSS fixture slice is 4 pass and 2 blocked; ELF live adapter materialization is 40 pass, 0 wrong_result, 7 blocked, and 19 not_encoded; qmd live adapter materialization is 17 pass, 13 wrong_result, 7 blocked, and 29 not_encoded; private/provider gates remain under XY-930.",
+      "qmd_post_stage_counts": {
+        "pass": 17,
+        "wrong_result": 13,
+        "blocked": 7,
+        "not_tested": 0,
+        "not_encoded": 29,
+        "incomplete": 0
+      },
       "comparison_judgment": "unchanged",
       "regression_rule": "Any higher wrong_result/blocked/not_tested count, missing typed blocker, or unsupported broad competitor win claim is a regression.",
       "improvement_rule": "An improvement requires reduced live wrong_result or not_tested counts with no weakened evidence-class boundary and no private/provider claim without inputs.",
diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json
new file mode 100644
index 00000000..13b4ec0d
--- /dev/null
+++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json
@@ -0,0 +1,504 @@
+{
+  "schema": "elf.dreaming_competitor_strength_retest_report/v1",
+  "report_id": "xy-955-dreaming-competitor-strength-retest-2026-06-17",
+  "authority": "XY-955",
+  "created_at": "2026-06-17T00:00:00Z",
+  "purpose": "Close out this pass of the Dreaming-readiness benchmark program by comparing the XY-951 baseline and downstream stage evidence against a fresh local/public retest, without converting blockers or fixture-only evidence into wins.",
+  "source_evidence_cutoff": "2026-06-17",
+  "source_baseline": {
+    "stage_ledger": "apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json",
+    "competitor_strength_adoption_report": "apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json",
+    "competitor_strength_matrix": "apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-897-competitor-strength-matrix.json"
+  },
+  "judgment_terms": [
+    "improved",
+    "regressed",
+    "unchanged",
+    "blocked",
+    "not_tested"
+  ],
+  "status_terms": [
+    "pass",
+    "wrong_result",
+    "blocked",
+    "not_tested",
+    "not_encoded",
+    "incomplete",
+    "typed_non_pass",
+    "non_goal",
+    "product_reference"
+  ],
+  "summary": {
+    "overall_judgment": "locally_and_partially_stronger_only",
+    "broader_superiority": "not_proven",
+    "improved_stage_count": 6,
+    "regressed_stage_count": 0,
+    "unchanged_stage_count": 2,
+    "blocked_stage_count": 0,
+    "not_tested_stage_count": 0,
+    "unsupported_claims_rejected": [
+      "ELF does not broadly beat qmd from this retest.",
+      "ELF does not beat mem0/OpenMemory on UI/export, hosted Platform behavior, or optional graph memory.",
+      "ELF does not beat Letta on core/archive memory until a contained Letta export/readback runner exists.",
+      "ELF does not beat OpenViking on staged trajectory, hierarchy selection, or recursive context expansion.",
+      "ELF does not prove graph/RAG citation or navigation parity from representative typed non-pass fixtures.",
+      "ELF does not prove private-corpus or credentialed provider quality without XY-930 operator inputs."
+    ]
+  },
+  "commands": [
+    {
+      "command": "cargo make real-world-memory",
+      "status": "pass",
+      "artifact": "tmp/real-world-memory/real-world-memory-report.json",
+      "summary": {
+        "job_count": 60,
+        "pass": 53,
+        "wrong_result": 0,
+        "blocked": 7,
+        "not_encoded": 0,
+        "mean_score": 0.883,
+        "evidence_coverage": 1.0,
+        "source_ref_coverage": 1.0,
+        "quote_coverage": 1.0
+      }
+    },
+    {
+      "command": "cargo make real-world-memory-graph-rag",
+      "status": "pass",
+      "artifact": "tmp/real-world-memory/graph-rag/report.json",
+      "summary": {
+        "job_count": 5,
+        "pass": 0,
+        "wrong_result": 1,
+        "incomplete": 1,
+        "blocked": 3,
+        "not_encoded": 0,
+        "evidence_coverage": 0.25
+      }
+    },
+    {
+      "command": "cargo make real-world-first-generation-oss",
+      "status": "pass",
+      "artifact": "tmp/real-world-memory/first-generation-oss/report.json",
+      "summary": {
+        "job_count": 6,
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 2,
+        "not_encoded": 0,
+        "evidence_coverage": 1.0
+      }
+    },
+    {
+      "command": "cargo make real-world-memory-live-adapters",
+      "status": "pass",
+      "artifact": "tmp/real-world-memory/live-adapters/summary.json",
+      "partial_summary": {
+        "elf_live_real_world": {
+          "job_count": 66,
+          "pass": 40,
+          "wrong_result": 0,
+          "blocked": 7,
+          "not_encoded": 19,
+          "incomplete": 0,
+          "mean_score": 0.606,
+          "evidence_coverage": 0.571,
+          "memory_evolution_status": "pass",
+          "consolidation_status": "pass",
+          "knowledge_compilation_status": "pass",
+          "operator_debugging_ux_status": "pass",
+          "capture_integration_status": "pass",
+          "proactive_brief_status": "blocked",
+          "scheduled_memory_status": "blocked",
+          "production_ops_status": "blocked",
+          "context_trajectory_status": "blocked"
+        },
+        "qmd_live_real_world": {
+          "job_count": 66,
+          "pass": 17,
+          "wrong_result": 13,
+          "blocked": 7,
+          "not_encoded": 29,
+          "incomplete": 0,
+          "mean_score": 0.352,
+          "evidence_coverage": 0.379,
+          "memory_evolution_status": "wrong_result",
+          "retrieval_status": "pass",
+          "work_resume_status": "pass",
+          "project_decisions_status": "pass",
+          "operator_debugging_ux_status": "wrong_result",
+          "production_ops_status": "blocked",
+          "context_trajectory_status": "blocked"
+        }
+      }
+    }
+  ],
+  "stage_closeout": [
+    {
+      "stage_id": "current_vs_historical_correctness",
+      "baseline_counts": {
+        "pass": 1,
+        "wrong_result": 5,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "current_counts": {
+        "pass": 6,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json",
+        "tmp/real-world-memory/live-adapters/elf-report.json"
+      ],
+      "boundary": "Improved for the encoded ELF live memory_evolution slice; not a Graphiti/Zep, mem0/OpenMemory, Letta, private-corpus, or broad qmd superiority claim."
+    },
+    {
+      "stage_id": "preference_evolution",
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 1,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "current_counts": {
+        "pass": 1,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json",
+        "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md"
+      ],
+      "boundary": "ELF's encoded current-vs-historical preference case improved; mem0/OpenMemory local OSS history remains a measured strength and UI/export remains separately blocked or non-goal."
+    },
+    {
+      "stage_id": "deletion_ttl_tombstone_behavior",
+      "baseline_counts": {
+        "pass": 1,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "current_counts": {
+        "pass": 1,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "unchanged",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json"
+      ],
+      "boundary": "The single encoded tombstone/TTL job remains passing; broader update/delete/recreate history remains follow-up work."
+    },
+    {
+      "stage_id": "reviewable_consolidation",
+      "baseline_counts": {
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "current_counts": {
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-consolidation-proposal-scoring-report.json",
+        "tmp/real-world-memory/live-adapters/elf-report.json"
+      ],
+      "boundary": "ELF has live service-backed self-check evidence; direct competitor consolidation runners remain untested or product-reference only."
+    },
+    {
+      "stage_id": "memory_summary_top_of_mind_behavior",
+      "baseline_counts": {
+        "pass": 8,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "current_counts": {
+        "pass": 9,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "tmp/real-world-memory/real-world-memory-report.json",
+        "docs/spec/system_memory_summary_v1.md"
+      ],
+      "boundary": "Improved as fixture-backed source-trace contract evidence only; service-native top-of-mind behavior remains future work."
+    },
+    {
+      "stage_id": "proactive_brief_readiness",
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 0,
+        "blocked": 0,
+        "not_tested": 1,
+        "not_encoded": 1
+      },
+      "current_counts": {
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 1,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-proactive-brief-scoring-report.json",
+        "tmp/real-world-memory/real-world-memory-report.json"
+      ],
+      "boundary": "Improved as fixture-backed proactive brief scoring only; private-corpus refresh stays blocked under XY-930 and Pulse parity is not proven."
+    },
+    {
+      "stage_id": "scheduled_memory_task_readiness",
+      "baseline_counts": {
+        "pass": 0,
+        "wrong_result": 0,
+        "blocked": 1,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "current_counts": {
+        "pass": 4,
+        "wrong_result": 0,
+        "blocked": 1,
+        "not_tested": 0,
+        "not_encoded": 0
+      },
+      "judgment": "improved",
+      "evidence": [
+        "apps/elf-eval/fixtures/report_snapshots/2026-06-16-scheduled-memory-task-scoring-report.json",
+        "tmp/real-world-memory/real-world-memory-report.json"
+      ],
+      "boundary": "Improved as fixture-backed scheduled task readback only; hosted scheduler, notification, provider-backed private-corpus, and silent-mutation parity are not proven."
+    },
+    {
+      "stage_id": "final_competitor_retest_status",
+      "baseline_counts": {
+        "pass": 22,
+        "wrong_result": 5,
+        "blocked": 2,
+        "not_tested": 11,
+        "not_encoded": 11
+      },
+      "current_counts": {
+        "pass": 40,
+        "wrong_result": 0,
+        "blocked": 7,
+        "not_tested": 0,
+        "not_encoded": 19,
+        "incomplete": 0
+      },
+      "judgment": "unchanged",
+      "evidence": [
+        "tmp/real-world-memory/live-adapters/elf-report.json",
+        "tmp/real-world-memory/live-adapters/qmd-report.json",
+        "tmp/real-world-memory/live-adapters/summary.json",
+        "tmp/real-world-memory/graph-rag/report.json",
+        "tmp/real-world-memory/first-generation-oss/report.json"
+      ],
+      "qmd_current_counts": {
+        "pass": 17,
+        "wrong_result": 13,
+        "blocked": 7,
+        "not_tested": 0,
+        "not_encoded": 29,
+        "incomplete": 0
+      },
+      "boundary": "ELF live wrong_result count improved in the fresh local retest and qmd now has a fresh scored live report, but qmd debug ergonomics remains a measured ELF loss, Graph/RAG remains typed non-pass, OpenViking/Letta/private/provider gates remain blocked or not tested, and broader superiority is not proven."
+    }
+  ],
+  "scenario_retests": [
+    {
+      "scenario_id": "qmd_debug_ergonomics",
+      "baseline_outcome": "loss",
+      "current_outcome": "unchanged",
+      "current_status": "pass",
+      "evidence_class": "live_baseline_only",
+      "evidence": "Fresh qmd live materialization produced a scored full-suite report with 17 pass, 13 wrong_result, 7 blocked, and 29 not_encoded jobs. That full-suite typed non-pass result does not retest or erase the prior qmd top-10/replay ergonomics advantage, which remains the authoritative debug-ergonomics evidence.",
+      "follow_up": "XY-923"
+    },
+    {
+      "scenario_id": "mem0_openmemory_preference_history_export",
+      "baseline_outcome": "loss_for_preference_history_tie_for_scoped_personalization_blocked_for_ui_export",
+      "current_outcome": "unchanged",
+      "current_status": "blocked",
+      "evidence_class": "live_baseline_only",
+      "evidence": "ELF temporal preference case improved, but mem0/OpenMemory local OSS history/export-style readback evidence remains stronger for history and OpenMemory UI/export remains setup-blocked.",
+      "follow_up": "XY-930 for private/provider gates; dedicated OpenMemory UI/export runner remains a measured blocker."
+    },
+    {
+      "scenario_id": "letta_core_archive",
+      "baseline_outcome": "blocked",
+      "current_outcome": "unchanged",
+      "current_status": "blocked",
+      "evidence_class": "fixture_backed",
+      "evidence": "ELF core_archival_memory fixture passes locally, but no contained Letta export/readback artifact maps core and archival source ids.",
+      "follow_up": "Decodex-ready issue brief: build a contained Letta core/archive export-readback adapter before win/tie/loss claims."
+    },
+    {
+      "scenario_id": "graphiti_zep_temporal_graph_validity",
+      "baseline_outcome": "blocked",
+      "current_outcome": "unchanged",
+      "current_status": "blocked",
+      "evidence_class": "research_gate",
+      "evidence": "Graphiti/Zep remains blocked by provider/graph-store setup in the representative Graph/RAG slice; ELF temporal reconciliation improvement does not prove graph temporal validity parity.",
+      "follow_up": "Graph/RAG adapter follow-up; provider inputs remain explicit."
+    },
+    {
+      "scenario_id": "openviking_trajectory_hierarchy",
+      "baseline_outcome": "blocked",
+      "current_outcome": "unchanged",
+      "current_status": "blocked",
+      "evidence_class": "fixture_backed",
+      "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive expansion remain encoded blocked fixtures behind missing evidence-bearing staged artifacts.",
+      "follow_up": "XY-928"
+    },
+    {
+      "scenario_id": "graph_rag_citation_navigation_knowledge_surfaces",
+      "baseline_outcome": "not_tested",
+      "current_outcome": "unchanged",
+      "current_status": "typed_non_pass",
+      "evidence_class": "fixture_backed",
+      "evidence": "The representative graph/RAG retest has 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked jobs; this is not graph/RAG quality parity.",
+      "follow_up": "XY-929"
+    },
+    {
+      "scenario_id": "private_provider_production_gates",
+      "baseline_outcome": "blocked",
+      "current_outcome": "unchanged",
+      "current_status": "blocked",
+      "evidence_class": "blocked",
+      "evidence": "No operator-owned private manifest or explicit credentialed provider setup was supplied in this lane.",
+      "follow_up": "XY-930"
+    }
+  ],
+  "optimization_queue": [
+    {
+      "priority": "P0",
+      "issue": "XY-923",
+      "status": "existing",
+      "brief": "Re-run qmd trace/replay diagnostics with comparable immediate top-k/replay, expansion, fusion, rerank, and candidate-drop artifacts; preserve the qmd debug ergonomics loss unless ELF produces comparable artifacts."
+    },
+    {
+      "priority": "P1",
+      "issue": "XY-930",
+      "status": "existing",
+      "brief": "Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist; otherwise keep typed blockers."
+    },
+    {
+      "priority": "P1",
+      "issue": "XY-928",
+      "status": "existing",
+      "brief": "Materialize OpenViking staged trajectory, hierarchy selection, and recursive expansion evidence before claiming ELF ties or beats those strengths."
+    },
+    {
+      "priority": "P1",
+      "issue": "letta-core-archive-adapter-brief",
+      "status": "proposed",
+      "brief": "Add a contained Letta core/archive export-readback adapter that emits source ids for core blocks and archival memories; non-goals are ELF product changes and broad Letta claims."
+    },
+    {
+      "priority": "P2",
+      "issue": "XY-929",
+      "status": "existing",
+      "brief": "Promote Graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs; keep blocked, incomplete, wrong_result, and not_tested states typed."
+    },
+    {
+      "priority": "P2",
+      "issue": "service-native-dreaming-outputs-brief",
+      "status": "proposed",
+      "brief": "Move fixture-backed memory summary, proactive brief, and scheduled task contracts into service-native readback/materialization with the same source-ref, freshness, rationale, trace, and no-source-mutation gates."
+    }
+  ],
+  "claim_boundaries": {
+    "allowed": [
+      "ELF is locally and partially stronger after the Dreaming stages on encoded temporal reconciliation, reviewable consolidation self-checks, fixture-backed memory summary, proactive brief, and scheduled-memory task scoring.",
+      "The public/local aggregate fixture retest remains 53 pass, 0 wrong_result, and 7 typed blocked jobs across 60 jobs.",
+      "The representative graph/RAG slice remains typed non-pass.",
+      "Private/provider gates remain blocked under XY-930."
+    ],
+    "not_allowed": [
+      "Do not claim broad ELF-over-qmd superiority.",
+      "Do not claim ELF beats managed Dreaming, Pulse, ChatGPT Tasks, mem0/OpenMemory, Letta, OpenViking, Graphiti/Zep, or graph/RAG systems from fixture-only, partial live, blocked, or smoke-only evidence.",
+      "Do not collapse scenario-level outcomes into a leaderboard.",
+      "Do not treat qmd full-suite wrong_result counts as a regression of qmd debug ergonomics."
+    ]
+  },
+  "follow_up_issue_briefs": {
+    "existing": [
+      {
+        "issue": "XY-923",
+        "title": "qmd trace/replay diagnostics and debug ergonomics comparison",
+        "reason": "qmd remains the measured local retrieval-debug ergonomics loss; the fresh qmd full-suite live report does not provide comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop debug artifacts.",
+        "scope": "Re-run qmd trace/replay diagnostics with comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop artifacts.",
+        "non_goal": "Do not reinterpret qmd full-suite wrong_result counts as a regression of qmd debug ergonomics.",
+        "validation": "A scored qmd/ELF debug ergonomics artifact that preserves pass, wrong_result, blocked, and not_encoded states."
+      },
+      {
+        "issue": "XY-930",
+        "title": "private-corpus and credentialed provider gates",
+        "reason": "operator-owned private manifest and explicit provider setup remain absent.",
+        "scope": "Run private-corpus and credentialed provider gates only when operator inputs exist; otherwise publish typed blockers.",
+        "non_goal": "Do not infer credentials or promote synthetic/provider smoke evidence into private-corpus pass evidence.",
+        "validation": "A public-safe report that states whether the private/provider caveats are removed or still blocked."
+      },
+      {
+        "issue": "XY-928",
+        "title": "OpenViking trajectory and hierarchy evidence",
+        "reason": "OpenViking staged retrieval, hierarchy selection, and recursive expansion remain blocked by missing evidence-bearing staged artifacts.",
+        "scope": "Materialize same-corpus evidence ids and staged trajectory outputs before scoring hierarchy/recursive retrieval.",
+        "non_goal": "Do not claim ELF ties or beats OpenViking from fixture-only blocked rows.",
+        "validation": "Scored context-trajectory reports with typed pass, wrong_result, blocked, or incomplete outcomes."
+      },
+      {
+        "issue": "XY-929",
+        "title": "Graph/RAG citation and navigation adapter promotion",
+        "reason": "The representative graph/RAG retest remains 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked.",
+        "scope": "Promote graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs.",
+        "non_goal": "Do not convert research gates, tiny smokes, blocked setup, or graphify wrong_result into graph/RAG parity evidence.",
+        "validation": "Representative graph/RAG reports that keep blocked, incomplete, wrong_result, not_tested, and non_goal states typed."
+      }
+    ],
+    "proposed": [
+      {
+        "issue": "letta-core-archive-adapter-brief",
+        "title": "contained Letta core/archive export-readback adapter",
+        "reason": "ELF has fixture-backed core/archival memory evidence, but no contained Letta artifact maps core blocks, archival readback, and source ids.",
+        "scope": "Create a Docker-contained Letta export/readback adapter over benchmark-owned data and score only mapped core/archive evidence.",
+        "non_goal": "Do not change ELF product behavior or make broad Letta win/tie/loss claims before comparable evidence exists.",
+        "validation": "A scored artifact containing Letta core block JSON, archival search/readback JSON, source ids, and typed outcome states."
+      },
+      {
+        "issue": "service-native-dreaming-outputs-brief",
+        "title": "service-native memory summary, proactive brief, and scheduled task materialization",
+        "reason": "The Dreaming output improvements are currently fixture-backed contracts, not service-native generated/readback behavior.",
+        "scope": "Move memory summary, proactive brief, and scheduled task outputs into service-native materialization with source refs, freshness, rationale, trace, and no-source-mutation gates.",
+        "non_goal": "Do not build a polished hosted scheduler, Pulse clone, notification product, or private/provider path in this follow-up.",
+        "validation": "Service-native scored reports that preserve fixture boundaries and fail stale, tombstoned, unsupported, or untraced current claims."
+      }
+    ]
+  }
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index 532add8b..e6aab322 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -206,6 +206,18 @@ fn dreaming_readiness_stage_ledger_markdown_path() -> Result<PathBuf> {
 		.join("2026-06-16-dreaming-readiness-stage-ledger.md"))
 }
 
+fn dreaming_competitor_strength_retest_report_json_path() -> Result<PathBuf> {
+	report_snapshot_path("2026-06-17-dreaming-competitor-strength-retest-report.json")
+}
+
+fn dreaming_competitor_strength_retest_report_markdown_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("evidence")
+		.join("benchmarking")
+		.join("2026-06-17-dreaming-competitor-strength-retest-report.md"))
+}
+
 fn live_temporal_reconciliation_report_json_path() -> Result<PathBuf> {
 	report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json")
 }
@@ -2817,6 +2829,241 @@ fn live_temporal_reconciliation_report_records_xy905_before_after() -> Result<()
 	Ok(())
 }
 
+#[test]
+fn dreaming_competitor_strength_retest_report_closes_xy955_without_overclaims() -> Result<()> {
+	let report = serde_json::from_str::<Value>(&fs::read_to_string(
+		dreaming_competitor_strength_retest_report_json_path()?,
+	)?)?;
+	let markdown = fs::read_to_string(dreaming_competitor_strength_retest_report_markdown_path()?)?;
+	let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?;
+	let readme = fs::read_to_string(readme_path()?)?;
+
+	assert_eq!(
+		report.pointer("/schema").and_then(Value::as_str),
+		Some("elf.dreaming_competitor_strength_retest_report/v1")
+	);
+	assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-955"));
+	assert_eq!(
+		report.pointer("/summary/overall_judgment").and_then(Value::as_str),
+		Some("locally_and_partially_stronger_only")
+	);
+	assert_eq!(
+		report.pointer("/summary/broader_superiority").and_then(Value::as_str),
+		Some("not_proven")
+	);
+	assert_eq!(report.pointer("/summary/regressed_stage_count").and_then(Value::as_u64), Some(0));
+	assert!(array_contains_str(&report, "/status_terms", "typed_non_pass")?);
+	assert!(array_contains_str(
+		&report,
+		"/summary/unsupported_claims_rejected",
+		"ELF does not broadly beat qmd from this retest."
+	)?);
+
+	assert_xy955_commands(&report)?;
+	assert_xy955_stage_closeout(&report)?;
+	assert_xy955_scenario_retests(&report)?;
+	assert_xy955_optimization_queue(&report)?;
+	assert_xy955_follow_up_issue_briefs(&report)?;
+
+	assert!(markdown.contains("ELF is locally and partially stronger"));
+	assert!(
+		markdown.contains("The full live-adapter command now has fresh ELF and qmd scored reports")
+	);
+	assert!(
+		markdown.contains(
+			"Do not treat qmd full-suite wrong_result counts as a regression of qmd debug"
+		)
+	);
+	assert!(markdown.contains("## Follow-Up Issue Briefs"));
+	assert!(markdown.contains(
+		"| GraphRAG/LightRAG/RAGFlow/llm-wiki/gbrain/graphify citation/navigation/knowledge surfaces |"
+	));
+	assert!(
+		benchmarking_index.contains("2026-06-17-dreaming-competitor-strength-retest-report.md")
+	);
+	assert!(readme.contains("Dreaming Competitor-Strength Retest Report - June 17, 2026"));
+	assert!(readme.contains("Latest real-world benchmark report: June 17, 2026"));
+
+	Ok(())
+}
+
+fn assert_xy955_commands(report: &Value) -> Result<()> {
+	let commands = array_at(report, "/commands")?;
+	let aggregate = find_by_field(commands, "/command", "cargo make real-world-memory")?;
+	let graph_rag = find_by_field(commands, "/command", "cargo make real-world-memory-graph-rag")?;
+	let first_generation =
+		find_by_field(commands, "/command", "cargo make real-world-first-generation-oss")?;
+	let live = find_by_field(commands, "/command", "cargo make real-world-memory-live-adapters")?;
+
+	assert_eq!(aggregate.pointer("/status").and_then(Value::as_str), Some("pass"));
+	assert_eq!(aggregate.pointer("/summary/pass").and_then(Value::as_u64), Some(53));
+	assert_eq!(aggregate.pointer("/summary/blocked").and_then(Value::as_u64), Some(7));
+	assert_eq!(graph_rag.pointer("/status").and_then(Value::as_str), Some("pass"));
+	assert_eq!(graph_rag.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1));
+	assert_eq!(graph_rag.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1));
+	assert_eq!(graph_rag.pointer("/summary/blocked").and_then(Value::as_u64), Some(3));
+	assert_eq!(first_generation.pointer("/summary/pass").and_then(Value::as_u64), Some(4));
+	assert_eq!(first_generation.pointer("/summary/blocked").and_then(Value::as_u64), Some(2));
+	assert_eq!(live.pointer("/status").and_then(Value::as_str), Some("pass"));
+	assert_eq!(
+		live.pointer("/partial_summary/elf_live_real_world/pass").and_then(Value::as_u64),
+		Some(40)
+	);
+	assert_eq!(
+		live.pointer("/partial_summary/elf_live_real_world/wrong_result").and_then(Value::as_u64),
+		Some(0)
+	);
+	assert_eq!(
+		live.pointer("/partial_summary/qmd_live_real_world/pass").and_then(Value::as_u64),
+		Some(17)
+	);
+	assert_eq!(
+		live.pointer("/partial_summary/qmd_live_real_world/wrong_result").and_then(Value::as_u64),
+		Some(13)
+	);
+
+	Ok(())
+}
+
+fn assert_xy955_stage_closeout(report: &Value) -> Result<()> {
+	let stages = array_at(report, "/stage_closeout")?;
+
+	assert_eq!(stages.len(), 8);
+
+	let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?;
+	let proactive = find_by_field(stages, "/stage_id", "proactive_brief_readiness")?;
+	let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?;
+	let final_retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?;
+
+	assert_eq!(current.pointer("/judgment").and_then(Value::as_str), Some("improved"));
+	assert_eq!(current.pointer("/current_counts/pass").and_then(Value::as_u64), Some(6));
+	assert_eq!(current.pointer("/current_counts/wrong_result").and_then(Value::as_u64), Some(0));
+	assert_eq!(proactive.pointer("/judgment").and_then(Value::as_str), Some("improved"));
+	assert_eq!(proactive.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(1));
+	assert_eq!(scheduled.pointer("/current_counts/pass").and_then(Value::as_u64), Some(4));
+	assert_eq!(scheduled.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(1));
+	assert_eq!(final_retest.pointer("/judgment").and_then(Value::as_str), Some("unchanged"));
+	assert_eq!(final_retest.pointer("/current_counts/pass").and_then(Value::as_u64), Some(40));
+	assert_eq!(
+		final_retest.pointer("/current_counts/wrong_result").and_then(Value::as_u64),
+		Some(0)
+	);
+	assert_eq!(final_retest.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(7));
+	assert_eq!(
+		final_retest.pointer("/current_counts/not_encoded").and_then(Value::as_u64),
+		Some(19)
+	);
+	assert!(final_retest.pointer("/boundary").and_then(Value::as_str).is_some_and(|boundary| {
+		boundary.contains("qmd now has a fresh scored live report")
+			&& boundary.contains("broader superiority is not proven")
+	}));
+	assert_eq!(final_retest.pointer("/qmd_current_counts/pass").and_then(Value::as_u64), Some(17));
+	assert_eq!(
+		final_retest.pointer("/qmd_current_counts/wrong_result").and_then(Value::as_u64),
+		Some(13)
+	);
+
+	Ok(())
+}
+
+fn assert_xy955_scenario_retests(report: &Value) -> Result<()> {
+	let scenarios = array_at(report, "/scenario_retests")?;
+	let qmd = find_by_field(scenarios, "/scenario_id", "qmd_debug_ergonomics")?;
+	let mem0 =
+		find_by_field(scenarios, "/scenario_id", "mem0_openmemory_preference_history_export")?;
+	let letta = find_by_field(scenarios, "/scenario_id", "letta_core_archive")?;
+	let graph_rag = find_by_field(
+		scenarios,
+		"/scenario_id",
+		"graph_rag_citation_navigation_knowledge_surfaces",
+	)?;
+	let private_provider =
+		find_by_field(scenarios, "/scenario_id", "private_provider_production_gates")?;
+
+	assert_eq!(qmd.pointer("/current_outcome").and_then(Value::as_str), Some("unchanged"));
+	assert_eq!(qmd.pointer("/current_status").and_then(Value::as_str), Some("pass"));
+	assert!(qmd.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| {
+		evidence.contains("17 pass")
+			&& evidence.contains("13 wrong_result")
+			&& evidence.contains("does not retest or erase")
+	}));
+	assert_eq!(mem0.pointer("/current_outcome").and_then(Value::as_str), Some("unchanged"));
+	assert!(mem0.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| {
+		evidence.contains("mem0/OpenMemory local OSS history")
+			&& evidence.contains("OpenMemory UI/export remains setup-blocked")
+	}));
+	assert_eq!(letta.pointer("/current_status").and_then(Value::as_str), Some("blocked"));
+	assert_eq!(
+		graph_rag.pointer("/current_status").and_then(Value::as_str),
+		Some("typed_non_pass")
+	);
+	assert!(graph_rag.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| {
+		evidence.contains("0 pass")
+			&& evidence.contains("1 wrong_result")
+			&& evidence.contains("3 blocked")
+	}));
+	assert_eq!(private_provider.pointer("/follow_up").and_then(Value::as_str), Some("XY-930"));
+
+	Ok(())
+}
+
+fn assert_xy955_optimization_queue(report: &Value) -> Result<()> {
+	let queue = array_at(report, "/optimization_queue")?;
+	let qmd = find_by_field(queue, "/issue", "XY-923")?;
+	let private_provider = find_by_field(queue, "/issue", "XY-930")?;
+	let openviking = find_by_field(queue, "/issue", "XY-928")?;
+	let letta = find_by_field(queue, "/issue", "letta-core-archive-adapter-brief")?;
+	let service_native = find_by_field(queue, "/issue", "service-native-dreaming-outputs-brief")?;
+
+	assert_eq!(qmd.pointer("/status").and_then(Value::as_str), Some("existing"));
+	assert_eq!(private_provider.pointer("/status").and_then(Value::as_str), Some("existing"));
+	assert_eq!(openviking.pointer("/status").and_then(Value::as_str), Some("existing"));
+	assert_eq!(letta.pointer("/status").and_then(Value::as_str), Some("proposed"));
+	assert_eq!(service_native.pointer("/status").and_then(Value::as_str), Some("proposed"));
+	assert!(array_contains_str(
+		report,
+		"/claim_boundaries/not_allowed",
+		"Do not treat qmd full-suite wrong_result counts as a regression of qmd debug ergonomics."
+	)?);
+
+	Ok(())
+}
+
+fn assert_xy955_follow_up_issue_briefs(report: &Value) -> Result<()> {
+	let existing = array_at(report, "/follow_up_issue_briefs/existing")?;
+	let proposed = array_at(report, "/follow_up_issue_briefs/proposed")?;
+	let qmd = find_by_field(existing, "/issue", "XY-923")?;
+	let private_provider = find_by_field(existing, "/issue", "XY-930")?;
+	let letta = find_by_field(proposed, "/issue", "letta-core-archive-adapter-brief")?;
+	let service_native =
+		find_by_field(proposed, "/issue", "service-native-dreaming-outputs-brief")?;
+
+	assert!(qmd.pointer("/scope").and_then(Value::as_str).is_some_and(|scope| {
+		scope.contains("immediate top-k") && scope.contains("candidate-drop artifacts")
+	}));
+	assert!(qmd.pointer("/non_goal").and_then(Value::as_str).is_some_and(|non_goal| {
+		non_goal.contains("qmd full-suite wrong_result counts")
+			&& non_goal.contains("debug ergonomics")
+	}));
+	assert!(
+		private_provider
+			.pointer("/non_goal")
+			.and_then(Value::as_str)
+			.is_some_and(|non_goal| non_goal.contains("Do not infer credentials"))
+	);
+	assert!(letta.pointer("/validation").and_then(Value::as_str).is_some_and(|validation| {
+		validation.contains("Letta core block JSON") && validation.contains("typed outcome states")
+	}));
+	assert!(
+		service_native
+			.pointer("/non_goal")
+			.and_then(Value::as_str)
+			.is_some_and(|non_goal| non_goal.contains("Pulse clone"))
+	);
+
+	Ok(())
+}
+
 #[test]
 fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> {
 	let report = serde_json::from_str::<Value>(&fs::read_to_string(
@@ -4139,6 +4386,14 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 		Some(0)
 	);
 
+	assert_dreaming_final_competitor_retest_stage(ledger, stages)?;
+	assert_dreaming_memory_summary_stage(stages)?;
+	assert_dreaming_proactive_brief_stage(stages)?;
+
+	Ok(())
+}
+
+fn assert_dreaming_final_competitor_retest_stage(ledger: &Value, stages: &[Value]) -> Result<()> {
 	let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?;
 
 	assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22));
@@ -4146,6 +4401,24 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 	assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2));
 	assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11));
 	assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11));
+	assert_eq!(retest.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(40));
+	assert_eq!(retest.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0));
+	assert_eq!(retest.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(7));
+	assert_eq!(retest.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), Some(19));
+	assert_eq!(retest.pointer("/qmd_post_stage_counts/pass").and_then(Value::as_u64), Some(17));
+	assert_eq!(
+		retest.pointer("/qmd_post_stage_counts/wrong_result").and_then(Value::as_u64),
+		Some(13)
+	);
+	assert!(retest.pointer("/post_stage_basis").and_then(Value::as_str).is_some_and(|basis| {
+		basis.contains("XY-955 closeout retest")
+			&& basis.contains("qmd live adapter materialization is 17 pass")
+	}));
+
+	assert_dreaming_readiness_summary_buckets(ledger)
+}
+
+fn assert_dreaming_readiness_summary_buckets(ledger: &Value) -> Result<()> {
 	assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?);
 	assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?);
 	assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?);
@@ -4162,9 +4435,6 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -
 	assert!(array_at(ledger, "/summary/blocked")?.is_empty());
 	assert!(array_at(ledger, "/summary/not_tested")?.is_empty());
 
-	assert_dreaming_memory_summary_stage(stages)?;
-	assert_dreaming_proactive_brief_stage(stages)?;
-
 	Ok(())
 }
 
@@ -4253,7 +4523,9 @@ fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) {
 	assert!(markdown.contains("`regressed`: none"));
 	assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs"));
 	assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`"));
+	assert!(markdown.contains("XY-955 closes the final competitor retest row"));
 	assert!(markdown.contains("XY-905"));
+	assert!(markdown.contains("qmd live `pass=17`, `wrong_result=13`"));
 	assert!(
 		markdown
 			.contains("Do not claim this ledger proves preference history against mem0/OpenMemory")
diff --git a/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
index e6e0e379..413332d5 100644
--- a/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
+++ b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md
@@ -44,6 +44,12 @@ Current stage status:
 - `blocked`: none.
 - `not_tested`: none.
 
+XY-955 closes the final competitor retest row for this program pass. The closeout
+keeps the final competitor judgment `unchanged`: the fresh public/local retest
+confirms ELF's encoded Dreaming-stage improvements, produces a fresh qmd full-suite
+live report with typed non-pass states, keeps graph/RAG typed non-pass, and leaves
+private/provider gates tied to XY-930.
+
 The known live `memory_evolution` loss is now repaired for the encoded ELF live
 adapter slice: the XY-905 run passes all six memory-evolution jobs and reports
 current, historical, rationale, tombstone, invalidation, selected, dropped, and
@@ -101,7 +107,7 @@ provider-backed private-corpus quality, or silent source mutation safety.
 | Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. |
 | Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | `cargo make real-world-memory-proactive-brief`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/rationale coverage `1.000`; invalid-current and tombstone violations `0` | `improved` | Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind owned lanes and operator inputs. |
 | Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-scheduled`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/action/trace coverage `1.000`; invalid-current, unsupported-current, tombstone, and source-mutation violations `0` | `improved` | Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs. |
-| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. |
+| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | XY-955 closeout: aggregate fixture retest `pass=53`, `blocked=7`; graph/RAG `wrong_result=1`, `incomplete=1`, `blocked=3`; first-generation OSS `pass=4`, `blocked=2`; ELF live `pass=40`, `wrong_result=0`, `blocked=7`, `not_encoded=19`; qmd live `pass=17`, `wrong_result=13`, `blocked=7`, `not_encoded=29` | `unchanged` | Convert only measured losses and typed blockers into follow-up issues; the ELF live improvement and qmd full-suite non-pass do not replace qmd debug ergonomics, private/provider, OpenViking, Letta, or graph/RAG gates. |
 
 ## Evidence Anchors
 
@@ -114,7 +120,7 @@ provider-backed private-corpus quality, or silent source mutation safety.
 | Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` |
 | Proactive brief readiness | `docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/decisions/2026-06-08-agent-memory-selection.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` |
 | Scheduled memory task readiness | `docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/decisions/2026-06-08-agent-memory-selection.md` |
-| Final competitor retest status | `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` |
+| Final competitor retest status | `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md`; `docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md`; `apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json` |
 
 ## Report Shape For Downstream Issues
 
diff --git a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
new file mode 100644
index 00000000..62396f6d
--- /dev/null
+++ b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
@@ -0,0 +1,131 @@
+# Dreaming Competitor-Strength Retest Report - June 17, 2026
+
+Goal: Close out the XY-955 Dreaming-readiness benchmark program pass with a
+baseline-vs-current competitor-strength retest and optimization queue.
+Read this when: You need the final stage-ledger closeout after XY-905, XY-934,
+XY-952, XY-953, and XY-954, or need to know which remaining losses and blockers are
+ready for follow-up issue work.
+Inputs:
+`apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json`,
+`apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json`,
+`apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json`,
+the June 16 stage reports, and the fresh `tmp/real-world-memory/` retest outputs.
+Outputs: Scenario-level improved/regressed/unchanged/blocked/not-tested judgments,
+claim boundaries, and the next optimization queue.
+
+## Executive Judgment
+
+ELF is locally and partially stronger after the Dreaming-readiness stages. It is not
+broadly superior to the tracked competitors.
+
+The public/local retest supports these narrow improvements:
+
+- Live ELF `memory_evolution` moved from `pass=1`, `wrong_result=5` in the XY-951
+  baseline to `pass=6`, `wrong_result=0` in the XY-905 report and the fresh partial
+  ELF live adapter output.
+- Live ELF consolidation self-checks now pass for service-backed proposal
+  materialization, source lineage, confidence/usefulness, unsupported-claim flags,
+  review actions, and zero source mutations.
+- Fixture-backed memory summary, proactive brief, and scheduled-memory task scoring
+  are now encoded and passing except for their explicit private/provider blockers.
+
+The broader competitor-strength outcome is unchanged:
+
+- qmd debug ergonomics remain a measured ELF loss from the existing trace/replay
+  report. The fresh qmd full-suite live report is typed non-pass, but that does not
+  retest or erase qmd's top-k/replay artifact advantage.
+- mem0/OpenMemory preference-history and export-style local OSS readback remain
+  separate measured strengths; OpenMemory UI/export and hosted Platform behavior are
+  not proven by this retest.
+- Letta core/archive, OpenViking trajectory/hierarchy, Graphiti/Zep temporal graph,
+  and broad graph/RAG citation/navigation quality remain blocked, incomplete,
+  wrong-result, or not-tested.
+- Private-corpus and credentialed provider gates remain tied to XY-930.
+
+No scenario regressed in the checked-in local/public retest evidence. Follow-up
+issues are warranted only for the measured losses and typed blockers listed below.
+
+## Commands
+
+| Command | Status | Artifact | Result |
+| --- | --- | --- | --- |
+| `cargo make real-world-memory` | `pass` | `tmp/real-world-memory/real-world-memory-report.json` | 60 jobs, 53 pass, 0 wrong_result, 7 blocked, evidence/source-ref/quote coverage 1.000. |
+| `cargo make real-world-memory-graph-rag` | `pass` | `tmp/real-world-memory/graph-rag/report.json` | 5 jobs, 0 pass, 1 wrong_result, 1 incomplete, 3 blocked. This is typed non-pass graph/RAG evidence. |
+| `cargo make real-world-first-generation-oss` | `pass` | `tmp/real-world-memory/first-generation-oss/report.json` | 6 jobs, 4 pass, 2 blocked, evidence coverage 1.000. |
+| `cargo make real-world-memory-live-adapters` | `pass` | `tmp/real-world-memory/live-adapters/summary.json` | ELF live: 66 jobs, 40 pass, 0 wrong_result, 7 blocked, 19 not_encoded. qmd live: 66 jobs, 17 pass, 13 wrong_result, 7 blocked, 29 not_encoded. |
+
+The full live-adapter command now has fresh ELF and qmd scored reports. The qmd
+full-suite non-pass result is not a regression of qmd debug ergonomics and is not a
+broad ELF-over-qmd win.
+
+## Stage Closeout
+
+| Stage | Baseline | Current | Judgment | Boundary |
+| --- | --- | --- | --- | --- |
+| Current-vs-historical correctness | `pass=1`, `wrong_result=5` | `pass=6`, `wrong_result=0` | `improved` | Encoded ELF live `memory_evolution` only; no Graphiti/Zep, mem0/OpenMemory, Letta, private-corpus, or broad qmd claim. |
+| Preference evolution | `wrong_result=1` | `pass=1`, `wrong_result=0` | `improved` | ELF current-vs-historical preference case improved; mem0/OpenMemory history remains separately stronger on the local OSS history surface. |
+| Deletion, TTL, and tombstones | `pass=1` | `pass=1` | `unchanged` | Single encoded tombstone/TTL job remains passing; broader update/delete/recreate history is still follow-up work. |
+| Reviewable consolidation | `pass=4`, `not_tested=1`, `not_encoded=1` | `pass=4`, `not_tested=0`, `not_encoded=0` | `improved` | ELF live self-check evidence only; direct competitor consolidation runners remain untested or product-reference only. |
+| Memory summary/top-of-mind | `pass=8`, `not_tested=1`, `not_encoded=1` | `pass=9`, `not_tested=0`, `not_encoded=0` | `improved` | Fixture-backed `elf.memory_summary/v1` source-trace contract evidence only. |
+| Proactive brief readiness | `pass=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `blocked=1` | `improved` | Fixture-backed proactive brief scoring only; private-corpus refresh stays blocked under XY-930 and Pulse parity is not proven. |
+| Scheduled memory task readiness | `pass=0`, `blocked=1` | `pass=4`, `blocked=1` | `improved` | Fixture-backed scheduled task readback only; hosted scheduler, notification, provider-backed private-corpus, and silent-mutation parity are not proven. |
+| Final competitor retest status | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | ELF live: `pass=40`, `wrong_result=0`, `blocked=7`, `not_encoded=19`; qmd live: `pass=17`, `wrong_result=13`, `blocked=7`, `not_encoded=29`; graph/RAG typed non-pass; first-generation OSS `pass=4`, `blocked=2` | `unchanged` | ELF live improvement and qmd full-suite non-pass do not remove qmd debug ergonomics, private/provider, OpenViking, Letta, or graph/RAG blockers. |
+
+## Scenario Retest Matrix
+
+| Scenario | Baseline outcome | Current outcome | Status | Follow-up |
+| --- | --- | --- | --- | --- |
+| qmd debug ergonomics | `loss` | `unchanged` | `pass` for fresh qmd full-suite materialization; debug ergonomics still a measured ELF loss | XY-923 |
+| mem0/OpenMemory preference/history/export | ELF loss on correction history, tie on scoped personalization, UI/export blocked | `unchanged` | `blocked` for UI/export and private/provider inputs | XY-930 plus dedicated UI/export runner work |
+| Letta core/archive | `blocked` | `unchanged` | `blocked` | Proposed Letta core/archive adapter brief |
+| Graphiti/Zep temporal graph validity | `blocked` | `unchanged` | `blocked` | Graph/RAG adapter follow-up with explicit provider setup |
+| OpenViking trajectory/hierarchy | `blocked` | `unchanged` | `blocked` | XY-928 |
+| GraphRAG/LightRAG/RAGFlow/llm-wiki/gbrain/graphify citation/navigation/knowledge surfaces | `not_tested` | `unchanged` | typed non-pass: blocked, incomplete, wrong_result, not_tested, or non_goal | XY-929 |
+| Private/provider production gates | `blocked` | `unchanged` | `blocked` | XY-930 |
+
+## Optimization Queue
+
+| Priority | Issue | Status | Brief |
+| --- | --- | --- | --- |
+| P0 | XY-923 | Existing | Re-run qmd trace/replay diagnostics with comparable immediate top-k/replay, expansion, fusion, rerank, and candidate-drop artifacts; preserve qmd's debug ergonomics edge unless ELF produces comparable artifacts. |
+| P1 | XY-930 | Existing | Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist; otherwise keep typed blockers. |
+| P1 | XY-928 | Existing | Materialize OpenViking staged trajectory, hierarchy selection, and recursive expansion evidence before claiming ELF ties or beats those strengths. |
+| P1 | Letta core/archive adapter | Proposed | Add a contained Letta core/archive export-readback adapter that emits source ids for core blocks and archival memories. Non-goals: ELF product changes and broad Letta claims. |
+| P2 | XY-929 | Existing | Promote Graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs. |
+| P2 | Service-native Dreaming outputs | Proposed | Move fixture-backed memory summary, proactive brief, and scheduled task contracts into service-native readback/materialization with source-ref, freshness, rationale, trace, and no-source-mutation gates. |
+
+## Follow-Up Issue Briefs
+
+These are Decodex-ready follow-up shapes for the remaining measured losses or typed
+blockers. Existing Linear issues should be linked rather than duplicated.
+
+| Issue | State | Brief |
+| --- | --- | --- |
+| XY-923 | Existing | Re-run qmd trace/replay diagnostics with comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop artifacts. Non-goal: do not reinterpret qmd full-suite wrong_result counts as a regression of qmd debug ergonomics. Validation: a scored qmd/ELF debug ergonomics artifact with typed outcomes preserved. |
+| XY-930 | Existing | Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist. Non-goal: do not infer credentials or promote synthetic/provider smoke evidence into private-corpus pass evidence. Validation: a public-safe report that states whether the private/provider caveats are removed or still blocked. |
+| XY-928 | Existing | Materialize OpenViking same-corpus evidence ids and staged trajectory outputs before scoring hierarchy or recursive retrieval. Non-goal: do not claim ELF ties or beats OpenViking from fixture-only blocked rows. Validation: scored context-trajectory reports with typed pass, wrong_result, blocked, or incomplete outcomes. |
+| XY-929 | Existing | Promote graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs. Non-goal: do not convert research gates, tiny smokes, blocked setup, or graphify wrong_result into graph/RAG parity evidence. Validation: representative graph/RAG reports with typed non-pass states preserved. |
+| Letta core/archive adapter | Proposed | Create a Docker-contained Letta export/readback adapter over benchmark-owned data and score only mapped core/archive evidence. Non-goal: no ELF product change or broad Letta claim before comparable evidence exists. Validation: a scored artifact containing Letta core block JSON, archival search/readback JSON, source ids, and typed outcomes. |
+| Service-native Dreaming outputs | Proposed | Move memory summary, proactive brief, and scheduled task outputs into service-native materialization with source refs, freshness, rationale, trace, and no-source-mutation gates. Non-goal: no polished hosted scheduler, Pulse clone, notification product, or private/provider path in this follow-up. Validation: service-native scored reports that fail stale, tombstoned, unsupported, or untraced current claims. |
+
+## Claim Boundaries
+
+Allowed:
+
+- ELF is locally and partially stronger after the Dreaming stages on encoded temporal
+  reconciliation, reviewable consolidation self-checks, fixture-backed memory
+  summary, proactive brief, and scheduled-memory task scoring.
+- The public/local aggregate fixture retest remains 53 pass, 0 wrong_result, and 7
+  typed blocked jobs across 60 jobs.
+- The representative graph/RAG slice remains typed non-pass.
+- Private/provider gates remain blocked under XY-930.
+
+Not allowed:
+
+- Do not claim broad ELF-over-qmd superiority.
+- Do not claim ELF beats managed Dreaming, Pulse, ChatGPT Tasks, mem0/OpenMemory,
+  Letta, OpenViking, Graphiti/Zep, or graph/RAG systems from fixture-only, partial
+  live, blocked, or smoke-only evidence.
+- Do not collapse scenario-level outcomes into a leaderboard.
+- Do not treat qmd full-suite wrong_result counts as a regression of qmd debug
+  ergonomics.
diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md
index e8f581b6..2f7c6428 100644
--- a/docs/evidence/benchmarking/index.md
+++ b/docs/evidence/benchmarking/index.md
@@ -35,3 +35,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`.
 - `2026-06-16-live-temporal-reconciliation-report.md`: Live Temporal Reconciliation Report - June 16, 2026.
 - `2026-06-16-proactive-brief-scoring-report.md`: Proactive Brief Scoring Report - June 16, 2026.
 - `2026-06-16-scheduled-memory-task-scoring-report.md`: Real-World Job Benchmark Report.
+- `2026-06-17-dreaming-competitor-strength-retest-report.md`: Dreaming Competitor-Strength Retest Report - June 17, 2026.

From 7bf44cca65fa6dbbe6197c021f6c30aaeddb0f21 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Fri, 19 Jun 2026 12:26:52 +0800
Subject: [PATCH 2/2] {"schema":"decodex/commit/1","summary":"Add Dreaming
 report OKF frontmatter","authority":"XY-955"}

---
 ...7-dreaming-competitor-strength-retest-report.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
index 62396f6d..1415b002 100644
--- a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
+++ b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
@@ -1,3 +1,17 @@
+---
+type: Evidence
+title: "Dreaming Competitor-Strength Retest Report - June 17, 2026"
+description: "Checked-in benchmark evidence record: Dreaming Competitor-Strength Retest Report - June 17, 2026."
+resource: docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md
+status: active
+authority: current_state
+owner: evidence
+last_verified: 2026-06-19
+tags:
+  - docs
+  - evidence
+  - benchmarking
+---
 # Dreaming Competitor-Strength Retest Report - June 17, 2026
 
 Goal: Close out the XY-955 Dreaming-readiness benchmark program pass with a