From 3ea74fc434a3365a68ffd5326daded49d72c33e3 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Thu, 11 Jun 2026 18:57:47 +0800
Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Publish qmd trace
 replay diagnostics report","authority":"XY-923"}

---
 README.md                                     |   2 +
 .../tests/real_world_job_benchmark.rs         | 175 +++++++++++
 ...-11-competitor-strength-adoption-report.md |  14 +-
 ...elf-qmd-trace-replay-diagnostics-report.md | 140 +++++++++
 docs/guide/benchmarking/index.md              |   4 +
 ...1-competitor-strength-adoption-report.json |  19 +-
 ...f-qmd-trace-replay-diagnostics-report.json | 293 ++++++++++++++++++
 7 files changed, 636 insertions(+), 11 deletions(-)
 create mode 100644 docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md
 create mode 100644 docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json
diff --git a/README.md b/README.md
index bdd884b3..51452873 100644
--- a/README.md
+++ b/README.md
@@ -195,6 +195,7 @@ Detailed evidence and interpretation:
 - [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md)
 - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
 - [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md)
+- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [Single-User Production Runbook](docs/guide/single_user_production.md)
@@ -269,6 +270,7 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
 - [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md)
 - [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md)
+- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index ce163f29..bf0b0bbc 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -107,6 +107,36 @@ fn retrieval_debug_profile_json_path() -> Result<PathBuf> {
 		.join("2026-06-11-elf-qmd-retrieval-debug-profile.json"))
 }
 
+fn trace_replay_diagnostics_report_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("research")
+		.join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json"))
+}
+
+fn trace_replay_diagnostics_markdown_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("guide")
+		.join("benchmarking")
+		.join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md"))
+}
+
+fn competitor_strength_adoption_report_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("guide")
+		.join("benchmarking")
+		.join("2026-06-11-competitor-strength-adoption-report.md"))
+}
+
+fn competitor_strength_adoption_report_json_path() -> Result<PathBuf> {
+	Ok(workspace_root()?
+		.join("docs")
+		.join("research")
+		.join("2026-06-11-competitor-strength-adoption-report.json"))
+}
+
 fn competitor_strength_matrix_path() -> Result<PathBuf> {
 	Ok(workspace_root()?
 		.join("docs")
@@ -1404,6 +1434,151 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> {
 	Ok(())
 }
 
+#[test]
+fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> {
+	let report = serde_json::from_str::<Value>(&fs::read_to_string(
+		trace_replay_diagnostics_report_path()?,
+	)?)?;
+	let markdown = fs::read_to_string(trace_replay_diagnostics_markdown_path()?)?;
+	let readme = fs::read_to_string(readme_path()?)?;
+	let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?;
+	let adoption_report = fs::read_to_string(competitor_strength_adoption_report_path()?)?;
+	let adoption_json = serde_json::from_str::<Value>(&fs::read_to_string(
+		competitor_strength_adoption_report_json_path()?,
+	)?)?;
+
+	assert_trace_replay_diagnostics_json(&report)?;
+	assert_trace_replay_diagnostics_markdown(&markdown);
+
+	assert!(readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026"));
+	assert!(benchmarking_index.contains("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md"));
+	assert!(benchmarking_index.contains("qmd top-10/replay artifact"));
+	assert!(benchmarking_index.contains("ELF trace/admin surfaces"));
+	assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |"));
+	assert!(
+		adoption_report
+			.contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF")
+	);
+
+	assert_trace_replay_adoption_json(&adoption_json)?;
+
+	Ok(())
+}
+
+fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> {
+	assert_eq!(
+		report.pointer("/schema").and_then(Value::as_str),
+		Some("elf.trace_replay_diagnostics_report/v1")
+	);
+	assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923"));
+	assert_eq!(
+		string_array_at(report, "/outcome_terms")?,
+		["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned)
+	);
+	assert_eq!(
+		report.pointer("/summary/retrieval_correctness").and_then(Value::as_str),
+		Some("tie")
+	);
+	assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2));
+	assert_eq!(
+		report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64),
+		Some(4)
+	);
+	assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1));
+
+	let scenarios = array_at(report, "/scenario_outcomes")?;
+	let retrieval = find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?;
+	let top10 = find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?;
+	let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?;
+	let trace_surface =
+		find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?;
+	let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?;
+	let dense_sparse =
+		find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?;
+	let fusion = find_by_field(scenarios, "/scenario_id", "fusion_attribution")?;
+	let rerank = find_by_field(scenarios, "/scenario_id", "rerank_attribution")?;
+	let candidate_drop = find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?;
+	let selected =
+		find_by_field(scenarios, "/scenario_id", "selected_but_not_narrated_wrong_results")?;
+	let tombstone =
+		find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?;
+
+	assert_eq!(scenarios.len(), 11);
+	assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie"));
+	assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie"));
+	assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal"));
+	assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), Some("non_goal"));
+	assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert!(array_contains_str(candidate_drop, "/typed_non_pass_states", "retrieved_but_dropped")?);
+	assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result"));
+	assert!(array_contains_str(selected, "/typed_non_pass_states", "selected_but_not_narrated")?);
+	assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win"));
+	assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result"));
+	assert!(array_contains_str(
+		report,
+		"/wrong_result_diagnostics/qmd_missing_evidence",
+		"delete-tombstone"
+	)?);
+	assert!(array_contains_str(
+		report,
+		"/claim_boundaries",
+		"qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay."
+	)?);
+	assert!(array_contains_str(
+		report,
+		"/claim_boundaries",
+		"Do not claim qmd beats ELF as a memory system overall."
+	)?);
+
+	Ok(())
+}
+
+fn assert_trace_replay_diagnostics_markdown(markdown: &str) {
+	assert!(markdown.contains("Retrieval correctness is still tied"));
+	assert!(markdown.contains("| Default top-10 candidate artifact |"));
+	assert!(markdown.contains("| Replay command locality |"));
+	assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |"));
+	assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |"));
+	assert!(markdown.contains("`retrieved_but_dropped` | Defined but `not_tested`"));
+	assert!(markdown.contains("npx tsx src/cli/qmd.ts query"));
+	assert!(markdown.contains("cargo run -p elf-eval -- --config-a"));
+	assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall"));
+	assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run"));
+}
+
+fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> {
+	let local_debug = find_by_field(
+		array_at(adoption, "/scenario_outcomes")?,
+		"/scenario_id",
+		"local_debug_replay_ux",
+	)?;
+
+	assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert!(
+		local_debug
+			.pointer("/measured_claim")
+			.and_then(Value::as_str)
+			.is_some_and(|claim| claim.contains("qmd stronger on immediate top-10"))
+	);
+	assert!(array_contains_str(
+		local_debug,
+		"/command_artifacts",
+		"docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md"
+	)?);
+	assert!(array_contains_str(
+		adoption,
+		"/claim_boundaries/not_allowed",
+		"Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win."
+	)?);
+
+	Ok(())
+}
+
 fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> {
 	let projects = array_at(matrix, "/project_matrix")?;
 	let qmd = find_by_field(projects, "/project", "qmd")?;
diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md
index e46ba1f7..1bf607f7 100644
--- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md
+++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md
@@ -35,9 +35,11 @@ The remaining caveats are material:
   exists.
 - Credentialed provider production-ops gates are blocked until explicit provider
   setup exists.
-- Several competitor strengths remain `not_tested`: qmd replay/debug UX,
-  mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory,
-  and graph/RAG navigation.
+- Several competitor strengths remain `not_tested`: mem0/OpenMemory history/UI,
+  OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation.
+  The XY-923 follow-up now scores qmd's immediate top-10/replay artifact ergonomics
+  as stronger than ELF's default stress report, while expansion, fusion, rerank, and
+  candidate-drop diagnosis remain untested.
 
 ## Evidence Classes
 
@@ -68,6 +70,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. |
 | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. |
 | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. |
+| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. |
 
 ## Scenario Matrix
 
@@ -77,7 +80,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 |
 | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 |
 | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 |
-| Retrieval quality and local debug UX | `not_tested` | `live_baseline_only`, `research_gate`, `not_encoded` | qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces. | XY-923 |
+| Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 |
 | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. | XY-905 |
 | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 |
 | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 |
@@ -120,6 +123,8 @@ results, or lifecycle failures into one aggregate leaderboard.
 ## Claims Not Allowed
 
 - Do not claim ELF broadly beats qmd.
+- Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system
+  or retrieval-quality win.
 - Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or
   graph memory.
 - Do not claim ELF beats OpenViking on staged context trajectory.
@@ -128,4 +133,3 @@ results, or lifecycle failures into one aggregate leaderboard.
 - Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`,
   `research_gate`, `blocked`, `wrong_result`, `lifecycle_fail`, `unsupported`, or
   `not_encoded` states into a generic pass/fail score.
-
diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md
new file mode 100644
index 00000000..e3a7a7c7
--- /dev/null
+++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md
@@ -0,0 +1,140 @@
+# ELF/qmd Trace Replay Diagnostics Report - June 11, 2026
+
+Goal: Compare ELF and qmd on trace-level replay and wrong-result diagnostics while
+keeping retrieval correctness as a separate guardrail.
+Read this when: You need the XY-923 report lane for qmd top-10 replay artifacts,
+ELF trace/admin bundle surfaces, and typed wrong-result diagnosis classes.
+Inputs: The June 11 ELF/qmd retrieval-debug profile, qmd/OpenViking strength profile,
+memory-evolution diagnostic, competitor-strength adoption report, live baseline
+runner, ELF trace replay code, and the ELF service trace/admin contract.
+Outputs: Scenario-level `win`, `tie`, `loss`, `not_tested`, `blocked`, or
+`non_goal` outcomes plus concrete replay commands and artifact paths.
+
+Machine-readable companion:
+`docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json`.
+
+## Executive Judgment
+
+Retrieval correctness is still tied: ELF and qmd both pass the encoded live retrieval
+suite and both pass the 480-document generated-public stress baseline.
+
+Trace-level debugging is not tied. In the current checked-in artifacts, qmd is ahead
+on immediate local replay ergonomics because the baseline keeps top-10 JSON rows with
+files, scores, line numbers, snippets, and distractor visibility, and the replay path
+is a short CLI sequence. ELF has a deeper service trace model and admin bundle
+surfaces, but the stress report still does not hydrate the equivalent candidate list
+by default.
+
+The resulting narrow position:
+
+- Retrieval correctness: `tie`.
+- Default per-query candidate artifact: ELF `loss` against qmd.
+- Replay command locality: ELF `loss` against qmd.
+- ELF trace/admin replay surface: `tie` as an available but different replay surface,
+  not a default-artifact win.
+- Expansion, dense/sparse contribution, fusion, and candidate-drop diagnostics:
+  `not_tested` until comparable stage artifacts are emitted.
+- Rerank stage scoring: `non_goal`; the current qmd stress path uses `--no-rerank`.
+- Wrong-result selected-but-not-narrated diagnosis: `tie` on typed non-pass
+  classification, not on answer quality.
+- Evidence-absent and tombstone diagnosis: ELF `win`; qmd misses three evidence links.
+
+This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap.
+
+## Replay Artifact Manifest
+
+| System | Replay surface | Command | Artifact |
+| --- | --- | --- | --- |
+| ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` |
+| ELF | Admin trace bundle hydration | `curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant>' -H 'X-ELF-Project-Id: <project>' -H 'X-ELF-Agent-Id: <agent>'` | `elf.trace_bundle/v1` response from the admin service |
+| ELF | Trace ranking replay | `cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>` | JSON trace compare output over `search_trace_candidates` |
+| qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` |
+| qmd | Per-query CLI replay | `npx tsx src/cli/qmd.ts query 'lex: <query>\nvec: <query>' -c elfbench --json --no-rerank --min-score 0 -n 10` | JSON top-10 rows with `file`, line/snippet/score fields when qmd returns them |
+| qmd | Lifecycle replay | `npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank` | `tmp/live-baseline/qmd-query.json` checks for update, delete, and cold-start recovery |
+
+## Scenario Outcomes
+
+| Scenario | Evidence | Result type | ELF outcome | Diagnostic judgment |
+| --- | --- | --- | --- | --- |
+| Retrieval correctness guardrail | `live_real_world`, `live_baseline_only` | `pass` | `tie` | Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics. |
+| Default top-10 candidate artifact | `live_baseline_only` | `pass` | `loss` | qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report. |
+| Replay command locality | `live_baseline_only` | `pass` | `loss` | qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids. |
+| Trace/admin replay surface availability | `implementation_reference` | `not_encoded` | `tie` | ELF has admin trace bundles and `elf-eval` trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality. |
+| Query expansion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows expansion variants or dynamic expansion decisions for both systems. |
+| Dense/sparse channel attribution | `research_gate` | `not_encoded` | `not_tested` | ELF uses dense plus BM25 and qmd uses structured `lex:` plus `vec:`, but the scored artifacts do not expose comparable per-channel contribution. |
+| Fusion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows fusion inputs, RRF/weighted-fusion contributions, or fusion-stage candidate drops. |
+| Rerank attribution | `live_baseline_only` | `non_goal` | `non_goal` | The current qmd stress and materializer paths use `--no-rerank`; no rerank-on comparison is claimed. |
+| Candidate-drop diagnostics | `research_gate` | `not_encoded` | `not_tested` | `retrieved_but_dropped` is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles. |
+| Selected-but-not-narrated wrong results | `live_real_world` | `wrong_result` | `tie` | Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing. |
+| Evidence-absent and tombstone diagnosis | `live_real_world` | `wrong_result` | `win` | ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone. |
+
+Summary: `1` ELF win, `3` ties, `2` ELF losses, `4` not-tested scenarios, `0`
+blocked scenarios, and `1` non-goal scenario. The losses are local-debug artifact
+losses only. They do not change the retrieval-correctness tie.
+
+## Stage Scoring Notes
+
+| Stage | Current ELF score | Reason |
+| --- | --- | --- |
+| Expansion | `not_tested` | The current artifacts do not expose comparable expansion variants or dynamic expansion decisions. |
+| Dense retrieval | `not_tested` | The systems have dense/vector surfaces, but no comparable scored dense-only contribution artifact. |
+| Sparse retrieval | `not_tested` | qmd `lex:` and ELF BM25 are present in command or service design, but contribution and drops are not scored. |
+| Fusion | `not_tested` | Fusion candidates and final fusion deltas are not materialized comparably. |
+| Rerank | `non_goal` | qmd uses `--no-rerank` in the current path; rerank superiority is out of scope for this run. |
+| Candidate drops | `not_tested` | No current report can prove retrieved-but-dropped evidence for qmd, and ELF candidate bundles are not hydrated into the stress artifact. |
+| Selected-but-not-narrated | `tie` | Both systems have typed memory-evolution wrong-result rows where evidence is selected or available but not narrated as lifecycle history. |
+| Replay commands | `loss` | qmd's local CLI replay is shorter and directly tied to top-10 JSON output. |
+
+## Typed Non-Pass States
+
+The report preserves the wrong-result classes from the June 11 diagnostics:
+
+| Class | Current coverage |
+| --- | --- |
+| `evidence_absent` | Observed for qmd on verdict caveat, preference rationale, and delete tombstone misses. |
+| `retrieved_but_dropped` | Defined but `not_tested`; current artifacts do not expose enough candidate-stage data. |
+| `selected_but_not_narrated` | Observed for both ELF and qmd on supersession and temporal-validity jobs. |
+| `contradicted_by_lifecycle_evidence` | Observed when current, historical, supersession, or tombstone evidence makes the answer incomplete. |
+
+These states are typed evidence, not leaderboard shortcuts. A `wrong_result` with
+good evidence recall is still a wrong result.
+
+## Claim Boundaries
+
+Allowed:
+
+- ELF and qmd remain tied on encoded retrieval correctness.
+- qmd currently wins the default local-debug artifact surface: top-10 rows plus short
+  CLI replay.
+- ELF has useful service trace/admin replay surfaces, but they are not yet hydrated
+  into the default stress report as qmd-like candidate artifacts.
+- ELF narrowly wins the memory-evolution evidence-retention slice because qmd misses
+  the delete tombstone and two other required evidence links.
+- Expansion, dense/sparse contribution, fusion, rerank-on quality, and
+  retrieved-but-dropped candidate diagnosis remain unproven.
+
+Not allowed:
+
+- Do not claim qmd beats ELF as a memory system overall.
+- Do not claim ELF beats qmd retrieval overall.
+- Do not turn qmd top-10 ergonomics into a retrieval-quality win.
+- Do not treat ELF trace/admin endpoint availability as proof that the default
+  benchmark report has qmd-level candidate visibility.
+- Do not score rerank superiority from a qmd `--no-rerank` run.
+- Do not collapse `not_tested`, `non_goal`, or `wrong_result` into pass evidence.
+
+## Follow-Up Gate
+
+The next measurement should emit one candidate-replay artifact per suspicious query
+with:
+
+1. Expansion variants and whether the original query was included.
+2. Dense-only and sparse-only candidate sets.
+3. Fusion rank and score contribution.
+4. Rerank score, or an explicit rerank-disabled marker.
+5. Final selected items.
+6. Dropped or demoted expected evidence.
+7. A one-command replay line for both ELF and qmd.
+
+Until that exists, the current evidence supports a qmd local-debug artifact win, not a
+broad product or retrieval win.
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md
index b462818e..efab4bb0 100644
--- a/docs/guide/benchmarking/index.md
+++ b/docs/guide/benchmarking/index.md
@@ -74,6 +74,10 @@ cleanup, use `docs/guide/single_user_production.md`.
   report that separates qmd retrieval quality from debug/replay ergonomics, records
   qmd wrong-result diagnosis classes, and preserves OpenViking context-trajectory
   surfaces as `not_tested` until staged/hierarchical evidence is encoded.
+- `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: XY-923 trace-level
+  replay and wrong-result diagnostics report that scores qmd top-10/replay artifact
+  ergonomics against ELF trace/admin surfaces while keeping retrieval correctness,
+  rerank, fusion, candidate-drop, and typed non-pass boundaries separate.
 - `2026-06-11-first-generation-oss-adapter-promotion-report.md`: XY-898
   first-generation OSS adapter promotion report that updates agentmemory,
   mem0/OpenMemory, memsearch, and claude-mem with fresh scenario-level baseline
diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json
index e9fbb3e6..9226f5ca 100644
--- a/docs/research/2026-06-11-competitor-strength-adoption-report.json
+++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json
@@ -12,7 +12,7 @@
       "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.",
       "Private-corpus production quality is blocked until an operator-owned manifest exists.",
       "Credentialed provider production-ops gates are blocked until explicit provider setup exists.",
-      "Several competitor strengths remain not_tested: qmd replay/debug UX, mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation."
+      "Several competitor strengths remain not_tested: mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested."
     ]
   },
   "evidence_class_terms": [
@@ -65,6 +65,11 @@
       "command": "cargo make baseline-production-synthetic, cargo make baseline-backfill-docker, backup/restore plus Qdrant rebuild proof",
       "artifact": "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md",
       "claim": "ELF has provider synthetic, stress, backfill, restore, and rebuild evidence, while private-corpus proof remains blocked by missing operator-owned manifest."
+    },
+    {
+      "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker plus ELF trace-bundle and qmd CLI replay commands",
+      "artifact": "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md",
+      "claim": "Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact."
     }
   ],
   "scenario_outcomes": [
@@ -122,15 +127,16 @@
     {
       "scenario_id": "local_debug_replay_ux",
       "title": "Retrieval quality and local debug UX",
-      "outcome": "not_tested",
-      "evidence_classes": ["live_baseline_only", "research_gate", "not_encoded"],
-      "measured_claim": "qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces.",
+      "outcome": "loss",
+      "evidence_classes": ["live_baseline_only", "research_gate", "wrong_result", "not_encoded"],
+      "measured_claim": "The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested.",
       "command_artifacts": [
         "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md",
-        "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md"
+        "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md",
+        "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md"
       ],
       "follow_up_issues": ["XY-923"],
-      "caveat": "No ELF loss is claimed until comparable replay and candidate-diagnosis evidence is scored."
+      "caveat": "The loss is a local-debug artifact loss only; retrieval correctness remains tied and no broad qmd-over-ELF memory-system claim is allowed."
     },
     {
       "scenario_id": "memory_evolution_temporal_history",
@@ -344,6 +350,7 @@
     ],
     "not_allowed": [
       "Do not claim ELF broadly beats qmd.",
+      "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.",
       "Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or graph memory.",
       "Do not claim ELF beats OpenViking on staged context trajectory.",
       "Do not claim ELF beats Letta on core-vs-archival memory.",
diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json
new file mode 100644
index 00000000..ebc095d2
--- /dev/null
+++ b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json
@@ -0,0 +1,293 @@
+{
+  "schema": "elf.trace_replay_diagnostics_report/v1",
+  "run_id": "2026-06-11-elf-qmd-trace-replay-diagnostics",
+  "authority": "XY-923",
+  "created_at": "2026-06-11",
+  "scope": "ELF versus qmd trace-level replay and wrong-result diagnostics, with retrieval correctness kept as a separate guardrail.",
+  "inputs": [
+    "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md",
+    "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json",
+    "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md",
+    "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json",
+    "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md",
+    "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json",
+    "scripts/live-baseline-benchmark.sh",
+    "apps/elf-eval/src/app.rs",
+    "docs/spec/system_elf_memory_service_v2.md"
+  ],
+  "outcome_terms": [
+    "win",
+    "tie",
+    "loss",
+    "not_tested",
+    "blocked",
+    "non_goal"
+  ],
+  "result_type_terms": [
+    "pass",
+    "wrong_result",
+    "blocked",
+    "not_encoded",
+    "non_goal"
+  ],
+  "summary": {
+    "retrieval_correctness": "tie",
+    "debug_ergonomics": "qmd wins the current default top-10 candidate artifact and short replay-command surfaces.",
+    "elf_trace_position": "ELF has service trace, admin bundle, and trace replay surfaces, but they are not hydrated into the default stress report as qmd-like candidate artifacts.",
+    "outcome_counts": {
+      "win": 1,
+      "tie": 3,
+      "loss": 2,
+      "not_tested": 4,
+      "blocked": 0,
+      "non_goal": 1
+    }
+  },
+  "commands": [
+    {
+      "system": "ELF",
+      "purpose": "stress retrieval guardrail with trace ids",
+      "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker",
+      "status": "pass",
+      "artifact": "tmp/live-baseline/live-baseline-report.json"
+    },
+    {
+      "system": "ELF",
+      "purpose": "admin trace bundle hydration",
+      "command": "curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant>' -H 'X-ELF-Project-Id: <project>' -H 'X-ELF-Agent-Id: <agent>'",
+      "status": "available_not_hydrated_in_default_stress_report",
+      "artifact": "elf.trace_bundle/v1 admin response"
+    },
+    {
+      "system": "ELF",
+      "purpose": "trace ranking replay from persisted candidates",
+      "command": "cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>",
+      "status": "available_not_run_for_the_checked_in_stress_report",
+      "artifact": "elf-eval trace compare JSON"
+    },
+    {
+      "system": "qmd",
+      "purpose": "stress retrieval guardrail plus top-10 rows",
+      "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker",
+      "status": "pass",
+      "artifact": "tmp/live-baseline/qmd-query.json"
+    },
+    {
+      "system": "qmd",
+      "purpose": "per-query replay",
+      "command": "npx tsx src/cli/qmd.ts query 'lex: <query>\\nvec: <query>' -c elfbench --json --no-rerank --min-score 0 -n 10",
+      "status": "pass_in_baseline_driver",
+      "artifact": "tmp/live-baseline/qmd-query.json"
+    },
+    {
+      "system": "qmd",
+      "purpose": "lifecycle replay",
+      "command": "npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank",
+      "status": "pass_for_update_delete_cold_start_checks",
+      "artifact": "tmp/live-baseline/qmd-query.json"
+    }
+  ],
+  "scenario_outcomes": [
+    {
+      "scenario_id": "retrieval_correctness_guardrail",
+      "surface": "retrieval correctness",
+      "evidence_class": "live_real_world_and_live_baseline_only",
+      "result_type": "pass",
+      "elf_status": "pass",
+      "qmd_status": "pass",
+      "outcome": "tie",
+      "diagnostic_judgment": "Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics.",
+      "artifacts": [
+        "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json",
+        "tmp/live-baseline/live-baseline-report.json"
+      ]
+    },
+    {
+      "scenario_id": "default_top10_candidate_artifact",
+      "surface": "default top-10 candidate artifact",
+      "evidence_class": "live_baseline_only",
+      "result_type": "pass",
+      "elf_status": "not_encoded",
+      "qmd_status": "pass",
+      "outcome": "loss",
+      "diagnostic_judgment": "qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report.",
+      "artifacts": [
+        "tmp/live-baseline/qmd-query.json",
+        "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json"
+      ]
+    },
+    {
+      "scenario_id": "replay_command_locality",
+      "surface": "replay command locality",
+      "evidence_class": "live_baseline_only",
+      "result_type": "pass",
+      "elf_status": "not_encoded",
+      "qmd_status": "pass",
+      "outcome": "loss",
+      "diagnostic_judgment": "qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids.",
+      "artifacts": [
+        "scripts/live-baseline-benchmark.sh",
+        "apps/elf-eval/src/app.rs",
+        "docs/spec/system_elf_memory_service_v2.md"
+      ]
+    },
+    {
+      "scenario_id": "trace_admin_replay_surface_availability",
+      "surface": "trace/admin replay surface availability",
+      "evidence_class": "implementation_reference",
+      "result_type": "not_encoded",
+      "elf_status": "pass",
+      "qmd_status": "pass",
+      "outcome": "tie",
+      "diagnostic_judgment": "ELF has admin trace bundles and elf-eval trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality.",
+      "artifacts": [
+        "docs/spec/system_elf_memory_service_v2.md",
+        "apps/elf-eval/src/app.rs",
+        "scripts/live-baseline-benchmark.sh"
+      ]
+    },
+    {
+      "scenario_id": "query_expansion_attribution",
+      "surface": "query expansion attribution",
+      "evidence_class": "research_gate",
+      "result_type": "not_encoded",
+      "elf_status": "not_encoded",
+      "qmd_status": "not_encoded",
+      "outcome": "not_tested",
+      "diagnostic_judgment": "No comparable artifact shows expansion variants or dynamic expansion decisions for both systems.",
+      "artifacts": [
+        "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md"
+      ]
+    },
+    {
+      "scenario_id": "dense_sparse_channel_attribution",
+      "surface": "dense/sparse channel attribution",
+      "evidence_class": "research_gate",
+      "result_type": "not_encoded",
+      "elf_status": "not_encoded",
+      "qmd_status": "not_encoded",
+      "outcome": "not_tested",
+      "diagnostic_judgment": "ELF uses dense plus BM25 and qmd uses structured lex plus vec, but the scored artifacts do not expose comparable per-channel contribution.",
+      "artifacts": [
+        "docs/spec/system_elf_memory_service_v2.md",
+        "scripts/live-baseline-benchmark.sh"
+      ]
+    },
+    {
+      "scenario_id": "fusion_attribution",
+      "surface": "fusion attribution",
+      "evidence_class": "research_gate",
+      "result_type": "not_encoded",
+      "elf_status": "not_encoded",
+      "qmd_status": "not_encoded",
+      "outcome": "not_tested",
+      "diagnostic_judgment": "No comparable artifact shows fusion inputs, RRF or weighted-fusion contribution, or fusion-stage candidate drops.",
+      "artifacts": [
+        "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md"
+      ]
+    },
+    {
+      "scenario_id": "rerank_attribution",
+      "surface": "rerank attribution",
+      "evidence_class": "live_baseline_only",
+      "result_type": "non_goal",
+      "elf_status": "not_encoded",
+      "qmd_status": "not_encoded",
+      "outcome": "non_goal",
+      "diagnostic_judgment": "The current qmd stress and materializer paths use --no-rerank; no rerank-on comparison is claimed.",
+      "artifacts": [
+        "scripts/live-baseline-benchmark.sh",
+        "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md"
+      ]
+    },
+    {
+      "scenario_id": "candidate_drop_diagnostics",
+      "surface": "candidate-drop diagnostics",
+      "evidence_class": "research_gate",
+      "result_type": "not_encoded",
+      "elf_status": "not_encoded",
+      "qmd_status": "not_encoded",
+      "outcome": "not_tested",
+      "diagnostic_judgment": "retrieved_but_dropped is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles.",
+      "typed_non_pass_states": [
+        "retrieved_but_dropped"
+      ],
+      "artifacts": [
+        "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json",
+        "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json"
+      ]
+    },
+    {
+      "scenario_id": "selected_but_not_narrated_wrong_results",
+      "surface": "selected-but-not-narrated wrong-result diagnosis",
+      "evidence_class": "live_real_world",
+      "result_type": "wrong_result",
+      "elf_status": "wrong_result",
+      "qmd_status": "wrong_result",
+      "outcome": "tie",
+      "diagnostic_judgment": "Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing.",
+      "typed_non_pass_states": [
+        "selected_but_not_narrated",
+        "contradicted_by_lifecycle_evidence"
+      ],
+      "artifacts": [
+        "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json"
+      ]
+    },
+    {
+      "scenario_id": "evidence_absent_tombstone_diagnostics",
+      "surface": "evidence-absent and tombstone diagnosis",
+      "evidence_class": "live_real_world",
+      "result_type": "wrong_result",
+      "elf_status": "pass",
+      "qmd_status": "wrong_result",
+      "outcome": "win",
+      "diagnostic_judgment": "ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone.",
+      "typed_non_pass_states": [
+        "evidence_absent",
+        "contradicted_by_lifecycle_evidence"
+      ],
+      "artifacts": [
+        "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json"
+      ]
+    }
+  ],
+  "wrong_result_diagnostics": {
+    "typed_non_pass_states": [
+      {
+        "class": "evidence_absent",
+        "coverage": "observed_for_qmd",
+        "meaning": "Required evidence is absent from produced evidence ids."
+      },
+      {
+        "class": "retrieved_but_dropped",
+        "coverage": "not_tested",
+        "meaning": "Required evidence appears in an intermediate candidate set but is absent from the final selected or narrated answer."
+      },
+      {
+        "class": "selected_but_not_narrated",
+        "coverage": "observed_for_elf_and_qmd",
+        "meaning": "Evidence is selected or available, but the answer does not narrate the required lifecycle relationship."
+      },
+      {
+        "class": "contradicted_by_lifecycle_evidence",
+        "coverage": "observed_for_elf_and_qmd",
+        "meaning": "The answer is contradicted or made incomplete by current, historical, supersession, or tombstone evidence."
+      }
+    ],
+    "qmd_missing_evidence": [
+      "verdict-bounded-private-caveat",
+      "pref-current-concise-rationale",
+      "delete-tombstone"
+    ]
+  },
+  "claim_boundaries": [
+    "ELF and qmd remain tied on encoded retrieval correctness.",
+    "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay.",
+    "ELF trace/admin endpoint availability is not proof that the default benchmark report has qmd-level candidate visibility.",
+    "Rerank superiority is not scored from a qmd --no-rerank run.",
+    "Expansion, dense/sparse contribution, fusion, and retrieved-but-dropped candidate diagnostics remain not_tested.",
+    "Do not claim qmd beats ELF as a memory system overall.",
+    "Do not collapse not_tested, non_goal, or wrong_result into pass evidence."
+  ]
+}