hack-ink · yvette-carlisle · Jun 11, 2026 · Jun 11, 2026
diff --git a/README.md b/README.md
@@ -195,6 +195,7 @@ Detailed evidence and interpretation:
 - [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md)
 - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
 - [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md)
+- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [Single-User Production Runbook](docs/guide/single_user_production.md)
@@ -269,6 +270,7 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
 - [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md)
 - [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md)
+- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
 - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)

diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -107,6 +107,36 @@ fn retrieval_debug_profile_json_path() -> Result<PathBuf> {
 		.join("2026-06-11-elf-qmd-retrieval-debug-profile.json"))
 }
 
+fn trace_replay_diagnostics_report_path() -> Result<PathBuf> { // Path of the trace/replay diagnostics JSON report.
+	Ok(workspace_root()? // Any error returned by `workspace_root()` is propagated to the caller.
+		.join("docs")
+		.join("research")
+		.join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json")) // Only builds the path; nothing here checks that the file exists.
+}
+
+fn trace_replay_diagnostics_markdown_path() -> Result<PathBuf> { // Path of the trace/replay diagnostics markdown report.
+	Ok(workspace_root()? // Any error returned by `workspace_root()` is propagated to the caller.
+		.join("docs")
+		.join("guide")
+		.join("benchmarking")
+		.join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")) // Only builds the path; nothing here checks that the file exists.
+}
+
+fn competitor_strength_adoption_report_path() -> Result<PathBuf> { // Path of the competitor-strength adoption markdown report.
+	Ok(workspace_root()? // Any error returned by `workspace_root()` is propagated to the caller.
+		.join("docs")
+		.join("guide")
+		.join("benchmarking")
+		.join("2026-06-11-competitor-strength-adoption-report.md")) // Only builds the path; nothing here checks that the file exists.
+}
+
+fn competitor_strength_adoption_report_json_path() -> Result<PathBuf> { // Path of the competitor-strength adoption JSON report.
+	Ok(workspace_root()? // Any error returned by `workspace_root()` is propagated to the caller.
+		.join("docs")
+		.join("research")
+		.join("2026-06-11-competitor-strength-adoption-report.json")) // Only builds the path; nothing here checks that the file exists.
+}
+
 fn competitor_strength_matrix_path() -> Result<PathBuf> {
 	Ok(workspace_root()?
 		.join("docs")
@@ -1404,6 +1434,151 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> {
 	Ok(())
 }
 
+#[test]
+fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> { // Checks the trace/replay report (JSON + markdown) and the README, benchmarking index, and adoption report that reference it.
+	let report = serde_json::from_str::<Value>(&fs::read_to_string( // Read and parse failures end the test early through `?`.
+		trace_replay_diagnostics_report_path()?,
+	)?)?;
+	let markdown = fs::read_to_string(trace_replay_diagnostics_markdown_path()?)?;
+	let readme = fs::read_to_string(readme_path()?)?;
+	let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?;
+	let adoption_report = fs::read_to_string(competitor_strength_adoption_report_path()?)?;
+	let adoption_json = serde_json::from_str::<Value>(&fs::read_to_string( // Same adoption report, JSON form; checked separately from the markdown below.
+		competitor_strength_adoption_report_json_path()?,
+	)?)?;
+
+	assert_trace_replay_diagnostics_json(&report)?; // Structured checks on the report JSON.
+	assert_trace_replay_diagnostics_markdown(&markdown); // Substring checks on the report markdown.
+
+	assert!(readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026")); // README must mention the report by its link title.
+	assert!(benchmarking_index.contains("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")); // Index must reference the markdown file name.
+	assert!(benchmarking_index.contains("qmd top-10/replay artifact"));
+	assert!(benchmarking_index.contains("ELF trace/admin surfaces"));
+	assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); // The scenario-matrix row must carry the `loss` outcome.
+	assert!(
+		adoption_report
+			.contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") // Needle is a prefix of the not-allowed claim, not the full sentence.
+	);
+
+	assert_trace_replay_adoption_json(&adoption_json)?; // Structured checks on the adoption JSON, including the full claim sentence.
+
+	Ok(())
+}
+
+fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { // Pins schema, authority, outcome vocabulary, summary counts, per-scenario outcomes, and claim boundaries of the report JSON.
+	assert_eq!(
+		report.pointer("/schema").and_then(Value::as_str), // A missing or non-string field yields `None` and fails the comparison.
+		Some("elf.trace_replay_diagnostics_report/v1")
+	);
+	assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923"));
+	assert_eq!(
+		string_array_at(report, "/outcome_terms")?, // Compared with `assert_eq!` against the full list, so content and order must match (assumes the helper returns an ordered collection - confirm).
+		["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned)
+	);
+	assert_eq!(
+		report.pointer("/summary/retrieval_correctness").and_then(Value::as_str),
+		Some("tie")
+	);
+	assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2)); // Two scenarios are asserted as `loss` below (top10, replay).
+	assert_eq!(
+		report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64),
+		Some(4) // Four scenarios are asserted as `not_tested` below (expansion, dense_sparse, fusion, candidate_drop).
+	);
+	assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); // One scenario is asserted as `non_goal` below (rerank).
+
+	let scenarios = array_at(report, "/scenario_outcomes")?;
+	let retrieval = find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?; // Each lookup propagates the helper's error through `?` (presumably raised when no entry matches - confirm).
+	let top10 = find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?;
+	let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?;
+	let trace_surface =
+		find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?;
+	let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?;
+	let dense_sparse =
+		find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?;
+	let fusion = find_by_field(scenarios, "/scenario_id", "fusion_attribution")?;
+	let rerank = find_by_field(scenarios, "/scenario_id", "rerank_attribution")?;
+	let candidate_drop = find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?;
+	let selected =
+		find_by_field(scenarios, "/scenario_id", "selected_but_not_narrated_wrong_results")?;
+	let tombstone =
+		find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?;
+
+	assert_eq!(scenarios.len(), 11); // Matches the eleven scenario ids looked up above.
+	assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie"));
+	assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie"));
+	assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal")); // Rerank must be `non_goal` in both `result_type` and `outcome`.
+	assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), Some("non_goal"));
+	assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
+	assert!(array_contains_str(candidate_drop, "/typed_non_pass_states", "retrieved_but_dropped")?);
+	assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); // Only `result_type` and the typed state are checked for this scenario; its `outcome` is not asserted here.
+	assert!(array_contains_str(selected, "/typed_non_pass_states", "selected_but_not_narrated")?);
+	assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win"));
+	assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result"));
+	assert!(array_contains_str(
+		report, // Looked up on the report root, not on a single scenario entry.
+		"/wrong_result_diagnostics/qmd_missing_evidence",
+		"delete-tombstone"
+	)?);
+	assert!(array_contains_str(
+		report,
+		"/claim_boundaries",
+		"qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay."
+	)?);
+	assert!(array_contains_str(
+		report,
+		"/claim_boundaries",
+		"Do not claim qmd beats ELF as a memory system overall."
+	)?);
+
+	Ok(())
+}
+
+fn assert_trace_replay_diagnostics_markdown(markdown: &str) { // Substring checks on the markdown report text; each `assert!` panics when its needle is absent.
+	assert!(markdown.contains("Retrieval correctness is still tied"));
+	assert!(markdown.contains("| Default top-10 candidate artifact |")); // Table-row prefixes: only the leading cell is pinned for these two rows.
+	assert!(markdown.contains("| Replay command locality |"));
+	assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); // These two rows pin three leading cells, so cell values and spacing must match exactly.
+	assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |"));
+	assert!(markdown.contains("`retrieved_but_dropped` | Defined but `not_tested`"));
+	assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); // Command text that must appear in the report.
+	assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); // Command text that must appear in the report.
+	assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); // No trailing period in the needle, unlike the JSON claim-boundary string.
+	assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run"));
+}
+
+fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { // Checks the adoption JSON's `local_debug_replay_ux` scenario and its not-allowed claim list.
+	let local_debug = find_by_field( // Errors from `array_at` / `find_by_field` are propagated through `?`.
+		array_at(adoption, "/scenario_outcomes")?,
+		"/scenario_id",
+		"local_debug_replay_ux",
+	)?;
+
+	assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss"));
+	assert!(
+		local_debug
+			.pointer("/measured_claim")
+			.and_then(Value::as_str)
+			.is_some_and(|claim| claim.contains("qmd stronger on immediate top-10")) // Fails when the field is missing, not a string, or lacks this phrase.
+	);
+	assert!(array_contains_str(
+		local_debug,
+		"/command_artifacts",
+		"docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" // The scenario must list the markdown report among its artifacts.
+	)?);
+	assert!(array_contains_str(
+		adoption, // Looked up on the document root, not on the scenario entry.
+		"/claim_boundaries/not_allowed",
+		"Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win."
+	)?);
+
+	Ok(())
+}
+
 fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> {
 	let projects = array_at(matrix, "/project_matrix")?;
 	let qmd = find_by_field(projects, "/project", "qmd")?;

diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md
@@ -35,9 +35,11 @@ The remaining caveats are material:
   exists.
 - Credentialed provider production-ops gates are blocked until explicit provider
   setup exists.
-- Several competitor strengths remain `not_tested`: qmd replay/debug UX,
-  mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory,
-  and graph/RAG navigation.
+- Several competitor strengths remain `not_tested`: mem0/OpenMemory history/UI,
+  OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation.
+  The XY-923 follow-up now scores qmd's immediate top-10/replay artifact ergonomics
+  as stronger than ELF's default stress report, while expansion, fusion, rerank-on, and
+  candidate-drop diagnostics remain untested.
 
 ## Evidence Classes
 
@@ -68,6 +70,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. |
 | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. |
 | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. |
+| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. |
 
 ## Scenario Matrix
 
@@ -77,7 +80,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 |
 | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 |
 | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 |
-| Retrieval quality and local debug UX | `not_tested` | `live_baseline_only`, `research_gate`, `not_encoded` | qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces. | XY-923 |
+| Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 |
 | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. | XY-905 |
 | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 |
 | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 |
@@ -120,6 +123,8 @@ results, or lifecycle failures into one aggregate leaderboard.
 ## Claims Not Allowed
 
 - Do not claim ELF broadly beats qmd.
+- Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system
+  or retrieval-quality win.
 - Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or
   graph memory.
 - Do not claim ELF beats OpenViking on staged context trajectory.
@@ -128,4 +133,3 @@ results, or lifecycle failures into one aggregate leaderboard.
 - Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`,
   `research_gate`, `blocked`, `wrong_result`, `lifecycle_fail`, `unsupported`, or
   `not_encoded` states into a generic pass/fail score.
-