From 15134a4c18856b8923524bd7cf13e7ace0966f4c Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Tue, 9 Jun 2026 12:54:07 +0800
Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add private
 production corpus benchmark","authority":"XY-818"}

---
 Makefile.toml                                 |  18 ++
 README.md                                     |   2 +
 .../synthetic_coding_agent_manifest.json      | 105 +++++++
 apps/elf-eval/src/bin/live_baseline_elf.rs    | 164 +++++++---
 .../2026-06-09-production-corpus-report.md    |  55 ++++
 docs/guide/benchmarking/index.md              |   5 +-
 .../benchmarking/live_baseline_benchmark.md   |  53 +++-
 docs/spec/index.md                            |   2 +
 docs/spec/production_corpus_manifest_v1.md    | 102 +++++++
 scripts/live-baseline-benchmark.sh            | 285 +++++++++++++++++-
 scripts/live-baseline-report-to-md.sh         |  37 ++-
 11 files changed, 781 insertions(+), 47 deletions(-)
 create mode 100644 apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json
 create mode 100644 docs/guide/benchmarking/2026-06-09-production-corpus-report.md
 create mode 100644 docs/spec/production_corpus_manifest_v1.md

diff --git a/Makefile.toml b/Makefile.toml
index 3cf5f17c..ab3c4762 100644
--- a/Makefile.toml
+++ b/Makefile.toml
@@ -299,6 +299,8 @@ args = [
 # | baseline-live-docker       | command |     |
 # | baseline-live-report       | command |     |
 # | baseline-live-docker-clean | command |     |
+# | baseline-production-synthetic | command |     |
+# | baseline-production-private | command |     |
 
 [tasks.baseline-live-docker]
 workspace = false
@@ -327,6 +329,22 @@ args = [
 	"--remove-orphans",
 ]
 
+[tasks.baseline-production-synthetic]
+workspace = false
+command = "bash"
+args = [
+	"-lc",
+	"set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-synthetic; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner",
+]
+
+[tasks.baseline-production-private]
+workspace = false
+command = "bash"
+args = [
+	"-lc",
+	"set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner",
+]
+
 
 # Meta
 # | task   | type      | cwd |
diff --git a/README.md b/README.md
index 11b5fe2d..9b183598 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,7 @@ embeddings.
 Detailed evidence and interpretation:
 
 - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md)
+- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 
 Quick comparison snapshot (objective/high-level).
@@ -177,6 +178,7 @@ Project signature strengths (what each does especially well):
 Detailed comparison, mechanism-level analysis, and source map:
 
 - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md)
+- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md)
 - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
 - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md)
 - [Detailed External Comparison](docs/guide/research/comparison_external_projects.md)
diff --git a/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json
new file mode 100644
index 00000000..d627b627
--- /dev/null
+++ b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json
@@ -0,0 +1,105 @@
+{
+  "schema": "elf.production_corpus_manifest/v1",
+  "manifest_id": "synthetic-coding-agent-prod-corpus-2026-06-09",
+  "description": "Synthetic, sanitized production-style coding-agent memory corpus for ELF adoption benchmarking.",
+  "evidence": [
+    {
+      "evidence_id": "issue-xy812-resume",
+      "category": "issue",
+      "title": "XY-812 Resume Lane",
+      "text": "XY-812 resume lane uses branch y/elf-xy-812. The next command is `cargo make trace-gate`; the stale blocker cleared after PR #108 merged."
+    },
+    {
+      "evidence_id": "pr-110-review",
+      "category": "pr",
+      "title": "PR 110 Review Status",
+      "text": "PR #110 is review-ready for the ELF viewer lane. It passed `cargo make checks` and waits for the non-draft review handoff."
+    },
+    {
+      "evidence_id": "worktree-xy791-repair",
+      "category": "worktree",
+      "title": "XY-791 Strict Config Repair",
+      "text": "Worktree XY-791 recovered strict-config repair after rebase. The exact gate was `cargo make fmt && cargo make lint-fix && cargo make checks`."
+    },
+    {
+      "evidence_id": "runbook-live-baseline",
+      "category": "runbook",
+      "title": "Private Production Corpus Runbook",
+      "text": "Private production fixtures use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` with `cargo make baseline-production-private` and stay out of git."
+    },
+    {
+      "evidence_id": "decision-qdrant-derived",
+      "category": "decision",
+      "title": "Qdrant Derived Index Decision",
+      "text": "Decision: Qdrant remains a rebuildable derived index. Postgres stores source-of-truth vectors, notes, chunks, and audit rows."
+    },
+    {
+      "evidence_id": "blocker-stale-qwen-key",
+      "category": "blocker",
+      "title": "Stale Provider Key Blocker",
+      "text": "Stale blocker: missing Qwen key applied only to provider stress runs. The synthetic production corpus uses local deterministic embeddings."
+    },
+    {
+      "evidence_id": "recovery-xy640-ledger",
+      "category": "recovery_note",
+      "title": "XY-640 Ledger Replay Recovery",
+      "text": "Recovery note: XY-640 ledger replay resumes from checkpoint `ledger-replay-42` and verifies the retained lane with `cargo make test`."
+    },
+    {
+      "evidence_id": "decision-xy818-supersedes",
+      "category": "decision",
+      "title": "Superseded Command Decision",
+      "text": "Update case: old command `cargo make lint` was superseded by `cargo make lint-fix` for Decodex ELF lanes."
+    }
+  ],
+  "queries": [
+    {
+      "query_id": "q-resume-lane",
+      "task": "resume_lane",
+      "query": "How do I resume XY-812 and what command is next?",
+      "expected_evidence_ids": ["issue-xy812-resume"],
+      "allowed_alternate_evidence_ids": [],
+      "expected_terms": ["XY-812", "cargo make trace-gate"]
+    },
+    {
+      "query_id": "q-recover-exact-command",
+      "task": "recover_exact_command",
+      "query": "Recover the exact repair gate command for XY-791 strict config.",
+      "expected_evidence_ids": ["worktree-xy791-repair"],
+      "allowed_alternate_evidence_ids": ["runbook-live-baseline"],
+      "expected_terms": ["XY-791", "cargo make fmt && cargo make lint-fix && cargo make checks"]
+    },
+    {
+      "query_id": "q-explain-stale-blocker",
+      "task": "explain_stale_blocker",
+      "query": "Why is the missing Qwen key blocker stale for the synthetic production corpus?",
+      "expected_evidence_ids": ["blocker-stale-qwen-key"],
+      "allowed_alternate_evidence_ids": [],
+      "expected_terms": ["missing Qwen key", "local deterministic embeddings"]
+    },
+    {
+      "query_id": "q-find-prior-decision",
+      "task": "find_prior_decision",
+      "query": "What prior decision explains why Qdrant can be rebuilt?",
+      "expected_evidence_ids": ["decision-qdrant-derived"],
+      "allowed_alternate_evidence_ids": [],
+      "expected_terms": ["Qdrant", "rebuildable derived index"]
+    },
+    {
+      "query_id": "q-compare-project-status",
+      "task": "compare_project_status",
+      "query": "Compare PR #110 and XY-640 status.",
+      "expected_evidence_ids": ["pr-110-review"],
+      "allowed_alternate_evidence_ids": ["recovery-xy640-ledger"],
+      "expected_terms": ["PR #110", "review-ready"]
+    },
+    {
+      "query_id": "q-detect-contradiction-update",
+      "task": "detect_contradiction_update",
+      "query": "Which command superseded cargo make lint for Decodex ELF lanes?",
+      "expected_evidence_ids": ["decision-xy818-supersedes"],
+      "allowed_alternate_evidence_ids": [],
+      "expected_terms": ["cargo make lint-fix", "superseded"]
+    }
+  ]
+}
diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs
index 75c9b83e..b0857036 100644
--- a/apps/elf-eval/src/bin/live_baseline_elf.rs
+++ b/apps/elf-eval/src/bin/live_baseline_elf.rs
@@ -61,9 +61,35 @@ struct QueryManifest {
 #[derive(Clone, Debug, Deserialize, Serialize)]
 struct QueryCase {
 	id: String,
+	task: Option<String>,
 	query: String,
 	expected_doc: String,
 	expected_terms: Vec<String>,
+	#[serde(default)]
+	allowed_alternate_docs: Vec<String>,
+	#[serde(default)]
+	expected_evidence_ids: Vec<String>,
+	#[serde(default)]
+	allowed_alternate_evidence_ids: Vec<String>,
+}
+impl QueryCase {
+	fn generated(
+		id: String,
+		query: String,
+		expected_doc: String,
+		expected_terms: Vec<String>,
+	) -> Self {
+		Self {
+			id,
+			task: None,
+			query,
+			expected_evidence_ids: vec![evidence_id_for_doc(&expected_doc)],
+			allowed_alternate_docs: Vec::new(),
+			allowed_alternate_evidence_ids: Vec::new(),
+			expected_doc,
+			expected_terms,
+		}
+	}
 }
 
 #[derive(Debug)]
@@ -158,6 +184,9 @@ struct QuerySummary {
 	total: usize,
 	pass: usize,
 	fail: usize,
+	wrong_result_count: usize,
+	latency_ms_total: f64,
+	latency_ms_mean: f64,
 }
 
 #[derive(Debug, Serialize)]
@@ -179,13 +208,20 @@ struct CheckResult {
 #[derive(Debug, Serialize)]
 struct QueryResult {
 	id: String,
+	task: Option<String>,
 	query: String,
 	expected_doc: String,
+	allowed_alternate_docs: Vec<String>,
 	expected_terms: Vec<String>,
+	expected_evidence_ids: Vec<String>,
+	allowed_alternate_evidence_ids: Vec<String>,
 	matched: bool,
 	matched_terms: Vec<String>,
+	top_evidence_id: Option<String>,
+	matched_evidence_id: Option<String>,
 	top_note_key: Option<String>,
 	top_snippet: Option<String>,
+	latency_ms: f64,
 	returned_count: usize,
 }
 
@@ -499,6 +535,16 @@ fn outbox_done(counts: &BTreeMap<String, i64>, expected_note_count: usize) -> bo
 fn retrieval_check(query_results: &[QueryResult]) -> CheckResult {
 	let pass_count = query_results.iter().filter(|result| result.matched).count();
 	let fail_count = query_results.len().saturating_sub(pass_count);
+	let expected_evidence_ids = query_results
+		.iter()
+		.map(|result| {
+			serde_json::json!({
+				"query_id": result.id,
+				"expected": result.expected_evidence_ids,
+				"allowed_alternates": result.allowed_alternate_evidence_ids,
+			})
+		})
+		.collect::<Vec<_>>();
 
 	CheckResult {
 		name: "same_corpus_retrieval",
@@ -512,6 +558,8 @@ fn retrieval_check(query_results: &[QueryResult]) -> CheckResult {
 			"total": query_results.len(),
 			"pass": pass_count,
 			"fail": fail_count,
+			"wrong_result_count": fail_count,
+			"expected_evidence_ids": expected_evidence_ids,
 		}),
 	}
 }
@@ -579,12 +627,12 @@ fn concurrent_add_request(index: usize) -> AddNoteRequest {
 fn concurrent_query_case(index: usize) -> QueryCase {
 	let marker = concurrent_marker(index);
 
-	QueryCase {
-		id: format!("concurrent-{index:03}"),
-		query: format!("Find the concurrent benchmark note containing marker {marker}."),
-		expected_doc: format!("concurrent-{index:03}.md"),
-		expected_terms: vec![marker],
-	}
+	QueryCase::generated(
+		format!("concurrent-{index:03}"),
+		format!("Find the concurrent benchmark note containing marker {marker}."),
+		format!("concurrent-{index:03}.md"),
+		vec![marker],
+	)
 }
 
 fn concurrent_marker(index: usize) -> String {
@@ -648,12 +696,12 @@ fn soak_query_case(index: usize) -> QueryCase {
 	let marker = soak_marker(index);
 	let (topic, _) = soak_topic(index);
 
-	QueryCase {
-		id: format!("soak-{index:03}"),
-		query: format!("Find the soak benchmark note about {topic} containing marker {marker}."),
-		expected_doc: format!("soak-{index:03}.md"),
-		expected_terms: vec![marker],
-	}
+	QueryCase::generated(
+		format!("soak-{index:03}"),
+		format!("Find the soak benchmark note about {topic} containing marker {marker}."),
+		format!("soak-{index:03}.md"),
+		vec![marker],
+	)
 }
 
 fn soak_marker(index: usize) -> String {
@@ -808,6 +856,19 @@ fn key_for_doc(doc: &str) -> String {
 	if key.is_empty() { "doc".to_string() } else { key }
 }
 
+fn evidence_id_for_doc(doc: &str) -> String {
+	Path::new(doc).file_stem().and_then(|stem| stem.to_str()).unwrap_or(doc).to_string()
+}
+
+fn expected_docs_for_case(case: &QueryCase) -> Vec<String> {
+	let mut docs = Vec::with_capacity(case.allowed_alternate_docs.len().saturating_add(1));
+
+	docs.push(case.expected_doc.clone());
+	docs.extend(case.allowed_alternate_docs.iter().cloned());
+
+	docs
+}
+
 fn embed_text(text: &str, vector_dim: u32) -> Vec<f32> {
 	let dim = vector_dim as usize;
 	let mut vector = vec![0.0_f32; dim];
@@ -966,6 +1027,8 @@ async fn run(args: Args) -> color_eyre::Result<ElfBaselineReport> {
 	let query_results = run_queries(&service, query_manifest.queries).await?;
 	let pass_count = query_results.iter().filter(|result| result.matched).count();
 	let fail_count = query_results.len().saturating_sub(pass_count);
+	let latency_ms_total = query_results.iter().map(|result| result.latency_ms).sum::<f64>();
+	let latency_ms_mean = latency_ms_total / query_results.len().max(1) as f64;
 	let retrieval_status =
 		if fail_count == 0 { "retrieval_pass" } else { "retrieval_wrong_result" };
 	let mut checks = vec![retrieval_check(&query_results), worker_indexing_check(initial_worker)];
@@ -1004,7 +1067,14 @@ async fn run(args: Args) -> color_eyre::Result<ElfBaselineReport> {
 			rebuild_missing_vector_count: rebuild.missing_vector_count,
 			rebuild_error_count: rebuild.error_count,
 		},
-		summary: QuerySummary { total: query_results.len(), pass: pass_count, fail: fail_count },
+		summary: QuerySummary {
+			total: query_results.len(),
+			pass: pass_count,
+			fail: fail_count,
+			wrong_result_count: fail_count,
+			latency_ms_total,
+			latency_ms_mean,
+		},
 		check_summary,
 		checks,
 		queries: query_results,
@@ -1262,13 +1332,14 @@ async fn run_single_query(
 		.ok()
 		.and_then(|value| value.parse::<u32>().ok())
 		.unwrap_or(10);
+	let started_at = Instant::now();
 	let response = service
 		.search_raw(SearchRequest {
 			tenant_id: TENANT_ID.to_string(),
 			project_id: PROJECT_ID.to_string(),
 			agent_id: AGENT_ID.to_string(),
 			token_id: None,
-			payload_level: PayloadLevel::default(),
+			payload_level: PayloadLevel::L2,
 			read_profile: "private_only".to_string(),
 			query: case.query.clone(),
 			top_k: Some(top_k),
@@ -1278,6 +1349,7 @@ async fn run_single_query(
 			ranking: None,
 		})
 		.await?;
+	let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0;
 	let top = response.items.first();
 	let top_text = top.map(|item| item.snippet.clone()).unwrap_or_default();
 	let matched_terms = case
@@ -1287,19 +1359,41 @@ async fn run_single_query(
 		.cloned()
 		.collect::<Vec<_>>();
 	let top_key = top.and_then(|item| item.key.clone());
-	let expected_key = key_for_doc(&case.expected_doc);
-	let matched = matched_terms.len() == case.expected_terms.len()
-		|| top_key.as_deref().is_some_and(|key| key == expected_key);
+	let expected_docs = expected_docs_for_case(&case);
+	let matched_doc =
+		top_key.as_deref().and_then(|key| expected_docs.iter().find(|doc| key_for_doc(doc) == key));
+	let top_evidence_id = top.and_then(|item| {
+		item.source_ref.get("document").and_then(Value::as_str).map(evidence_id_for_doc)
+	});
+	let matched_evidence_id = matched_doc.map(|doc| evidence_id_for_doc(doc));
+	let matched = matched_terms.len() == case.expected_terms.len() || matched_doc.is_some();
+	let expected_evidence_ids = if case.expected_evidence_ids.is_empty() {
+		vec![evidence_id_for_doc(&case.expected_doc)]
+	} else {
+		case.expected_evidence_ids.clone()
+	};
+	let allowed_alternate_evidence_ids = if case.allowed_alternate_evidence_ids.is_empty() {
+		case.allowed_alternate_docs.iter().map(|doc| evidence_id_for_doc(doc)).collect()
+	} else {
+		case.allowed_alternate_evidence_ids.clone()
+	};
 
 	Ok(QueryResult {
 		id: case.id,
+		task: case.task,
 		query: case.query,
 		expected_doc: case.expected_doc,
+		allowed_alternate_docs: case.allowed_alternate_docs,
 		expected_terms: case.expected_terms,
+		expected_evidence_ids,
+		allowed_alternate_evidence_ids,
 		matched,
 		matched_terms,
+		top_evidence_id,
+		matched_evidence_id,
 		top_note_key: top_key,
 		top_snippet: top.map(|item| item.snippet.clone()),
+		latency_ms,
 		returned_count: response.items.len(),
 	})
 }
@@ -1375,12 +1469,12 @@ async fn run_update_replacement_check(
 		run_worker_until_indexed(runtime, service, &[update_note_id], "lifecycle_update").await?;
 	let update_query = run_single_query(
 		service,
-		QueryCase {
-			id: "lifecycle-update-new-marker".to_string(),
-			query: "Which rotated JWT key id does the auth middleware require?".to_string(),
-			expected_doc: update_note.source_doc.clone(),
-			expected_terms: vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()],
-		},
+		QueryCase::generated(
+			"lifecycle-update-new-marker".to_string(),
+			"Which rotated JWT key id does the auth middleware require?".to_string(),
+			update_note.source_doc.clone(),
+			vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()],
+		),
 	)
 	.await?;
 	let old_marker_absent = update_query
@@ -1427,12 +1521,12 @@ async fn run_delete_suppression_check(
 		run_worker_until_indexed(runtime, service, &[delete_note_id], "lifecycle_delete").await?;
 	let delete_query = run_single_query(
 		service,
-		QueryCase {
-			id: "lifecycle-delete-suppresses-note".to_string(),
-			query: delete_note.text.clone(),
-			expected_doc: delete_note.source_doc.clone(),
-			expected_terms: distinctive_terms(&delete_note.text, 2),
-		},
+		QueryCase::generated(
+			"lifecycle-delete-suppresses-note".to_string(),
+			delete_note.text.clone(),
+			delete_note.source_doc.clone(),
+			distinctive_terms(&delete_note.text, 2),
+		),
 	)
 	.await?;
 	let delete_pass = !delete_query.matched
@@ -1464,12 +1558,12 @@ async fn run_cold_start_recovery_check(
 	let recovery_service = build_service(runtime).await?;
 	let recovery_query = run_single_query(
 		&recovery_service,
-		QueryCase {
-			id: "lifecycle-cold-start-recovery".to_string(),
-			query: recovery_note.text.clone(),
-			expected_doc: recovery_note.source_doc.clone(),
-			expected_terms: distinctive_terms(&recovery_note.text, 2),
-		},
+		QueryCase::generated(
+			"lifecycle-cold-start-recovery".to_string(),
+			recovery_note.text.clone(),
+			recovery_note.source_doc.clone(),
+			distinctive_terms(&recovery_note.text, 2),
+		),
 	)
 	.await?;
 	let outbox_counts = pending_outbox_counts(service).await?;
diff --git a/docs/guide/benchmarking/2026-06-09-production-corpus-report.md b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md
new file mode 100644
index 00000000..8d1505c8
--- /dev/null
+++ b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md
@@ -0,0 +1,55 @@
+# Live Baseline Benchmark Report
+
+Goal: Publish a Markdown summary for one generated live baseline aggregate report.
+Read this when: You need a durable, reviewable summary of a live baseline JSON report.
+Inputs: `tmp/live-baseline/live-baseline-report.json`.
+Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`.
+Verification: Compare this Markdown summary with the source JSON before committing.
+
+## Summary
+
+- Run ID: `live-baseline-20260609045306`
+- Generated at: `2026-06-09T04:53:18Z`
+- Verdict: `pass`
+- Project filter: `ELF`
+- Corpus profile: `production-synthetic`
+- Corpus track: `synthetic_production`
+- Corpus manifest: `synthetic-coding-agent-prod-corpus-2026-06-09`
+- Documents: `8`
+- Queries: `6`
+- Wrong-result count: `0`
+- Query latency mean: `7.137632833333334 ms`
+- Project summary: `1 pass`, `0 fail`, `0 incomplete`
+- Same-corpus summary: `1 pass`, `0 fail`, `0 incomplete`
+- Full check summary: `7/7 pass`
+
+## Projects
+
+| Project | Status | Retrieval | Checks | Elapsed | Reason |
+| --- | --- | --- | --- | --- | --- |
+| ELF | `pass` | `retrieval_pass` | `7/7` | `12s` | ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query |
+
+## Embedding
+
+| Project | Mode | Provider | Model | Dimensions | Timeout | API Base | Path |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| ELF | `local` | `local` | `local-hash` | `256` | `1000ms` | `http://127.0.0.1` | `/embeddings` |
+
+## Query Evidence
+
+| Project | Query | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| ELF | `q-resume-lane` | `resume_lane` | `issue-xy812-resume` | `` | `issue-xy812-resume` | `true` | `9.213627 ms` |
+| ELF | `q-recover-exact-command` | `recover_exact_command` | `worktree-xy791-repair` | `runbook-live-baseline` | `worktree-xy791-repair` | `true` | `6.424872 ms` |
+| ELF | `q-explain-stale-blocker` | `explain_stale_blocker` | `blocker-stale-qwen-key` | `` | `blocker-stale-qwen-key` | `true` | `7.749393 ms` |
+| ELF | `q-find-prior-decision` | `find_prior_decision` | `decision-qdrant-derived` | `` | `decision-qdrant-derived` | `true` | `6.66385 ms` |
+| ELF | `q-compare-project-status` | `compare_project_status` | `pr-110-review` | `recovery-xy640-ledger` | `recovery-xy640-ledger` | `true` | `6.344976 ms` |
+| ELF | `q-detect-contradiction-update` | `detect_contradiction_update` | `decision-xy818-supersedes` | `` | `decision-xy818-supersedes` | `true` | `6.429079 ms` |
+
+## Result Semantics
+
+- `pass`: every encoded check for the selected project and profile passed.
+- `fail`: clone, install, import, build, retrieval, lifecycle, recovery, concurrency, soak, resource-envelope, or another declared check failed.
+- `incomplete`: the encoded check could not complete without extra provider keys, host integration, native dependency support, durable runtime wiring, or more adapter work.
+
+`incomplete` is not a pass; treat it as benchmark wiring debt.
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md
index 4493e306..3fcd0143 100644
--- a/docs/guide/benchmarking/index.md
+++ b/docs/guide/benchmarking/index.md
@@ -20,9 +20,12 @@ Outputs: The smallest benchmarking guide or report needed to continue.
 ## Guides And Reports
 
 - `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live
-  Docker-only benchmark matrix.
+  Docker-only benchmark matrix, including generated public and production-corpus
+  profiles.
 - `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9,
   2026 ELF production-provider stress run and all-project smoke comparison.
+- `2026-06-09-production-corpus-report.md`: checked-in synthetic production-corpus
+  ELF adoption benchmark report with task queries and evidence IDs.
 
 ## Update Rules
 
diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md
index b61b1e2b..c229eff6 100644
--- a/docs/guide/benchmarking/live_baseline_benchmark.md
+++ b/docs/guide/benchmarking/live_baseline_benchmark.md
@@ -3,7 +3,9 @@
 Goal: Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF.
 Read this when: You need evidence about which external projects actually run against a shared benchmark corpus.
 Preconditions: Docker and Docker Compose are available on the host.
-Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and `docs/spec/system_competitive_parity_gate_v1.md`.
+Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`,
+`docs/spec/system_competitive_parity_gate_v1.md`, and
+`docs/spec/production_corpus_manifest_v1.md`.
 Verification: `cargo make baseline-live-docker` writes `tmp/live-baseline/live-baseline-report.json`; `cargo make baseline-live-report` can render that JSON into a checked-in Markdown report.
 
 ## Scope
@@ -40,9 +42,20 @@ Corpus profiles:
   that make the check closer to a production retrieval benchmark.
 - `stress`: 480 documents by default, 16 query cases, and alternate phrasings for
   every needle query.
+- `production-synthetic`: checked-in synthetic coding-agent production corpus with
+  issues, PRs, worktrees, runbooks, decisions, blockers, recovery notes, and
+  task-oriented queries. Fixture:
+  `apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json`.
+- `production-private`: local private/sanitized production corpus manifest supplied by
+  `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST`.
 
 Use `ELF_BASELINE_SCALE_DOCS` and `ELF_BASELINE_STRESS_DOCS` to raise or lower the
 generated corpus sizes.
+Use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` to supply a local manifest that follows
+`docs/spec/production_corpus_manifest_v1.md`. The private profile fails closed when the
+manifest path is absent, the file is missing, a referenced `local_path` is missing, or a
+query references an unknown evidence ID. It does not fall back to the checked-in
+synthetic fixture.
 Use `ELF_BASELINE_CONCURRENT_NOTES`, `ELF_BASELINE_MAX_ELF_SECONDS`, and
 `ELF_BASELINE_MAX_ELF_RSS_KB` to tune ELF's concurrent-write and resource-envelope
 checks.
@@ -138,6 +151,23 @@ ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker
 ELF_BASELINE_PROJECTS=ELF,memsearch cargo make baseline-live-docker
 ```
 
+To run the checked-in synthetic production-style corpus through ELF:
+
+```sh
+cargo make baseline-production-synthetic
+```
+
+To run a private local production corpus without committing private content:
+
+```sh
+ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json \
+cargo make baseline-production-private
+```
+
+The private manifest can contain sanitized inline `text` fields or `local_path` fields
+that point to local sanitized text/Markdown files. Keep private manifests and local
+evidence under `tmp/` or outside the repository. `tmp/` is ignored by git.
+
 The only host artifact is:
 
 ```text
@@ -146,12 +176,21 @@ tmp/live-baseline/
 
 That directory contains the aggregate report, per-project logs, and the shared query
 fixture used by the run. The aggregate report records `corpus.profile`,
-`corpus.document_count`, and `corpus.query_count` so smoke, scale, and stress runs are
-not confused. Each project record includes `elapsed_seconds` for rough local runtime
-comparison. ELF project records also include an `embedding` summary so deterministic
-local and production-provider runs are not confused. Each project record also includes
-`checks` and `check_summary`; the aggregate `full_check_summary` is the
-adoption-relevant multi-check count.
+`corpus.track`, `corpus.manifest_id`, `corpus.document_count`, and
+`corpus.query_count` so generated public corpus results are not confused with
+synthetic or private production-corpus results. Each project record includes
+`elapsed_seconds` for rough local runtime comparison. ELF project records also include
+an `embedding` summary so deterministic local and production-provider runs are not
+confused. ELF query records include task, expected evidence IDs, allowed alternate
+evidence IDs, top evidence ID, wrong-result count, and per-query latency. Each project
+record also includes `checks` and `check_summary`; the aggregate `full_check_summary`
+is the adoption-relevant multi-check count.
+
+Production-ready claims must cite a concrete report path. A claim based only on
+generated public `smoke`, `scale`, or `stress` profiles is not enough for personal
+production adoption. Cite a `production-synthetic` report for fixture coverage, and
+cite a `production-private` report when making a private-corpus production-readiness
+claim.
 
 ## Publish A Markdown Report
 
diff --git a/docs/spec/index.md b/docs/spec/index.md
index e7c8f30c..7cec41ce 100644
--- a/docs/spec/index.md
+++ b/docs/spec/index.md
@@ -37,6 +37,8 @@ Question this index answers: "what must remain true?"
   proposal contract over immutable source evidence.
 - `system_competitive_parity_gate_v1.md`: Docker-only adoption gate that decides
   whether ELF meets or exceeds selected external memory-system baselines.
+- `production_corpus_manifest_v1.md`: Sanitized/private coding-agent production
+  corpus manifest schema for adoption benchmark runs.
 
 ## Spec document contract
 
diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md
new file mode 100644
index 00000000..4d582958
--- /dev/null
+++ b/docs/spec/production_corpus_manifest_v1.md
@@ -0,0 +1,102 @@
+# Production Corpus Manifest v1
+
+Purpose: Define the sanitized/private coding-agent production corpus manifest used by
+ELF adoption benchmarks.
+Status: normative
+Read this when: You are creating, validating, or running a production-style personal
+agent memory benchmark corpus.
+Not this document: Docker benchmark run commands, report publication steps, or private
+fixture storage procedures.
+Defines: `elf.production_corpus_manifest/v1` fields, required evidence categories,
+query tasks, evidence expectations, and private-content safety rules.
+
+## Contract
+
+A production corpus manifest is a JSON object with:
+
+- `schema`: exactly `elf.production_corpus_manifest/v1`.
+- `manifest_id`: stable, non-sensitive identifier for the corpus snapshot; it is copied into benchmark reports.
+- `description`: optional English summary.
+- `evidence`: non-empty array of production-style memory evidence items.
+- `queries`: non-empty array of task-oriented retrieval checks.
+
+The checked-in benchmark fixture must be synthetic and sanitized. Real private
+production content must not be committed.
+
+## Evidence Items
+
+Each `evidence[]` item must include:
+
+- `evidence_id`: lower-case ASCII identifier safe for filenames. Allowed shape:
+  `[a-z0-9][a-z0-9_.-]{1,80}` (2 to 81 characters in total).
+- `category`: one of `issue`, `pr`, `worktree`, `runbook`, `decision`, `blocker`,
+  or `recovery_note`.
+- `title`: short English title.
+- Exactly one of:
+  - `text`: sanitized inline English evidence text.
+  - `local_path`: path to a local sanitized text/Markdown file, resolved relative to
+    the manifest when not absolute.
+
+Evidence text must not contain secrets, tokens, private keys, personal credentials, or
+unsanitized private conversation content.
+
+## Query Cases
+
+Each `queries[]` item must include:
+
+- `query_id`: stable query identifier.
+- `task`: one of `resume_lane`, `recover_exact_command`, `explain_stale_blocker`,
+  `find_prior_decision`, `compare_project_status`, or
+  `detect_contradiction_update`.
+- `query`: English task-oriented search query.
+- `expected_evidence_ids`: non-empty array of evidence IDs that satisfy the query.
+- `allowed_alternate_evidence_ids`: array of acceptable alternate evidence IDs. Use
+  an empty array when no alternate is allowed.
+- `expected_terms`: non-empty array of terms that should appear in the matched
+  evidence snippet when the expected note key is not the top result.
+
+Every query must record both expected evidence IDs and allowed alternates, even when
+the allowed alternate list is empty.
+
+## Benchmark Mapping
+
+The Docker benchmark materializes each evidence item as a temporary Markdown document
+inside the benchmark work directory. The source document filename is
+`<evidence_id>.md`. Reports must expose evidence IDs and allowed alternates, not local
+private file paths.
+
+For `production-private` runs, the runner must fail closed when the manifest is absent,
+the manifest references a missing `local_path`, or any query references an unknown
+evidence ID. It must not silently fall back to the checked-in synthetic corpus.
+
+## Minimal Example
+
+```json
+{
+  "schema": "elf.production_corpus_manifest/v1",
+  "manifest_id": "local-private-prod-corpus-2026-06-09",
+  "evidence": [
+    {
+      "evidence_id": "issue-xy123-resume",
+      "category": "issue",
+      "title": "XY-123 Resume State",
+      "text": "XY-123 resumes on branch y/example with command `cargo make checks`."
+    }
+  ],
+  "queries": [
+    {
+      "query_id": "q-resume-xy123",
+      "task": "resume_lane",
+      "query": "How do I resume XY-123?",
+      "expected_evidence_ids": ["issue-xy123-resume"],
+      "allowed_alternate_evidence_ids": [],
+      "expected_terms": ["XY-123", "cargo make checks"]
+    }
+  ]
+}
+```
+
+## Related Guides
+
+- `docs/guide/benchmarking/live_baseline_benchmark.md`: run commands, private fixture
+  placement, and report publication.
diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh
index fbb56b05..1b5a6e0a 100755
--- a/scripts/live-baseline-benchmark.sh
+++ b/scripts/live-baseline-benchmark.sh
@@ -16,6 +16,10 @@ SCALE_DOC_COUNT="${ELF_BASELINE_SCALE_DOCS:-120}"
 STRESS_DOC_COUNT="${ELF_BASELINE_STRESS_DOCS:-480}"
 QUERY_TOP_K="${ELF_BASELINE_TOP_K:-10}"
 CURRENT_PROJECT_STARTED_AT=""
+PRODUCTION_SYNTHETIC_MANIFEST="${ROOT_DIR}/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json"
+CORPUS_TRACK="generated_public"
+CORPUS_PATH_DESCRIPTION="generated in Docker under /bench/corpus"
+CORPUS_MANIFEST_ID=""
 
 if [[ ! -f "/.dockerenv" && "${ELF_BASELINE_ALLOW_HOST:-0}" != "1" ]]; then
   echo "Refusing to run live baseline benchmark outside Docker. Use cargo make baseline-live-docker." >&2
@@ -157,21 +161,28 @@ query_docs = anchors[: (3 if profile == "smoke" else len(anchors))]
 queries = []
 for doc in query_docs:
     base_id = doc["name"].replace("-memory.md", "").replace(".md", "")
+    evidence_id = doc["name"].replace(".md", "")
     queries.append(
         {
             "id": f"q-{base_id}",
+            "task": "same_corpus_retrieval",
             "query": doc["query"],
             "expected_doc": doc["name"],
             "expected_terms": doc["terms"],
+            "expected_evidence_ids": [evidence_id],
+            "allowed_alternate_evidence_ids": [],
         }
     )
     if profile == "stress":
         queries.append(
             {
                 "id": f"q-{base_id}-alt",
+                "task": "same_corpus_retrieval",
                 "query": doc["alternate_query"],
                 "expected_doc": doc["name"],
                 "expected_terms": doc["terms"],
+                "expected_evidence_ids": [evidence_id],
+                "allowed_alternate_evidence_ids": [],
             }
         )
 
@@ -191,13 +202,264 @@ queries_path.write_text(
 PY
 }
 
+prepare_production_corpus() {
+  local manifest_path="${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-}"
+  local corpus_summary="${REPORT_DIR}/production-corpus-summary.json"
+
+  case "${CORPUS_PROFILE}" in
+    production-synthetic)
+      manifest_path="${manifest_path:-${PRODUCTION_SYNTHETIC_MANIFEST}}"
+      ;;
+    production-private)
+      if [[ -z "${manifest_path}" ]]; then
+        echo "ELF_BASELINE_PROFILE=production-private requires ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST." >&2
+        exit 1
+      fi
+      ;;
+    *)
+      echo "Unsupported production corpus profile: ${CORPUS_PROFILE}" >&2
+      exit 1
+      ;;
+  esac
+
+  if [[ ! -f "${manifest_path}" ]]; then
+    echo "Missing production corpus manifest: ${manifest_path}" >&2
+    exit 1
+  fi
+
+  python3 - "${CORPUS_PROFILE}" "${manifest_path}" "${CORPUS_DIR}" "${REPORT_DIR}/queries.json" "${corpus_summary}" <<'PY'
+import json
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+
+profile, manifest_path_raw, corpus_dir_raw, queries_path_raw, summary_path_raw = sys.argv[1:]
+manifest_path = Path(manifest_path_raw)
+corpus_dir = Path(corpus_dir_raw)
+queries_path = Path(queries_path_raw)
+summary_path = Path(summary_path_raw)
+corpus_track = "synthetic_production" if profile == "production-synthetic" else "private_production"
+allowed_categories = {
+    "issue",
+    "pr",
+    "worktree",
+    "runbook",
+    "decision",
+    "blocker",
+    "recovery_note",
+}
+allowed_tasks = {
+    "resume_lane",
+    "recover_exact_command",
+    "explain_stale_blocker",
+    "find_prior_decision",
+    "compare_project_status",
+    "detect_contradiction_update",
+}
+id_re = re.compile(r"[a-z0-9][a-z0-9_.-]{1,80}")
+
+
+def fail(message):
+    raise SystemExit(f"Invalid production corpus manifest: {message}")
+
+
+def require_string(obj, field, context):
+    value = obj.get(field)
+    if not isinstance(value, str) or not value.strip():
+        fail(f"{context}.{field} must be a non-empty string")
+    return value.strip()
+
+
+def require_string_list(obj, field, context):
+    value = obj.get(field)
+    if not isinstance(value, list) or not value:
+        fail(f"{context}.{field} must be a non-empty string array")
+    out = []
+    for index, item in enumerate(value):
+        if not isinstance(item, str) or not item.strip():
+            fail(f"{context}.{field}[{index}] must be a non-empty string")
+        out.append(item.strip())
+    return out
+
+
+def load_text(item, context):
+    has_text = isinstance(item.get("text"), str)
+    has_path = isinstance(item.get("local_path"), str)
+    if has_text == has_path:
+        fail(f"{context} must set exactly one of text or local_path")
+    if has_text:
+        text = item["text"].strip()
+    else:
+        local_path = Path(item["local_path"])
+        if not local_path.is_absolute():
+            local_path = manifest_path.parent / local_path
+        if not local_path.is_file():
+            fail(f"{context}.local_path does not point to a readable file")
+        text = local_path.read_text(encoding="utf-8").strip()
+    if not text:
+        fail(f"{context} text must not be empty")
+    if "\x00" in text:
+        fail(f"{context} text contains a NUL byte")
+    return text
+
+
+manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+if manifest.get("schema") != "elf.production_corpus_manifest/v1":
+    fail("schema must be elf.production_corpus_manifest/v1")
+
+manifest_id = require_string(manifest, "manifest_id", "$")
+evidence_items = manifest.get("evidence")
+if not isinstance(evidence_items, list) or not evidence_items:
+    fail("$.evidence must be a non-empty array")
+query_items = manifest.get("queries")
+if not isinstance(query_items, list) or not query_items:
+    fail("$.queries must be a non-empty array")
+
+for existing in corpus_dir.glob("*.md"):
+    existing.unlink()
+
+evidence_by_id = {}
+category_counts = Counter()
+for index, item in enumerate(evidence_items):
+    context = f"$.evidence[{index}]"
+    if not isinstance(item, dict):
+        fail(f"{context} must be an object")
+    evidence_id = require_string(item, "evidence_id", context)
+    if not id_re.fullmatch(evidence_id):
+        fail(f"{context}.evidence_id must be lower-case ASCII and safe for filenames")
+    if evidence_id in evidence_by_id:
+        fail(f"{context}.evidence_id duplicates an earlier item")
+    category = require_string(item, "category", context)
+    if category not in allowed_categories:
+        fail(f"{context}.category must be one of {sorted(allowed_categories)}")
+    title = require_string(item, "title", context)
+    text = load_text(item, context)
+    evidence_by_id[evidence_id] = {
+        "category": category,
+        "title": title,
+        "text": text,
+    }
+    category_counts[category] += 1
+    (corpus_dir / f"{evidence_id}.md").write_text(
+        "\n".join(
+            [
+                f"# {title}",
+                "",
+                text,
+                "",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+queries = []
+task_counts = Counter()
+for index, item in enumerate(query_items):
+    context = f"$.queries[{index}]"
+    if not isinstance(item, dict):
+        fail(f"{context} must be an object")
+    query_id = require_string(item, "query_id", context)
+    task = require_string(item, "task", context)
+    if task not in allowed_tasks:
+        fail(f"{context}.task must be one of {sorted(allowed_tasks)}")
+    query = require_string(item, "query", context)
+    expected_ids = require_string_list(item, "expected_evidence_ids", context)
+    # Missing or null means "no alternates"; malformed entries fail closed below.
+    allowed_alternate_ids = item.get("allowed_alternate_evidence_ids")
+    if allowed_alternate_ids is None:
+        allowed_alternate_ids = []
+    if not isinstance(allowed_alternate_ids, list):
+        fail(f"{context}.allowed_alternate_evidence_ids must be an array")
+    for alt_index, alternate_id in enumerate(allowed_alternate_ids):
+        if not isinstance(alternate_id, str) or not alternate_id.strip():
+            fail(f"{context}.allowed_alternate_evidence_ids[{alt_index}] must be a non-empty string")
+    allowed_alternate_ids = [alternate_id.strip() for alternate_id in allowed_alternate_ids]
+    expected_terms = require_string_list(item, "expected_terms", context)
+    for evidence_id in [*expected_ids, *allowed_alternate_ids]:
+        if evidence_id not in evidence_by_id:
+            fail(f"{context} references unknown evidence_id {evidence_id!r}")
+    queries.append(
+        {
+            "id": query_id,
+            "task": task,
+            "query": query,
+            "expected_doc": f"{expected_ids[0]}.md",
+            "allowed_alternate_docs": [
+                f"{evidence_id}.md" for evidence_id in [*expected_ids[1:], *allowed_alternate_ids]
+            ],
+            "expected_terms": expected_terms,
+            "expected_evidence_ids": expected_ids,
+            "allowed_alternate_evidence_ids": allowed_alternate_ids,
+        }
+    )
+    task_counts[task] += 1
+
+queries_path.write_text(
+    json.dumps(
+        {
+            "schema": "elf.live_baseline.queries/v1",
+            "profile": profile,
+            "corpus_track": corpus_track,
+            "manifest_schema": manifest["schema"],
+            "manifest_id": manifest_id,
+            "document_count": len(evidence_by_id),
+            "queries": queries,
+        },
+        indent=2,
+    )
+    + "\n",
+    encoding="utf-8",
+)
+
+summary_path.write_text(
+    json.dumps(
+        {
+            "schema": "elf.production_corpus_summary/v1",
+            "corpus_track": corpus_track,
+            "manifest_schema": manifest["schema"],
+            "manifest_id": manifest_id,
+            "document_count": len(evidence_by_id),
+            "query_count": len(queries),
+            "category_counts": dict(sorted(category_counts.items())),
+            "task_counts": dict(sorted(task_counts.items())),
+            "evidence_ids": sorted(evidence_by_id),
+            "query_evidence": [
+                {
+                    "query_id": query["id"],
+                    "task": query["task"],
+                    "expected_evidence_ids": query["expected_evidence_ids"],
+                    "allowed_alternate_evidence_ids": query["allowed_alternate_evidence_ids"],
+                }
+                for query in queries
+            ],
+        },
+        indent=2,
+    )
+    + "\n",
+    encoding="utf-8",
+)
+PY
+
+  CORPUS_TRACK="$(jq -r '.corpus_track' "${corpus_summary}")"
+  CORPUS_MANIFEST_ID="$(jq -r '.manifest_id' "${corpus_summary}")"
+  CORPUS_PATH_DESCRIPTION="production corpus materialized in Docker under /bench/corpus"
+}
+
 rm -rf "${WORK_DIR}"
 mkdir -p "${REPORT_DIR}"
 find "${REPORT_DIR}" -maxdepth 1 -type f -delete
 mkdir -p "${REPOS_DIR}" "${CORPUS_DIR}" "${HOME_DIR}"
 : >"${RECORDS}"
 
-generate_corpus
+case "${CORPUS_PROFILE}" in
+  production-synthetic | production-private)
+    prepare_production_corpus
+    ;;
+  *)
+    generate_corpus
+    ;;
+esac
 DOCUMENT_COUNT="$(find "${CORPUS_DIR}" -maxdepth 1 -type f -name '*.md' | wc -l | tr -d ' ')"
 QUERY_COUNT="$(jq '.queries | length' "${REPORT_DIR}/queries.json")"
 
@@ -243,6 +505,8 @@ json_record() {
         command_summary: $command_summary,
         elapsed_seconds: $elapsed_seconds,
         embedding: ($checks[0].embedding // null),
+        query_summary: ($checks[0].query_summary // null),
+        queries: ($checks[0].queries // null),
         check_summary: $checks[0].check_summary,
         checks: $checks[0].checks
       }' >>"${RECORDS}"
@@ -267,6 +531,8 @@ json_record() {
         log_path: $log_path,
         command_summary: $command_summary,
         elapsed_seconds: $elapsed_seconds,
+        query_summary: null,
+        queries: null,
         check_summary: {
           total: 1,
           pass: (if $retrieval_status == "retrieval_pass" then 1 else 0 end),
@@ -333,6 +599,9 @@ finish_report() {
     --arg run_id "${RUN_ID}" \
     --arg project_filter "${PROJECT_FILTER}" \
     --arg corpus_profile "${CORPUS_PROFILE}" \
+    --arg corpus_track "${CORPUS_TRACK}" \
+    --arg corpus_path "${CORPUS_PATH_DESCRIPTION}" \
+    --arg corpus_manifest_id "${CORPUS_MANIFEST_ID}" \
     --argjson document_count "${DOCUMENT_COUNT}" \
     --argjson query_count "${QUERY_COUNT}" \
     --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
@@ -344,9 +613,11 @@ finish_report() {
       project_filter: $project_filter,
       corpus: {
         profile: $corpus_profile,
+        track: $corpus_track,
+        manifest_id: (if $corpus_manifest_id == "" then null else $corpus_manifest_id end),
         document_count: $document_count,
         query_count: $query_count,
-        path: "generated in Docker under /bench/corpus",
+        path: $corpus_path,
         query_file: "tmp/live-baseline/queries.json"
       },
       verdict: (
@@ -374,6 +645,14 @@ finish_report() {
         fail: ([.[] | .check_summary.fail // 0] | add // 0),
         incomplete: ([.[] | .check_summary.incomplete // 0] | add // 0)
       },
+      wrong_result_count: ([.[] | .query_summary.wrong_result_count // .query_summary.fail // 0] | add // 0),
+      latency_ms: {
+        total: ([.[] | .query_summary.latency_ms_total // 0] | add // 0),
+        mean: (
+          [.[] | select(.query_summary != null) | .query_summary.latency_ms_mean // 0] as $means
+          | if ($means | length) == 0 then 0 else (($means | add) / ($means | length)) end
+        )
+      },
       projects: .
     }' "${RECORDS}" >"${REPORT}"
 }
@@ -419,7 +698,7 @@ project_elf() {
   if run_cmd "${project}: same-corpus retrieval" 1200 "${log_path}" \
     "cd '${ROOT_DIR}' && cargo run -p elf-eval --bin live_baseline_elf -- --config config/local/elf.docker.toml --corpus '${CORPUS_DIR}' --queries '${REPORT_DIR}/queries.json' --out '${result_path}'"; then
     if [[ -s "${result_path}" ]] && jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then
-      jq '{embedding, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json"
+      jq '{embedding, query_summary: .summary, queries, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json"
     fi
     if [[ -s "${result_path}" ]] && jq -e --argjson document_count "${DOCUMENT_COUNT}" --argjson query_count "${QUERY_COUNT}" '
       .schema == "elf.live_baseline.elf_result/v1" and
diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh
index 651f29b4..bdb54ed8 100755
--- a/scripts/live-baseline-report-to-md.sh
+++ b/scripts/live-baseline-report-to-md.sh
@@ -4,6 +4,10 @@ set -euo pipefail
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 REPORT="${1:-${ELF_BASELINE_REPORT:-${ROOT_DIR}/tmp/live-baseline/live-baseline-report.json}}"
 OUT="${2:-${ELF_BASELINE_MARKDOWN_REPORT:-}}"
+REPORT_DISPLAY="${REPORT}"
+if [[ "${REPORT_DISPLAY}" == "${ROOT_DIR}/"* ]]; then
+  REPORT_DISPLAY="${REPORT_DISPLAY#"${ROOT_DIR}/"}"
+fi
 
 if ! command -v jq >/dev/null 2>&1; then
   echo "Missing jq; cannot render live baseline Markdown report." >&2
@@ -16,7 +20,7 @@ if [[ ! -f "${REPORT}" ]]; then
 fi
 
 render_report() {
-  jq -r --arg report_path "${REPORT}" '
+  jq -r --arg report_path "${REPORT_DISPLAY}" '
     def dash:
       if . == null then "-" else tostring end;
     def md:
@@ -39,8 +43,16 @@ render_report() {
     ("- Verdict: `" + (.verdict | md) + "`"),
     ("- Project filter: `" + (.project_filter | md) + "`"),
     ("- Corpus profile: `" + (.corpus.profile | md) + "`"),
+    ("- Corpus track: `" + ((.corpus.track // "generated_public") | md) + "`"),
+    (
+      if (.corpus.manifest_id // null) == null then empty
+      else "- Corpus manifest: `" + (.corpus.manifest_id | md) + "`"
+      end
+    ),
     ("- Documents: `" + (.corpus.document_count | tostring) + "`"),
     ("- Queries: `" + (.corpus.query_count | tostring) + "`"),
+    ("- Wrong-result count: `" + ((.wrong_result_count // 0) | tostring) + "`"),
+    ("- Query latency mean: `" + ((.latency_ms.mean // 0) | tostring) + " ms`"),
     ("- Project summary: `" + (.summary.pass | tostring) + " pass`, `" + (.summary.fail | tostring) + " fail`, `" + (.summary.incomplete | tostring) + " incomplete`"),
     ("- Same-corpus summary: `" + (.same_corpus_summary.pass | tostring) + " pass`, `" + (.same_corpus_summary.fail | tostring) + " fail`, `" + (.same_corpus_summary.incomplete | tostring) + " incomplete`"),
     ("- Full check summary: `" + (.full_check_summary.pass | tostring) + "/" + (.full_check_summary.total | tostring) + " pass`"),
@@ -80,6 +92,29 @@ render_report() {
           ""
         else empty end
     ),
+    (
+      # Per-query evidence rows; an empty ID list renders as "-" so a cell is never an empty code span.
+      [.projects[] | {project, queries: (.queries // [])} | select((.queries | length) > 0)] as $query_projects
+      | if ($query_projects | length) > 0 then
+          "## Query Evidence", "",
+          "| Project | Query | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency |",
+          "| --- | --- | --- | --- | --- | --- | --- | --- |",
+          (
+            $query_projects[]
+            | .project as $project
+            | .queries[]
+            | "| " + ($project | md)
+              + " | `" + (.id | md) + "`"
+              + " | `" + ((.task // "-") | md) + "`"
+              + " | `" + (((.expected_evidence_ids // []) | join(", ") | if . == "" then "-" else . end) | md) + "`"
+              + " | `" + (((.allowed_alternate_evidence_ids // []) | join(", ") | if . == "" then "-" else . end) | md) + "`"
+              + " | `" + ((.top_evidence_id // "-") | md) + "`"
+              + " | `" + (.matched | tostring) + "`"
+              + " | `" + ((.latency_ms // 0) | tostring) + " ms` |"
+          ),
+          ""
+        else empty end
+    ),
     "## Result Semantics",
     "",
     "- `pass`: every encoded check for the selected project and profile passed.",