From 15134a4c18856b8923524bd7cf13e7ace0966f4c Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 9 Jun 2026 12:54:07 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add private production corpus benchmark","authority":"XY-818"} --- Makefile.toml | 18 ++ README.md | 2 + .../synthetic_coding_agent_manifest.json | 105 +++++++ apps/elf-eval/src/bin/live_baseline_elf.rs | 164 +++++++--- .../2026-06-09-production-corpus-report.md | 55 ++++ docs/guide/benchmarking/index.md | 5 +- .../benchmarking/live_baseline_benchmark.md | 53 +++- docs/spec/index.md | 2 + docs/spec/production_corpus_manifest_v1.md | 102 +++++++ scripts/live-baseline-benchmark.sh | 285 +++++++++++++++++- scripts/live-baseline-report-to-md.sh | 37 ++- 11 files changed, 781 insertions(+), 47 deletions(-) create mode 100644 apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json create mode 100644 docs/guide/benchmarking/2026-06-09-production-corpus-report.md create mode 100644 docs/spec/production_corpus_manifest_v1.md diff --git a/Makefile.toml b/Makefile.toml index 3cf5f17c..ab3c4762 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -299,6 +299,8 @@ args = [ # | baseline-live-docker | command | | # | baseline-live-report | command | | # | baseline-live-docker-clean | command | | +# | baseline-production-synthetic | command | | +# | baseline-production-private | command | | [tasks.baseline-live-docker] workspace = false @@ -327,6 +329,22 @@ args = [ "--remove-orphans", ] +[tasks.baseline-production-synthetic] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-synthetic; 
docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + +[tasks.baseline-production-private] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + # Meta # | task | type | cwd | diff --git a/README.md b/README.md index 11b5fe2d..9b183598 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ embeddings. Detailed evidence and interpretation: - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) Quick comparison snapshot (objective/high-level). 
@@ -177,6 +178,7 @@ Project signature strengths (what each does especially well): Detailed comparison, mechanism-level analysis, and source map: - [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) - [Detailed External Comparison](docs/guide/research/comparison_external_projects.md) diff --git a/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json new file mode 100644 index 00000000..d627b627 --- /dev/null +++ b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json @@ -0,0 +1,105 @@ +{ + "schema": "elf.production_corpus_manifest/v1", + "manifest_id": "synthetic-coding-agent-prod-corpus-2026-06-09", + "description": "Synthetic, sanitized production-style coding-agent memory corpus for ELF adoption benchmarking.", + "evidence": [ + { + "evidence_id": "issue-xy812-resume", + "category": "issue", + "title": "XY-812 Resume Lane", + "text": "XY-812 resume lane uses branch y/elf-xy-812. The next command is `cargo make trace-gate`; the stale blocker cleared after PR #108 merged." + }, + { + "evidence_id": "pr-110-review", + "category": "pr", + "title": "PR 110 Review Status", + "text": "PR #110 is review-ready for the ELF viewer lane. It passed `cargo make checks` and waits for the non-draft review handoff." + }, + { + "evidence_id": "worktree-xy791-repair", + "category": "worktree", + "title": "XY-791 Strict Config Repair", + "text": "Worktree XY-791 recovered strict-config repair after rebase. The exact gate was `cargo make fmt && cargo make lint-fix && cargo make checks`." 
+ }, + { + "evidence_id": "runbook-live-baseline", + "category": "runbook", + "title": "Private Production Corpus Runbook", + "text": "Private production fixtures use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` with `cargo make baseline-production-private` and stay out of git." + }, + { + "evidence_id": "decision-qdrant-derived", + "category": "decision", + "title": "Qdrant Derived Index Decision", + "text": "Decision: Qdrant remains a rebuildable derived index. Postgres stores source-of-truth vectors, notes, chunks, and audit rows." + }, + { + "evidence_id": "blocker-stale-qwen-key", + "category": "blocker", + "title": "Stale Provider Key Blocker", + "text": "Stale blocker: missing Qwen key applied only to provider stress runs. The synthetic production corpus uses local deterministic embeddings." + }, + { + "evidence_id": "recovery-xy640-ledger", + "category": "recovery_note", + "title": "XY-640 Ledger Replay Recovery", + "text": "Recovery note: XY-640 ledger replay resumes from checkpoint `ledger-replay-42` and verifies the retained lane with `cargo make test`." + }, + { + "evidence_id": "decision-xy818-supersedes", + "category": "decision", + "title": "Superseded Command Decision", + "text": "Update case: old command `cargo make lint` was superseded by `cargo make lint-fix` for Decodex ELF lanes." 
+ } + ], + "queries": [ + { + "query_id": "q-resume-lane", + "task": "resume_lane", + "query": "How do I resume XY-812 and what command is next?", + "expected_evidence_ids": ["issue-xy812-resume"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["XY-812", "cargo make trace-gate"] + }, + { + "query_id": "q-recover-exact-command", + "task": "recover_exact_command", + "query": "Recover the exact repair gate command for XY-791 strict config.", + "expected_evidence_ids": ["worktree-xy791-repair"], + "allowed_alternate_evidence_ids": ["runbook-live-baseline"], + "expected_terms": ["XY-791", "cargo make fmt && cargo make lint-fix && cargo make checks"] + }, + { + "query_id": "q-explain-stale-blocker", + "task": "explain_stale_blocker", + "query": "Why is the missing Qwen key blocker stale for the synthetic production corpus?", + "expected_evidence_ids": ["blocker-stale-qwen-key"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["missing Qwen key", "local deterministic embeddings"] + }, + { + "query_id": "q-find-prior-decision", + "task": "find_prior_decision", + "query": "What prior decision explains why Qdrant can be rebuilt?", + "expected_evidence_ids": ["decision-qdrant-derived"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["Qdrant", "rebuildable derived index"] + }, + { + "query_id": "q-compare-project-status", + "task": "compare_project_status", + "query": "Compare PR #110 and XY-640 status.", + "expected_evidence_ids": ["pr-110-review"], + "allowed_alternate_evidence_ids": ["recovery-xy640-ledger"], + "expected_terms": ["PR #110", "review-ready"] + }, + { + "query_id": "q-detect-contradiction-update", + "task": "detect_contradiction_update", + "query": "Which command superseded cargo make lint for Decodex ELF lanes?", + "expected_evidence_ids": ["decision-xy818-supersedes"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["cargo make lint-fix", "superseded"] + } + ] +} diff --git 
a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs index 75c9b83e..b0857036 100644 --- a/apps/elf-eval/src/bin/live_baseline_elf.rs +++ b/apps/elf-eval/src/bin/live_baseline_elf.rs @@ -61,9 +61,35 @@ struct QueryManifest { #[derive(Clone, Debug, Deserialize, Serialize)] struct QueryCase { id: String, + task: Option, query: String, expected_doc: String, expected_terms: Vec, + #[serde(default)] + allowed_alternate_docs: Vec, + #[serde(default)] + expected_evidence_ids: Vec, + #[serde(default)] + allowed_alternate_evidence_ids: Vec, +} +impl QueryCase { + fn generated( + id: String, + query: String, + expected_doc: String, + expected_terms: Vec, + ) -> Self { + Self { + id, + task: None, + query, + expected_evidence_ids: vec![evidence_id_for_doc(&expected_doc)], + allowed_alternate_docs: Vec::new(), + allowed_alternate_evidence_ids: Vec::new(), + expected_doc, + expected_terms, + } + } } #[derive(Debug)] @@ -158,6 +184,9 @@ struct QuerySummary { total: usize, pass: usize, fail: usize, + wrong_result_count: usize, + latency_ms_total: f64, + latency_ms_mean: f64, } #[derive(Debug, Serialize)] @@ -179,13 +208,20 @@ struct CheckResult { #[derive(Debug, Serialize)] struct QueryResult { id: String, + task: Option, query: String, expected_doc: String, + allowed_alternate_docs: Vec, expected_terms: Vec, + expected_evidence_ids: Vec, + allowed_alternate_evidence_ids: Vec, matched: bool, matched_terms: Vec, + top_evidence_id: Option, + matched_evidence_id: Option, top_note_key: Option, top_snippet: Option, + latency_ms: f64, returned_count: usize, } @@ -499,6 +535,16 @@ fn outbox_done(counts: &BTreeMap, expected_note_count: usize) -> bo fn retrieval_check(query_results: &[QueryResult]) -> CheckResult { let pass_count = query_results.iter().filter(|result| result.matched).count(); let fail_count = query_results.len().saturating_sub(pass_count); + let expected_evidence_ids = query_results + .iter() + .map(|result| { + serde_json::json!({ + 
"query_id": result.id, + "expected": result.expected_evidence_ids, + "allowed_alternates": result.allowed_alternate_evidence_ids, + }) + }) + .collect::>(); CheckResult { name: "same_corpus_retrieval", @@ -512,6 +558,8 @@ fn retrieval_check(query_results: &[QueryResult]) -> CheckResult { "total": query_results.len(), "pass": pass_count, "fail": fail_count, + "wrong_result_count": fail_count, + "expected_evidence_ids": expected_evidence_ids, }), } } @@ -579,12 +627,12 @@ fn concurrent_add_request(index: usize) -> AddNoteRequest { fn concurrent_query_case(index: usize) -> QueryCase { let marker = concurrent_marker(index); - QueryCase { - id: format!("concurrent-{index:03}"), - query: format!("Find the concurrent benchmark note containing marker {marker}."), - expected_doc: format!("concurrent-{index:03}.md"), - expected_terms: vec![marker], - } + QueryCase::generated( + format!("concurrent-{index:03}"), + format!("Find the concurrent benchmark note containing marker {marker}."), + format!("concurrent-{index:03}.md"), + vec![marker], + ) } fn concurrent_marker(index: usize) -> String { @@ -648,12 +696,12 @@ fn soak_query_case(index: usize) -> QueryCase { let marker = soak_marker(index); let (topic, _) = soak_topic(index); - QueryCase { - id: format!("soak-{index:03}"), - query: format!("Find the soak benchmark note about {topic} containing marker {marker}."), - expected_doc: format!("soak-{index:03}.md"), - expected_terms: vec![marker], - } + QueryCase::generated( + format!("soak-{index:03}"), + format!("Find the soak benchmark note about {topic} containing marker {marker}."), + format!("soak-{index:03}.md"), + vec![marker], + ) } fn soak_marker(index: usize) -> String { @@ -808,6 +856,19 @@ fn key_for_doc(doc: &str) -> String { if key.is_empty() { "doc".to_string() } else { key } } +fn evidence_id_for_doc(doc: &str) -> String { + Path::new(doc).file_stem().and_then(|stem| stem.to_str()).unwrap_or(doc).to_string() +} + +fn expected_docs_for_case(case: &QueryCase) -> 
Vec { + let mut docs = Vec::with_capacity(case.allowed_alternate_docs.len().saturating_add(1)); + + docs.push(case.expected_doc.clone()); + docs.extend(case.allowed_alternate_docs.iter().cloned()); + + docs +} + fn embed_text(text: &str, vector_dim: u32) -> Vec { let dim = vector_dim as usize; let mut vector = vec![0.0_f32; dim]; @@ -966,6 +1027,8 @@ async fn run(args: Args) -> color_eyre::Result { let query_results = run_queries(&service, query_manifest.queries).await?; let pass_count = query_results.iter().filter(|result| result.matched).count(); let fail_count = query_results.len().saturating_sub(pass_count); + let latency_ms_total = query_results.iter().map(|result| result.latency_ms).sum::(); + let latency_ms_mean = latency_ms_total / query_results.len().max(1) as f64; let retrieval_status = if fail_count == 0 { "retrieval_pass" } else { "retrieval_wrong_result" }; let mut checks = vec![retrieval_check(&query_results), worker_indexing_check(initial_worker)]; @@ -1004,7 +1067,14 @@ async fn run(args: Args) -> color_eyre::Result { rebuild_missing_vector_count: rebuild.missing_vector_count, rebuild_error_count: rebuild.error_count, }, - summary: QuerySummary { total: query_results.len(), pass: pass_count, fail: fail_count }, + summary: QuerySummary { + total: query_results.len(), + pass: pass_count, + fail: fail_count, + wrong_result_count: fail_count, + latency_ms_total, + latency_ms_mean, + }, check_summary, checks, queries: query_results, @@ -1262,13 +1332,14 @@ async fn run_single_query( .ok() .and_then(|value| value.parse::().ok()) .unwrap_or(10); + let started_at = Instant::now(); let response = service .search_raw(SearchRequest { tenant_id: TENANT_ID.to_string(), project_id: PROJECT_ID.to_string(), agent_id: AGENT_ID.to_string(), token_id: None, - payload_level: PayloadLevel::default(), + payload_level: PayloadLevel::L2, read_profile: "private_only".to_string(), query: case.query.clone(), top_k: Some(top_k), @@ -1278,6 +1349,7 @@ async fn run_single_query( 
ranking: None, }) .await?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; let top = response.items.first(); let top_text = top.map(|item| item.snippet.clone()).unwrap_or_default(); let matched_terms = case @@ -1287,19 +1359,41 @@ async fn run_single_query( .cloned() .collect::>(); let top_key = top.and_then(|item| item.key.clone()); - let expected_key = key_for_doc(&case.expected_doc); - let matched = matched_terms.len() == case.expected_terms.len() - || top_key.as_deref().is_some_and(|key| key == expected_key); + let expected_docs = expected_docs_for_case(&case); + let matched_doc = + top_key.as_deref().and_then(|key| expected_docs.iter().find(|doc| key_for_doc(doc) == key)); + let top_evidence_id = top.and_then(|item| { + item.source_ref.get("document").and_then(Value::as_str).map(evidence_id_for_doc) + }); + let matched_evidence_id = matched_doc.map(|doc| evidence_id_for_doc(doc)); + let matched = matched_terms.len() == case.expected_terms.len() || matched_doc.is_some(); + let expected_evidence_ids = if case.expected_evidence_ids.is_empty() { + vec![evidence_id_for_doc(&case.expected_doc)] + } else { + case.expected_evidence_ids.clone() + }; + let allowed_alternate_evidence_ids = if case.allowed_alternate_evidence_ids.is_empty() { + case.allowed_alternate_docs.iter().map(|doc| evidence_id_for_doc(doc)).collect() + } else { + case.allowed_alternate_evidence_ids.clone() + }; Ok(QueryResult { id: case.id, + task: case.task, query: case.query, expected_doc: case.expected_doc, + allowed_alternate_docs: case.allowed_alternate_docs, expected_terms: case.expected_terms, + expected_evidence_ids, + allowed_alternate_evidence_ids, matched, matched_terms, + top_evidence_id, + matched_evidence_id, top_note_key: top_key, top_snippet: top.map(|item| item.snippet.clone()), + latency_ms, returned_count: response.items.len(), }) } @@ -1375,12 +1469,12 @@ async fn run_update_replacement_check( run_worker_until_indexed(runtime, service, &[update_note_id], 
"lifecycle_update").await?; let update_query = run_single_query( service, - QueryCase { - id: "lifecycle-update-new-marker".to_string(), - query: "Which rotated JWT key id does the auth middleware require?".to_string(), - expected_doc: update_note.source_doc.clone(), - expected_terms: vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()], - }, + QueryCase::generated( + "lifecycle-update-new-marker".to_string(), + "Which rotated JWT key id does the auth middleware require?".to_string(), + update_note.source_doc.clone(), + vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()], + ), ) .await?; let old_marker_absent = update_query @@ -1427,12 +1521,12 @@ async fn run_delete_suppression_check( run_worker_until_indexed(runtime, service, &[delete_note_id], "lifecycle_delete").await?; let delete_query = run_single_query( service, - QueryCase { - id: "lifecycle-delete-suppresses-note".to_string(), - query: delete_note.text.clone(), - expected_doc: delete_note.source_doc.clone(), - expected_terms: distinctive_terms(&delete_note.text, 2), - }, + QueryCase::generated( + "lifecycle-delete-suppresses-note".to_string(), + delete_note.text.clone(), + delete_note.source_doc.clone(), + distinctive_terms(&delete_note.text, 2), + ), ) .await?; let delete_pass = !delete_query.matched @@ -1464,12 +1558,12 @@ async fn run_cold_start_recovery_check( let recovery_service = build_service(runtime).await?; let recovery_query = run_single_query( &recovery_service, - QueryCase { - id: "lifecycle-cold-start-recovery".to_string(), - query: recovery_note.text.clone(), - expected_doc: recovery_note.source_doc.clone(), - expected_terms: distinctive_terms(&recovery_note.text, 2), - }, + QueryCase::generated( + "lifecycle-cold-start-recovery".to_string(), + recovery_note.text.clone(), + recovery_note.source_doc.clone(), + distinctive_terms(&recovery_note.text, 2), + ), ) .await?; let outbox_counts = pending_outbox_counts(service).await?; diff --git 
a/docs/guide/benchmarking/2026-06-09-production-corpus-report.md b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md new file mode 100644 index 00000000..8d1505c8 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md @@ -0,0 +1,55 @@ +# Live Baseline Benchmark Report + +Goal: Publish a Markdown summary for one generated live baseline aggregate report. +Read this when: You need a durable, reviewable summary of a live baseline JSON report. +Inputs: `tmp/live-baseline/live-baseline-report.json`. +Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`. +Verification: Compare this Markdown summary with the source JSON before committing. + +## Summary + +- Run ID: `live-baseline-20260609045306` +- Generated at: `2026-06-09T04:53:18Z` +- Verdict: `pass` +- Project filter: `ELF` +- Corpus profile: `production-synthetic` +- Corpus track: `synthetic_production` +- Corpus manifest: `synthetic-coding-agent-prod-corpus-2026-06-09` +- Documents: `8` +- Queries: `6` +- Wrong-result count: `0` +- Query latency mean: `7.137632833333334 ms` +- Project summary: `1 pass`, `0 fail`, `0 incomplete` +- Same-corpus summary: `1 pass`, `0 fail`, `0 incomplete` +- Full check summary: `7/7 pass` + +## Projects + +| Project | Status | Retrieval | Checks | Elapsed | Reason | +| --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `7/7` | `12s` | ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query | + +## Embedding + +| Project | Mode | Provider | Model | Dimensions | Timeout | API Base | Path | +| --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `local` | `local` | `local-hash` | `256` | `1000ms` | `http://127.0.0.1` | `/embeddings` | + +## Query Evidence + +| Project | Query | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency | +| --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `q-resume-lane` | 
`resume_lane` | `issue-xy812-resume` | `` | `issue-xy812-resume` | `true` | `9.213627 ms` | +| ELF | `q-recover-exact-command` | `recover_exact_command` | `worktree-xy791-repair` | `runbook-live-baseline` | `worktree-xy791-repair` | `true` | `6.424872 ms` | +| ELF | `q-explain-stale-blocker` | `explain_stale_blocker` | `blocker-stale-qwen-key` | `` | `blocker-stale-qwen-key` | `true` | `7.749393 ms` | +| ELF | `q-find-prior-decision` | `find_prior_decision` | `decision-qdrant-derived` | `` | `decision-qdrant-derived` | `true` | `6.66385 ms` | +| ELF | `q-compare-project-status` | `compare_project_status` | `pr-110-review` | `recovery-xy640-ledger` | `recovery-xy640-ledger` | `true` | `6.344976 ms` | +| ELF | `q-detect-contradiction-update` | `detect_contradiction_update` | `decision-xy818-supersedes` | `` | `decision-xy818-supersedes` | `true` | `6.429079 ms` | + +## Result Semantics + +- `pass`: every encoded check for the selected project and profile passed. +- `fail`: clone, install, import, build, retrieval, lifecycle, recovery, concurrency, soak, resource-envelope, or another declared check failed. +- `incomplete`: the encoded check could not complete without extra provider keys, host integration, native dependency support, durable runtime wiring, or more adapter work. + +`incomplete` is not a pass; treat it as benchmark wiring debt. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 4493e306..3fcd0143 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -20,9 +20,12 @@ Outputs: The smallest benchmarking guide or report needed to continue. ## Guides And Reports - `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live - Docker-only benchmark matrix. + Docker-only benchmark matrix, including generated public and production-corpus + profiles. 
- `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9, 2026 ELF production-provider stress run and all-project smoke comparison. +- `2026-06-09-production-corpus-report.md`: checked-in synthetic production-corpus + ELF adoption benchmark report with task queries and evidence IDs. ## Update Rules diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index b61b1e2b..c229eff6 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -3,7 +3,9 @@ Goal: Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF. Read this when: You need evidence about which external projects actually run against a shared benchmark corpus. Preconditions: Docker and Docker Compose are available on the host. -Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and `docs/spec/system_competitive_parity_gate_v1.md`. +Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, +`docs/spec/system_competitive_parity_gate_v1.md`, and +`docs/spec/production_corpus_manifest_v1.md`. Verification: `cargo make baseline-live-docker` writes `tmp/live-baseline/live-baseline-report.json`; `cargo make baseline-live-report` can render that JSON into a checked-in Markdown report. ## Scope @@ -40,9 +42,20 @@ Corpus profiles: that make the check closer to a production retrieval benchmark. - `stress`: 480 documents by default, 16 query cases, and alternate phrasings for every needle query. +- `production-synthetic`: checked-in synthetic coding-agent production corpus with + issues, PRs, worktrees, runbooks, decisions, blockers, recovery notes, and + task-oriented queries. Fixture: + `apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json`. 
+- `production-private`: local private/sanitized production corpus manifest supplied by + `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST`. Use `ELF_BASELINE_SCALE_DOCS` and `ELF_BASELINE_STRESS_DOCS` to raise or lower the generated corpus sizes. +Use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` to supply a local manifest that follows +`docs/spec/production_corpus_manifest_v1.md`. The private profile fails closed when the +manifest path is absent, the file is missing, a referenced `local_path` is missing, or a +query references an unknown evidence ID. It does not fall back to the checked-in +synthetic fixture. Use `ELF_BASELINE_CONCURRENT_NOTES`, `ELF_BASELINE_MAX_ELF_SECONDS`, and `ELF_BASELINE_MAX_ELF_RSS_KB` to tune ELF's concurrent-write and resource-envelope checks. @@ -138,6 +151,23 @@ ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker ELF_BASELINE_PROJECTS=ELF,memsearch cargo make baseline-live-docker ``` +To run the checked-in synthetic production-style corpus through ELF: + +```sh +cargo make baseline-production-synthetic +``` + +To run a private local production corpus without committing private content: + +```sh +ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json \ +cargo make baseline-production-private +``` + +The private manifest can contain sanitized inline `text` fields or `local_path` fields +that point to local sanitized text/Markdown files. Keep private manifests and local +evidence under `tmp/` or outside the repository. `tmp/` is ignored by git. + The only host artifact is: ```text @@ -146,12 +176,21 @@ tmp/live-baseline/ That directory contains the aggregate report, per-project logs, and the shared query fixture used by the run. The aggregate report records `corpus.profile`, -`corpus.document_count`, and `corpus.query_count` so smoke, scale, and stress runs are -not confused. Each project record includes `elapsed_seconds` for rough local runtime -comparison. 
ELF project records also include an `embedding` summary so deterministic -local and production-provider runs are not confused. Each project record also includes -`checks` and `check_summary`; the aggregate `full_check_summary` is the -adoption-relevant multi-check count. +`corpus.track`, `corpus.manifest_id`, `corpus.document_count`, and +`corpus.query_count` so generated public corpus results are not confused with +synthetic or private production-corpus results. Each project record includes +`elapsed_seconds` for rough local runtime comparison. ELF project records also include +an `embedding` summary so deterministic local and production-provider runs are not +confused. ELF query records include task, expected evidence IDs, allowed alternate +evidence IDs, top evidence ID, wrong-result count, and per-query latency. Each project +record also includes `checks` and `check_summary`; the aggregate `full_check_summary` +is the adoption-relevant multi-check count. + +Production-ready claims must cite a concrete report path. A claim based only on +generated public `smoke`, `scale`, or `stress` profiles is not enough for personal +production adoption. Cite a `production-synthetic` report for fixture coverage, and +cite a `production-private` report when making a private-corpus production-readiness +claim. ## Publish A Markdown Report diff --git a/docs/spec/index.md b/docs/spec/index.md index e7c8f30c..7cec41ce 100644 --- a/docs/spec/index.md +++ b/docs/spec/index.md @@ -37,6 +37,8 @@ Question this index answers: "what must remain true?" proposal contract over immutable source evidence. - `system_competitive_parity_gate_v1.md`: Docker-only adoption gate that decides whether ELF meets or exceeds selected external memory-system baselines. +- `production_corpus_manifest_v1.md`: Sanitized/private coding-agent production + corpus manifest schema for adoption benchmark runs. 
## Spec document contract diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md new file mode 100644 index 00000000..4d582958 --- /dev/null +++ b/docs/spec/production_corpus_manifest_v1.md @@ -0,0 +1,102 @@ +# Production Corpus Manifest v1 + +Purpose: Define the sanitized/private coding-agent production corpus manifest used by +ELF adoption benchmarks. +Status: normative +Read this when: You are creating, validating, or running a production-style personal +agent memory benchmark corpus. +Not this document: Docker benchmark run commands, report publication steps, or private +fixture storage procedures. +Defines: `elf.production_corpus_manifest/v1` fields, required evidence categories, +query tasks, evidence expectations, and private-content safety rules. + +## Contract + +A production corpus manifest is a JSON object with: + +- `schema`: exactly `elf.production_corpus_manifest/v1`. +- `manifest_id`: stable lower-risk identifier for the corpus snapshot. +- `description`: optional English summary. +- `evidence`: non-empty array of production-style memory evidence items. +- `queries`: non-empty array of task-oriented retrieval checks. + +The checked-in benchmark fixture must be synthetic and sanitized. Real private +production content must not be committed. + +## Evidence Items + +Each `evidence[]` item must include: + +- `evidence_id`: lower-case ASCII identifier safe for filenames. Allowed shape: + `[a-z0-9][a-z0-9_.-]{1,80}`. +- `category`: one of `issue`, `pr`, `worktree`, `runbook`, `decision`, `blocker`, + or `recovery_note`. +- `title`: short English title. +- Exactly one of: + - `text`: sanitized inline English evidence text. + - `local_path`: path to a local sanitized text/Markdown file, resolved relative to + the manifest when not absolute. + +Evidence text must not contain secrets, tokens, private keys, personal credentials, or +unsanitized private conversation content. 
+ +## Query Cases + +Each `queries[]` item must include: + +- `query_id`: stable query identifier. +- `task`: one of `resume_lane`, `recover_exact_command`, `explain_stale_blocker`, + `find_prior_decision`, `compare_project_status`, or + `detect_contradiction_update`. +- `query`: English task-oriented search query. +- `expected_evidence_ids`: non-empty array of evidence IDs that satisfy the query. +- `allowed_alternate_evidence_ids`: array of acceptable alternate evidence IDs. Use + an empty array when no alternate is allowed. +- `expected_terms`: non-empty array of terms that should appear in the matched + evidence snippet when the expected note key is not the top result. + +Every query must record both expected evidence IDs and allowed alternates, even when +the allowed alternate list is empty. + +## Benchmark Mapping + +The Docker benchmark materializes each evidence item as a temporary Markdown document +inside the benchmark work directory. The source document filename is +`<evidence_id>.md`. Reports must expose evidence IDs and allowed alternates, not local +private file paths. + +For `production-private` runs, the runner must fail closed when the manifest is absent, +the manifest references a missing `local_path`, or any query references an unknown +evidence ID. It must not silently fall back to the checked-in synthetic corpus. + +## Minimal Example + +```json +{ + "schema": "elf.production_corpus_manifest/v1", + "manifest_id": "local-private-prod-corpus-2026-06-09", + "evidence": [ + { + "evidence_id": "issue-xy123-resume", + "category": "issue", + "title": "XY-123 Resume State", + "text": "XY-123 resumes on branch y/example with command `cargo make checks`."
+ } + ], + "queries": [ + { + "query_id": "q-resume-xy123", + "task": "resume_lane", + "query": "How do I resume XY-123?", + "expected_evidence_ids": ["issue-xy123-resume"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["XY-123", "cargo make checks"] + } + ] +} +``` + +## Related Guides + +- `docs/guide/benchmarking/live_baseline_benchmark.md`: run commands, private fixture + placement, and report publication. diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index fbb56b05..1b5a6e0a 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -16,6 +16,10 @@ SCALE_DOC_COUNT="${ELF_BASELINE_SCALE_DOCS:-120}" STRESS_DOC_COUNT="${ELF_BASELINE_STRESS_DOCS:-480}" QUERY_TOP_K="${ELF_BASELINE_TOP_K:-10}" CURRENT_PROJECT_STARTED_AT="" +PRODUCTION_SYNTHETIC_MANIFEST="${ROOT_DIR}/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json" +CORPUS_TRACK="generated_public" +CORPUS_PATH_DESCRIPTION="generated in Docker under /bench/corpus" +CORPUS_MANIFEST_ID="" if [[ ! -f "/.dockerenv" && "${ELF_BASELINE_ALLOW_HOST:-0}" != "1" ]]; then echo "Refusing to run live baseline benchmark outside Docker. Use cargo make baseline-live-docker." 
>&2 @@ -157,21 +161,28 @@ query_docs = anchors[: (3 if profile == "smoke" else len(anchors))] queries = [] for doc in query_docs: base_id = doc["name"].replace("-memory.md", "").replace(".md", "") + evidence_id = doc["name"].replace(".md", "") queries.append( { "id": f"q-{base_id}", + "task": "same_corpus_retrieval", "query": doc["query"], "expected_doc": doc["name"], "expected_terms": doc["terms"], + "expected_evidence_ids": [evidence_id], + "allowed_alternate_evidence_ids": [], } ) if profile == "stress": queries.append( { "id": f"q-{base_id}-alt", + "task": "same_corpus_retrieval", "query": doc["alternate_query"], "expected_doc": doc["name"], "expected_terms": doc["terms"], + "expected_evidence_ids": [evidence_id], + "allowed_alternate_evidence_ids": [], } ) @@ -191,13 +202,264 @@ queries_path.write_text( PY } +prepare_production_corpus() { + local manifest_path="${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-}" + local corpus_summary="${REPORT_DIR}/production-corpus-summary.json" + + case "${CORPUS_PROFILE}" in + production-synthetic) + manifest_path="${manifest_path:-${PRODUCTION_SYNTHETIC_MANIFEST}}" + ;; + production-private) + if [[ -z "${manifest_path}" ]]; then + echo "ELF_BASELINE_PROFILE=production-private requires ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST." >&2 + exit 1 + fi + ;; + *) + echo "Unsupported production corpus profile: ${CORPUS_PROFILE}" >&2 + exit 1 + ;; + esac + + if [[ ! 
-f "${manifest_path}" ]]; then + echo "Missing production corpus manifest: ${manifest_path}" >&2 + exit 1 + fi + + python3 - "${CORPUS_PROFILE}" "${manifest_path}" "${CORPUS_DIR}" "${REPORT_DIR}/queries.json" "${corpus_summary}" <<'PY' +import json +import re +import sys +from collections import Counter +from pathlib import Path + +profile, manifest_path_raw, corpus_dir_raw, queries_path_raw, summary_path_raw = sys.argv[1:] +manifest_path = Path(manifest_path_raw) +corpus_dir = Path(corpus_dir_raw) +queries_path = Path(queries_path_raw) +summary_path = Path(summary_path_raw) +corpus_track = "synthetic_production" if profile == "production-synthetic" else "private_production" +allowed_categories = { + "issue", + "pr", + "worktree", + "runbook", + "decision", + "blocker", + "recovery_note", +} +allowed_tasks = { + "resume_lane", + "recover_exact_command", + "explain_stale_blocker", + "find_prior_decision", + "compare_project_status", + "detect_contradiction_update", +} +id_re = re.compile(r"[a-z0-9][a-z0-9_.-]{1,80}") + + +def fail(message): + raise SystemExit(f"Invalid production corpus manifest: {message}") + + +def require_string(obj, field, context): + value = obj.get(field) + if not isinstance(value, str) or not value.strip(): + fail(f"{context}.{field} must be a non-empty string") + return value.strip() + + +def require_string_list(obj, field, context): + value = obj.get(field) + if not isinstance(value, list) or not value: + fail(f"{context}.{field} must be a non-empty string array") + out = [] + for index, item in enumerate(value): + if not isinstance(item, str) or not item.strip(): + fail(f"{context}.{field}[{index}] must be a non-empty string") + out.append(item.strip()) + return out + + +def load_text(item, context): + has_text = isinstance(item.get("text"), str) + has_path = isinstance(item.get("local_path"), str) + if has_text == has_path: + fail(f"{context} must set exactly one of text or local_path") + if has_text: + text = item["text"].strip() + else: 
+ local_path = Path(item["local_path"]) + if not local_path.is_absolute(): + local_path = manifest_path.parent / local_path + if not local_path.is_file(): + fail(f"{context}.local_path does not point to a readable file") + text = local_path.read_text(encoding="utf-8").strip() + if not text: + fail(f"{context} text must not be empty") + if "\x00" in text: + fail(f"{context} text contains a NUL byte") + return text + + +manifest = json.loads(manifest_path.read_text(encoding="utf-8")) +if manifest.get("schema") != "elf.production_corpus_manifest/v1": + fail("schema must be elf.production_corpus_manifest/v1") + +manifest_id = require_string(manifest, "manifest_id", "$") +evidence_items = manifest.get("evidence") +if not isinstance(evidence_items, list) or not evidence_items: + fail("$.evidence must be a non-empty array") +query_items = manifest.get("queries") +if not isinstance(query_items, list) or not query_items: + fail("$.queries must be a non-empty array") + +for existing in corpus_dir.glob("*.md"): + existing.unlink() + +evidence_by_id = {} +category_counts = Counter() +for index, item in enumerate(evidence_items): + context = f"$.evidence[{index}]" + if not isinstance(item, dict): + fail(f"{context} must be an object") + evidence_id = require_string(item, "evidence_id", context) + if not id_re.fullmatch(evidence_id): + fail(f"{context}.evidence_id must be lower-case ASCII and safe for filenames") + if evidence_id in evidence_by_id: + fail(f"{context}.evidence_id duplicates an earlier item") + category = require_string(item, "category", context) + if category not in allowed_categories: + fail(f"{context}.category must be one of {sorted(allowed_categories)}") + title = require_string(item, "title", context) + text = load_text(item, context) + evidence_by_id[evidence_id] = { + "category": category, + "title": title, + "text": text, + } + category_counts[category] += 1 + (corpus_dir / f"{evidence_id}.md").write_text( + "\n".join( + [ + f"# {title}", + "", + text, + 
"", + ] + ), + encoding="utf-8", + ) + +queries = [] +task_counts = Counter() +for index, item in enumerate(query_items): + context = f"$.queries[{index}]" + if not isinstance(item, dict): + fail(f"{context} must be an object") + query_id = require_string(item, "query_id", context) + task = require_string(item, "task", context) + if task not in allowed_tasks: + fail(f"{context}.task must be one of {sorted(allowed_tasks)}") + query = require_string(item, "query", context) + expected_ids = require_string_list(item, "expected_evidence_ids", context) + allowed_alternate_ids = item.get("allowed_alternate_evidence_ids", []) + if allowed_alternate_ids is None: + allowed_alternate_ids = [] + if not isinstance(allowed_alternate_ids, list): + fail(f"{context}.allowed_alternate_evidence_ids must be an array") + allowed_alternate_ids = [ + evidence_id.strip() + for evidence_id in allowed_alternate_ids + if isinstance(evidence_id, str) and evidence_id.strip() + ] + expected_terms = require_string_list(item, "expected_terms", context) + for evidence_id in [*expected_ids, *allowed_alternate_ids]: + if evidence_id not in evidence_by_id: + fail(f"{context} references unknown evidence_id {evidence_id!r}") + queries.append( + { + "id": query_id, + "task": task, + "query": query, + "expected_doc": f"{expected_ids[0]}.md", + "allowed_alternate_docs": [ + f"{evidence_id}.md" for evidence_id in [*expected_ids[1:], *allowed_alternate_ids] + ], + "expected_terms": expected_terms, + "expected_evidence_ids": expected_ids, + "allowed_alternate_evidence_ids": allowed_alternate_ids, + } + ) + task_counts[task] += 1 + +queries_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.queries/v1", + "profile": profile, + "corpus_track": corpus_track, + "manifest_schema": manifest["schema"], + "manifest_id": manifest_id, + "document_count": len(evidence_by_id), + "queries": queries, + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) + +summary_path.write_text( + json.dumps( + { + 
"schema": "elf.production_corpus_summary/v1", + "corpus_track": corpus_track, + "manifest_schema": manifest["schema"], + "manifest_id": manifest_id, + "document_count": len(evidence_by_id), + "query_count": len(queries), + "category_counts": dict(sorted(category_counts.items())), + "task_counts": dict(sorted(task_counts.items())), + "evidence_ids": sorted(evidence_by_id), + "query_evidence": [ + { + "query_id": query["id"], + "task": query["task"], + "expected_evidence_ids": query["expected_evidence_ids"], + "allowed_alternate_evidence_ids": query["allowed_alternate_evidence_ids"], + } + for query in queries + ], + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) +PY + + CORPUS_TRACK="$(jq -r '.corpus_track' "${corpus_summary}")" + CORPUS_MANIFEST_ID="$(jq -r '.manifest_id' "${corpus_summary}")" + CORPUS_PATH_DESCRIPTION="production corpus materialized in Docker under /bench/corpus" +} + rm -rf "${WORK_DIR}" mkdir -p "${REPORT_DIR}" find "${REPORT_DIR}" -maxdepth 1 -type f -delete mkdir -p "${REPOS_DIR}" "${CORPUS_DIR}" "${HOME_DIR}" : >"${RECORDS}" -generate_corpus +case "${CORPUS_PROFILE}" in + production-synthetic | production-private) + prepare_production_corpus + ;; + *) + generate_corpus + ;; +esac DOCUMENT_COUNT="$(find "${CORPUS_DIR}" -maxdepth 1 -type f -name '*.md' | wc -l | tr -d ' ')" QUERY_COUNT="$(jq '.queries | length' "${REPORT_DIR}/queries.json")" @@ -243,6 +505,8 @@ json_record() { command_summary: $command_summary, elapsed_seconds: $elapsed_seconds, embedding: ($checks[0].embedding // null), + query_summary: ($checks[0].query_summary // null), + queries: ($checks[0].queries // null), check_summary: $checks[0].check_summary, checks: $checks[0].checks }' >>"${RECORDS}" @@ -267,6 +531,8 @@ json_record() { log_path: $log_path, command_summary: $command_summary, elapsed_seconds: $elapsed_seconds, + query_summary: null, + queries: null, check_summary: { total: 1, pass: (if $retrieval_status == "retrieval_pass" then 1 else 0 end), @@ -333,6 +599,9 @@ 
finish_report() { --arg run_id "${RUN_ID}" \ --arg project_filter "${PROJECT_FILTER}" \ --arg corpus_profile "${CORPUS_PROFILE}" \ + --arg corpus_track "${CORPUS_TRACK}" \ + --arg corpus_path "${CORPUS_PATH_DESCRIPTION}" \ + --arg corpus_manifest_id "${CORPUS_MANIFEST_ID}" \ --argjson document_count "${DOCUMENT_COUNT}" \ --argjson query_count "${QUERY_COUNT}" \ --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ @@ -344,9 +613,11 @@ finish_report() { project_filter: $project_filter, corpus: { profile: $corpus_profile, + track: $corpus_track, + manifest_id: (if $corpus_manifest_id == "" then null else $corpus_manifest_id end), document_count: $document_count, query_count: $query_count, - path: "generated in Docker under /bench/corpus", + path: $corpus_path, query_file: "tmp/live-baseline/queries.json" }, verdict: ( @@ -374,6 +645,14 @@ finish_report() { fail: ([.[] | .check_summary.fail // 0] | add // 0), incomplete: ([.[] | .check_summary.incomplete // 0] | add // 0) }, + wrong_result_count: ([.[] | .query_summary.wrong_result_count // .query_summary.fail // 0] | add // 0), + latency_ms: { + total: ([.[] | .query_summary.latency_ms_total // 0] | add // 0), + mean: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_mean // 0] as $means + | if ($means | length) == 0 then 0 else (($means | add) / ($means | length)) end + ) + }, projects: . 
}' "${RECORDS}" >"${REPORT}" } @@ -419,7 +698,7 @@ project_elf() { if run_cmd "${project}: same-corpus retrieval" 1200 "${log_path}" \ "cd '${ROOT_DIR}' && cargo run -p elf-eval --bin live_baseline_elf -- --config config/local/elf.docker.toml --corpus '${CORPUS_DIR}' --queries '${REPORT_DIR}/queries.json' --out '${result_path}'"; then if [[ -s "${result_path}" ]] && jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then - jq '{embedding, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + jq '{embedding, query_summary: .summary, queries, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi if [[ -s "${result_path}" ]] && jq -e --argjson document_count "${DOCUMENT_COUNT}" --argjson query_count "${QUERY_COUNT}" ' .schema == "elf.live_baseline.elf_result/v1" and diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh index 651f29b4..bdb54ed8 100755 --- a/scripts/live-baseline-report-to-md.sh +++ b/scripts/live-baseline-report-to-md.sh @@ -4,6 +4,10 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" REPORT="${1:-${ELF_BASELINE_REPORT:-${ROOT_DIR}/tmp/live-baseline/live-baseline-report.json}}" OUT="${2:-${ELF_BASELINE_MARKDOWN_REPORT:-}}" +REPORT_DISPLAY="${REPORT}" +if [[ "${REPORT_DISPLAY}" == "${ROOT_DIR}/"* ]]; then + REPORT_DISPLAY="${REPORT_DISPLAY#"${ROOT_DIR}/"}" +fi if ! command -v jq >/dev/null 2>&1; then echo "Missing jq; cannot render live baseline Markdown report." >&2 @@ -16,7 +20,7 @@ if [[ ! -f "${REPORT}" ]]; then fi render_report() { - jq -r --arg report_path "${REPORT}" ' + jq -r --arg report_path "${REPORT_DISPLAY}" ' def dash: if . 
== null then "-" else tostring end; def md: @@ -39,8 +43,16 @@ render_report() { ("- Verdict: `" + (.verdict | md) + "`"), ("- Project filter: `" + (.project_filter | md) + "`"), ("- Corpus profile: `" + (.corpus.profile | md) + "`"), + ("- Corpus track: `" + ((.corpus.track // "generated_public") | md) + "`"), + ( + if (.corpus.manifest_id // null) == null then empty + else "- Corpus manifest: `" + (.corpus.manifest_id | md) + "`" + end + ), ("- Documents: `" + (.corpus.document_count | tostring) + "`"), ("- Queries: `" + (.corpus.query_count | tostring) + "`"), + ("- Wrong-result count: `" + ((.wrong_result_count // 0) | tostring) + "`"), + ("- Query latency mean: `" + ((.latency_ms.mean // 0) | tostring) + " ms`"), ("- Project summary: `" + (.summary.pass | tostring) + " pass`, `" + (.summary.fail | tostring) + " fail`, `" + (.summary.incomplete | tostring) + " incomplete`"), ("- Same-corpus summary: `" + (.same_corpus_summary.pass | tostring) + " pass`, `" + (.same_corpus_summary.fail | tostring) + " fail`, `" + (.same_corpus_summary.incomplete | tostring) + " incomplete`"), ("- Full check summary: `" + (.full_check_summary.pass | tostring) + "/" + (.full_check_summary.total | tostring) + " pass`"), @@ -80,6 +92,29 @@ render_report() { "" else empty end ), + ( + [.projects[] | {project, queries: (.queries // [])} | select((.queries | length) > 0)] as $query_projects + | if ($query_projects | length) > 0 then + "## Query Evidence", + "", + "| Project | Query | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency |", + "| --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $query_projects[] + | .project as $project + | .queries[] + | "| " + ($project | md) + + " | `" + (.id | md) + "`" + + " | `" + ((.task // "-") | md) + "`" + + " | `" + (((.expected_evidence_ids // []) | join(", ")) | md) + "`" + + " | `" + (((.allowed_alternate_evidence_ids // []) | join(", ")) | md) + "`" + + " | `" + ((.top_evidence_id // "-") | md) + "`" + + 
" | `" + (.matched | tostring) + "`" + + " | `" + ((.latency_ms // 0) | tostring) + " ms` |" + ), + "" + else empty end + ), "## Result Semantics", "", "- `pass`: every encoded check for the selected project and profile passed.",