From 3ea74fc434a3365a68ffd5326daded49d72c33e3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 18:57:47 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Publish qmd trace replay diagnostics report","authority":"XY-923"} --- README.md | 2 + .../tests/real_world_job_benchmark.rs | 175 +++++++++++ ...-11-competitor-strength-adoption-report.md | 14 +- ...elf-qmd-trace-replay-diagnostics-report.md | 140 +++++++++ docs/guide/benchmarking/index.md | 4 + ...1-competitor-strength-adoption-report.json | 19 +- ...f-qmd-trace-replay-diagnostics-report.json | 293 ++++++++++++++++++ 7 files changed, 636 insertions(+), 11 deletions(-) create mode 100644 docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md create mode 100644 docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json diff --git a/README.md b/README.md index bdd884b3..51452873 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ Detailed evidence and interpretation: - [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) - [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) - [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) @@ -269,6 +270,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Post-Adapter Production Adoption Refresh - June 10, 
2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) - [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md) - [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index ce163f29..bf0b0bbc 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -107,6 +107,36 @@ fn retrieval_debug_profile_json_path() -> Result { .join("2026-06-11-elf-qmd-retrieval-debug-profile.json")) } +fn trace_replay_diagnostics_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json")) +} + +fn trace_replay_diagnostics_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")) +} + +fn competitor_strength_adoption_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-competitor-strength-adoption-report.md")) +} + +fn competitor_strength_adoption_report_json_path() -> Result { + Ok(workspace_root()? 
+ .join("docs") + .join("research") + .join("2026-06-11-competitor-strength-adoption-report.json")) +} + fn competitor_strength_matrix_path() -> Result { Ok(workspace_root()? .join("docs") @@ -1404,6 +1434,151 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { Ok(()) } +#[test] +fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + trace_replay_diagnostics_report_path()?, + )?)?; + let markdown = fs::read_to_string(trace_replay_diagnostics_markdown_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let adoption_report = fs::read_to_string(competitor_strength_adoption_report_path()?)?; + let adoption_json = serde_json::from_str::(&fs::read_to_string( + competitor_strength_adoption_report_json_path()?, + )?)?; + + assert_trace_replay_diagnostics_json(&report)?; + assert_trace_replay_diagnostics_markdown(&markdown); + + assert!(readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026")); + assert!(benchmarking_index.contains("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")); + assert!(benchmarking_index.contains("qmd top-10/replay artifact")); + assert!(benchmarking_index.contains("ELF trace/admin surfaces")); + assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); + assert!( + adoption_report + .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") + ); + + assert_trace_replay_adoption_json(&adoption_json)?; + + Ok(()) +} + +fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.trace_replay_diagnostics_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923")); + assert_eq!( + string_array_at(report, "/outcome_terms")?, + ["win", "tie", 
"loss", "not_tested", "blocked", "non_goal"].map(str::to_owned) + ); + assert_eq!( + report.pointer("/summary/retrieval_correctness").and_then(Value::as_str), + Some("tie") + ); + assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2)); + assert_eq!( + report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64), + Some(4) + ); + assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); + + let scenarios = array_at(report, "/scenario_outcomes")?; + let retrieval = find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?; + let top10 = find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?; + let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?; + let trace_surface = + find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?; + let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?; + let dense_sparse = + find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?; + let fusion = find_by_field(scenarios, "/scenario_id", "fusion_attribution")?; + let rerank = find_by_field(scenarios, "/scenario_id", "rerank_attribution")?; + let candidate_drop = find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?; + let selected = + find_by_field(scenarios, "/scenario_id", "selected_but_not_narrated_wrong_results")?; + let tombstone = + find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?; + + assert_eq!(scenarios.len(), 11); + assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie")); + 
assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal")); + assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), Some("non_goal")); + assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert!(array_contains_str(candidate_drop, "/typed_non_pass_states", "retrieved_but_dropped")?); + assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); + assert!(array_contains_str(selected, "/typed_non_pass_states", "selected_but_not_narrated")?); + assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result")); + assert!(array_contains_str( + report, + "/wrong_result_diagnostics/qmd_missing_evidence", + "delete-tombstone" + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "Do not claim qmd beats ELF as a memory system overall." 
+ )?); + + Ok(()) +} + +fn assert_trace_replay_diagnostics_markdown(markdown: &str) { + assert!(markdown.contains("Retrieval correctness is still tied")); + assert!(markdown.contains("| Default top-10 candidate artifact |")); + assert!(markdown.contains("| Replay command locality |")); + assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); + assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |")); + assert!(markdown.contains("`retrieved_but_dropped` | Defined but `not_tested`")); + assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); + assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); + assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); + assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); +} + +fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { + let local_debug = find_by_field( + array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "local_debug_replay_ux", + )?; + + assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert!( + local_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd stronger on immediate top-10")) + ); + assert!(array_contains_str( + local_debug, + "/command_artifacts", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + )?); + assert!(array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win." 
+ )?); + + Ok(()) +} + fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { let projects = array_at(matrix, "/project_matrix")?; let qmd = find_by_field(projects, "/project", "qmd")?; diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index e46ba1f7..1bf607f7 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -35,9 +35,11 @@ The remaining caveats are material: exists. - Credentialed provider production-ops gates are blocked until explicit provider setup exists. -- Several competitor strengths remain `not_tested`: qmd replay/debug UX, - mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, - and graph/RAG navigation. +- Several competitor strengths remain `not_tested`: mem0/OpenMemory history/UI, + OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. + The XY-923 follow-up now scores qmd's immediate top-10/replay artifact ergonomics + as stronger than ELF's default stress report, while expansion, fusion, rerank, and + candidate-drop diagnosis remain untested. ## Evidence Classes @@ -68,6 +70,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. 
| | `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. | +| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. | ## Scenario Matrix @@ -77,7 +80,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 | | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | -| Retrieval quality and local debug UX | `not_tested` | `live_baseline_only`, `research_gate`, `not_encoded` | qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces. 
| XY-923 | +| Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. | XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | @@ -120,6 +123,8 @@ results, or lifecycle failures into one aggregate leaderboard. ## Claims Not Allowed - Do not claim ELF broadly beats qmd. +- Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system + or retrieval-quality win. - Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or graph memory. - Do not claim ELF beats OpenViking on staged context trajectory. @@ -128,4 +133,3 @@ results, or lifecycle failures into one aggregate leaderboard. - Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`, `research_gate`, `blocked`, `wrong_result`, `lifecycle_fail`, `unsupported`, or `not_encoded` states into a generic pass/fail score. 
- diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md new file mode 100644 index 00000000..e3a7a7c7 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md @@ -0,0 +1,140 @@ +# ELF/qmd Trace Replay Diagnostics Report - June 11, 2026 + +Goal: Compare ELF and qmd on trace-level replay and wrong-result diagnostics while +keeping retrieval correctness as a separate guardrail. +Read this when: You need the XY-923 report lane for qmd top-10 replay artifacts, +ELF trace/admin bundle surfaces, and typed wrong-result diagnosis classes. +Inputs: The June 11 ELF/qmd retrieval-debug profile, qmd/OpenViking strength profile, +memory-evolution diagnostic, competitor-strength adoption report, live baseline +runner, ELF trace replay code, and the ELF service trace/admin contract. +Outputs: Scenario-level `win`, `tie`, `loss`, `not_tested`, `blocked`, or +`non_goal` outcomes plus concrete replay commands and artifact paths. + +Machine-readable companion: +`docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json`. + +## Executive Judgment + +Retrieval correctness is still tied: ELF and qmd both pass the encoded live retrieval +suite and both pass the 480-document generated-public stress baseline. + +Trace-level debugging is not tied. In the current checked-in artifacts, qmd is ahead +on immediate local replay ergonomics because the baseline keeps top-10 JSON rows with +files, scores, line numbers, snippets, and distractor visibility, and the replay path +is a short CLI sequence. ELF has a deeper service trace model and admin bundle +surfaces, but the stress report still does not hydrate the equivalent candidate list +by default. + +The resulting narrow position: + +- Retrieval correctness: `tie`. +- Default per-query candidate artifact: ELF `loss` against qmd. +- Replay command locality: ELF `loss` against qmd. 
+- ELF trace/admin replay surface: `tie` as an available but different replay surface, + not a default-artifact win. +- Expansion, dense/sparse contribution, fusion, and candidate-drop diagnostics: + `not_tested` until comparable stage artifacts are emitted. +- Rerank stage scoring: `non_goal` for the current qmd stress path because it uses + `--no-rerank`. +- Wrong-result selected-but-not-narrated diagnosis: `tie` on typed non-pass + classification, not on answer quality. + +This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. + +## Replay Artifact Manifest + +| System | Replay surface | Command | Artifact | +| --- | --- | --- | --- | +| ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| ELF | Admin trace bundle hydration | `curl -fsS 'http://127.0.0.1:51891/v2/admin/traces//bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: ' -H 'X-ELF-Project-Id: ' -H 'X-ELF-Agent-Id: '` | `elf.trace_bundle/v1` response from the admin service | +| ELF | Trace ranking replay | `cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id ` | JSON trace compare output over `search_trace_candidates` | +| qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| qmd | Per-query CLI replay | `npx tsx src/cli/qmd.ts query 'lex: \nvec: ' -c elfbench --json --no-rerank --min-score 0 -n 10` | JSON top-10 rows with `file`, line/snippet/score fields when qmd returns them | +| qmd | Lifecycle replay | `npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && 
npx tsx src/cli/qmd.ts query ... --json --no-rerank` | `tmp/live-baseline/qmd-query.json` checks for update, delete, and cold-start recovery | + +## Scenario Outcomes + +| Scenario | Evidence | Result type | ELF outcome | Diagnostic judgment | +| --- | --- | --- | --- | --- | +| Retrieval correctness guardrail | `live_real_world`, `live_baseline_only` | `pass` | `tie` | Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics. | +| Default top-10 candidate artifact | `live_baseline_only` | `pass` | `loss` | qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report. | +| Replay command locality | `live_baseline_only` | `pass` | `loss` | qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids. | +| Trace/admin replay surface availability | `implementation_reference` | `not_encoded` | `tie` | ELF has admin trace bundles and `elf-eval` trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality. | +| Query expansion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows expansion variants or dynamic expansion decisions for both systems. | +| Dense/sparse channel attribution | `research_gate` | `not_encoded` | `not_tested` | ELF uses dense plus BM25 and qmd uses structured `lex:` plus `vec:`, but the scored artifacts do not expose comparable per-channel contribution. | +| Fusion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows fusion inputs, RRF/weighted-fusion contributions, or fusion-stage candidate drops. | +| Rerank attribution | `live_baseline_only` | `non_goal` | `non_goal` | The current qmd stress and materializer paths use `--no-rerank`; no rerank-on comparison is claimed. 
| +| Candidate-drop diagnostics | `research_gate` | `not_encoded` | `not_tested` | `retrieved_but_dropped` is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles. | +| Selected-but-not-narrated wrong results | `live_real_world` | `wrong_result` | `tie` | Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing. | +| Evidence-absent and tombstone diagnosis | `live_real_world` | `wrong_result` | `win` | ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone. | + +Summary: `1` ELF win, `3` ties, `2` ELF losses, `4` not-tested scenarios, `0` +blocked scenarios, and `1` non-goal scenario. The losses are local-debug artifact +losses only. They do not change the retrieval-correctness tie. + +## Stage Scoring Notes + +| Stage | Current score | Reason | +| --- | --- | --- | +| Expansion | `not_tested` | The current artifacts do not expose comparable expansion variants or dynamic expansion decisions. | +| Dense retrieval | `not_tested` | The systems have dense/vector surfaces, but no comparable scored dense-only contribution artifact. | +| Sparse retrieval | `not_tested` | qmd `lex:` and ELF BM25 are present in command or service design, but contribution and drops are not scored. | +| Fusion | `not_tested` | Fusion candidates and final fusion deltas are not materialized comparably. | +| Rerank | `non_goal` | qmd uses `--no-rerank` in the current path; rerank superiority is out of scope for this run. | +| Candidate drops | `not_tested` | No current report can prove retrieved-but-dropped evidence for qmd, and ELF candidate bundles are not hydrated into the stress artifact. 
| +| Selected-but-not-narrated | `tie` | Both systems have typed memory-evolution wrong-result rows where evidence is selected or available but not narrated as lifecycle history. | +| Replay commands | `loss` | qmd's local CLI replay is shorter and directly tied to top-10 JSON output. | + +## Typed Non-Pass States + +The report preserves the wrong-result classes from the June 11 diagnostics: + +| Class | Current coverage | +| --- | --- | +| `evidence_absent` | Observed for qmd on verdict caveat, preference rationale, and delete tombstone misses. | +| `retrieved_but_dropped` | Defined but `not_tested`; current artifacts do not expose enough candidate-stage data. | +| `selected_but_not_narrated` | Observed for both ELF and qmd on supersession and temporal-validity jobs. | +| `contradicted_by_lifecycle_evidence` | Observed when current, historical, supersession, or tombstone evidence makes the answer incomplete. | + +These states are typed evidence, not leaderboard shortcuts. A `wrong_result` with +good evidence recall is still a wrong result. + +## Claim Boundaries + +Allowed: + +- ELF and qmd remain tied on encoded retrieval correctness. +- qmd currently wins the default local-debug artifact surface: top-10 rows plus short + CLI replay. +- ELF has useful service trace/admin replay surfaces, but they are not yet hydrated + into the default stress report as qmd-like candidate artifacts. +- ELF narrowly wins the memory-evolution evidence-retention slice because qmd misses + the delete tombstone and two other required evidence links. +- Expansion, dense/sparse contribution, fusion, rerank-on quality, and + retrieved-but-dropped candidate diagnosis remain unproven. + +Not allowed: + +- Do not claim qmd beats ELF as a memory system overall. +- Do not claim ELF beats qmd retrieval overall. +- Do not turn qmd top-10 ergonomics into a retrieval-quality win. 
+- Do not treat ELF trace/admin endpoint availability as proof that the default + benchmark report has qmd-level candidate visibility. +- Do not score rerank superiority from a qmd `--no-rerank` run. +- Do not collapse `not_tested`, `non_goal`, or `wrong_result` into pass evidence. + +## Follow-Up Gate + +The next measurement should emit one candidate-replay artifact per suspicious query +with: + +1. Expansion variants and whether the original query was included. +2. Dense-only and sparse-only candidate sets. +3. Fusion rank and score contribution. +4. Rerank score, or an explicit rerank-disabled marker. +5. Final selected items. +6. Dropped or demoted expected evidence. +7. A one-command replay line for both ELF and qmd. + +Until that exists, the current evidence supports a qmd local-debug artifact win, not a +broad product or retrieval win. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index b462818e..efab4bb0 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -74,6 +74,10 @@ cleanup, use `docs/guide/single_user_production.md`. report that separates qmd retrieval quality from debug/replay ergonomics, records qmd wrong-result diagnosis classes, and preserves OpenViking context-trajectory surfaces as `not_tested` until staged/hierarchical evidence is encoded. +- `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: XY-923 trace-level + replay and wrong-result diagnostics report that scores qmd top-10/replay artifact + ergonomics against ELF trace/admin surfaces while keeping retrieval correctness, + rerank, fusion, candidate-drop, and typed non-pass boundaries separate. 
- `2026-06-11-first-generation-oss-adapter-promotion-report.md`: XY-898 first-generation OSS adapter promotion report that updates agentmemory, mem0/OpenMemory, memsearch, and claude-mem with fresh scenario-level baseline diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index e9fbb3e6..9226f5ca 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested: qmd replay/debug UX, mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation." + "Several competitor strengths remain not_tested: mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation. The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." ] }, "evidence_class_terms": [ @@ -65,6 +65,11 @@ "command": "cargo make baseline-production-synthetic, cargo make baseline-backfill-docker, backup/restore plus Qdrant rebuild proof", "artifact": "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", "claim": "ELF has provider synthetic, stress, backfill, restore, and rebuild evidence, while private-corpus proof remains blocked by missing operator-owned manifest." 
+ }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker plus ELF trace-bundle and qmd CLI replay commands", + "artifact": "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "claim": "Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact." } ], "scenario_outcomes": [ @@ -122,15 +127,16 @@ { "scenario_id": "local_debug_replay_ux", "title": "Retrieval quality and local debug UX", - "outcome": "not_tested", - "evidence_classes": ["live_baseline_only", "research_gate", "not_encoded"], - "measured_claim": "qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces.", + "outcome": "loss", + "evidence_classes": ["live_baseline_only", "research_gate", "wrong_result", "not_encoded"], + "measured_claim": "The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" ], "follow_up_issues": ["XY-923"], - "caveat": "No ELF loss is claimed until comparable replay and candidate-diagnosis evidence is scored." + "caveat": "The loss is a local-debug artifact loss only; retrieval correctness remains tied and no broad qmd-over-ELF memory-system claim is allowed." 
}, { "scenario_id": "memory_evolution_temporal_history", @@ -344,6 +350,7 @@ ], "not_allowed": [ "Do not claim ELF broadly beats qmd.", + "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.", "Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or graph memory.", "Do not claim ELF beats OpenViking on staged context trajectory.", "Do not claim ELF beats Letta on core-vs-archival memory.", diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json new file mode 100644 index 00000000..ebc095d2 --- /dev/null +++ b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json @@ -0,0 +1,293 @@ +{ + "schema": "elf.trace_replay_diagnostics_report/v1", + "run_id": "2026-06-11-elf-qmd-trace-replay-diagnostics", + "authority": "XY-923", + "created_at": "2026-06-11", + "scope": "ELF versus qmd trace-level replay and wrong-result diagnostics, with retrieval correctness kept as a separate guardrail.", + "inputs": [ + "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json", + "scripts/live-baseline-benchmark.sh", + "apps/elf-eval/src/app.rs", + "docs/spec/system_elf_memory_service_v2.md" + ], + "outcome_terms": [ + "win", + "tie", + "loss", + "not_tested", + "blocked", + "non_goal" + ], + "result_type_terms": [ + "pass", + "wrong_result", + "blocked", + "not_encoded", + "non_goal" + ], + "summary": { + "retrieval_correctness": "tie", + "debug_ergonomics": "qmd wins the current default top-10 candidate artifact 
and short replay-command surfaces.", + "elf_trace_position": "ELF has service trace, admin bundle, and trace replay surfaces, but they are not hydrated into the default stress report as qmd-like candidate artifacts.", + "outcome_counts": { + "win": 1, + "tie": 3, + "loss": 2, + "not_tested": 4, + "blocked": 0, + "non_goal": 1 + } + }, + "commands": [ + { + "system": "ELF", + "purpose": "stress retrieval guardrail with trace ids", + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "system": "ELF", + "purpose": "admin trace bundle hydration", + "command": "curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant_id>' -H 'X-ELF-Project-Id: <project_id>' -H 'X-ELF-Agent-Id: <agent_id>'", + "status": "available_not_hydrated_in_default_stress_report", + "artifact": "elf.trace_bundle/v1 admin response" + }, + { + "system": "ELF", + "purpose": "trace ranking replay from persisted candidates", + "command": "cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>", + "status": "available_not_run_for_the_checked_in_stress_report", + "artifact": "elf-eval trace compare JSON" + }, + { + "system": "qmd", + "purpose": "stress retrieval guardrail plus top-10 rows", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "artifact": "tmp/live-baseline/qmd-query.json" + }, + { + "system": "qmd", + "purpose": "per-query replay", + "command": "npx tsx src/cli/qmd.ts query 'lex: <query>\\nvec: <query>' -c elfbench --json --no-rerank --min-score 0 -n 10", + "status": "pass_in_baseline_driver", + "artifact": "tmp/live-baseline/qmd-query.json" + }, + { + "system": "qmd", + "purpose": "lifecycle replay", + "command": "npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f
-c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank", + "status": "pass_for_update_delete_cold_start_checks", + "artifact": "tmp/live-baseline/qmd-query.json" + } + ], + "scenario_outcomes": [ + { + "scenario_id": "retrieval_correctness_guardrail", + "surface": "retrieval correctness", + "evidence_class": "live_real_world_and_live_baseline_only", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics.", + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", + "tmp/live-baseline/live-baseline-report.json" + ] + }, + { + "scenario_id": "default_top10_candidate_artifact", + "surface": "default top-10 candidate artifact", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "elf_status": "not_encoded", + "qmd_status": "pass", + "outcome": "loss", + "diagnostic_judgment": "qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report.", + "artifacts": [ + "tmp/live-baseline/qmd-query.json", + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json" + ] + }, + { + "scenario_id": "replay_command_locality", + "surface": "replay command locality", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "elf_status": "not_encoded", + "qmd_status": "pass", + "outcome": "loss", + "diagnostic_judgment": "qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids.", + "artifacts": [ + "scripts/live-baseline-benchmark.sh", + "apps/elf-eval/src/app.rs", + "docs/spec/system_elf_memory_service_v2.md" + ] + }, + { + "scenario_id": "trace_admin_replay_surface_availability", + "surface": "trace/admin replay surface availability", + "evidence_class": 
"implementation_reference", + "result_type": "not_encoded", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "ELF has admin trace bundles and elf-eval trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality.", + "artifacts": [ + "docs/spec/system_elf_memory_service_v2.md", + "apps/elf-eval/src/app.rs", + "scripts/live-baseline-benchmark.sh" + ] + }, + { + "scenario_id": "query_expansion_attribution", + "surface": "query expansion attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "No comparable artifact shows expansion variants or dynamic expansion decisions for both systems.", + "artifacts": [ + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "dense_sparse_channel_attribution", + "surface": "dense/sparse channel attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "ELF uses dense plus BM25 and qmd uses structured lex plus vec, but the scored artifacts do not expose comparable per-channel contribution.", + "artifacts": [ + "docs/spec/system_elf_memory_service_v2.md", + "scripts/live-baseline-benchmark.sh" + ] + }, + { + "scenario_id": "fusion_attribution", + "surface": "fusion attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "No comparable artifact shows fusion inputs, RRF or weighted-fusion contribution, or fusion-stage candidate drops.", + "artifacts": [ + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "rerank_attribution", 
+ "surface": "rerank attribution", + "evidence_class": "live_baseline_only", + "result_type": "non_goal", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "non_goal", + "diagnostic_judgment": "The current qmd stress and materializer paths use --no-rerank; no rerank-on comparison is claimed.", + "artifacts": [ + "scripts/live-baseline-benchmark.sh", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "candidate_drop_diagnostics", + "surface": "candidate-drop diagnostics", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "retrieved_but_dropped is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles.", + "typed_non_pass_states": [ + "retrieved_but_dropped" + ], + "artifacts": [ + "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + }, + { + "scenario_id": "selected_but_not_narrated_wrong_results", + "surface": "selected-but-not-narrated wrong-result diagnosis", + "evidence_class": "live_real_world", + "result_type": "wrong_result", + "elf_status": "wrong_result", + "qmd_status": "wrong_result", + "outcome": "tie", + "diagnostic_judgment": "Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing.", + "typed_non_pass_states": [ + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + }, + { + "scenario_id": "evidence_absent_tombstone_diagnostics", + "surface": "evidence-absent and tombstone diagnosis", + "evidence_class": "live_real_world", + "result_type": "wrong_result", + 
"elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone.", + "typed_non_pass_states": [ + "evidence_absent", + "contradicted_by_lifecycle_evidence" + ], + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + } + ], + "wrong_result_diagnostics": { + "typed_non_pass_states": [ + { + "class": "evidence_absent", + "coverage": "observed_for_qmd", + "meaning": "Required evidence is absent from produced evidence ids." + }, + { + "class": "retrieved_but_dropped", + "coverage": "not_tested", + "meaning": "Required evidence appears in an intermediate candidate set but is absent from the final selected or narrated answer." + }, + { + "class": "selected_but_not_narrated", + "coverage": "observed_for_elf_and_qmd", + "meaning": "Evidence is selected or available, but the answer does not narrate the required lifecycle relationship." + }, + { + "class": "contradicted_by_lifecycle_evidence", + "coverage": "observed_for_elf_and_qmd", + "meaning": "The answer is contradicted or made incomplete by current, historical, supersession, or tombstone evidence." 
+ } + ], + "qmd_missing_evidence": [ + "verdict-bounded-private-caveat", + "pref-current-concise-rationale", + "delete-tombstone" + ] + }, + "claim_boundaries": [ + "ELF and qmd remain tied on encoded retrieval correctness.", + "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay.", + "ELF trace/admin endpoint availability is not proof that the default benchmark report has qmd-level candidate visibility.", + "Rerank superiority is not scored from a qmd --no-rerank run.", + "Expansion, dense/sparse contribution, fusion, and retrieved-but-dropped candidate diagnostics remain not_tested.", + "Do not claim qmd beats ELF as a memory system overall.", + "Do not collapse not_tested, non_goal, or wrong_result into pass evidence." + ] +}