Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ Detailed evidence and interpretation:
- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md)
- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
- [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md)
- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Single-User Production Runbook](docs/guide/single_user_production.md)
Expand Down Expand Up @@ -269,6 +270,7 @@ Detailed comparison, mechanism-level analysis, and source map:
- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md)
- [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md)
- [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md)
- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md)
- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)
Expand Down
175 changes: 175 additions & 0 deletions apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,36 @@ fn retrieval_debug_profile_json_path() -> Result<PathBuf> {
.join("2026-06-11-elf-qmd-retrieval-debug-profile.json"))
}

/// Builds the path of the ELF/qmd trace replay diagnostics JSON report.
///
/// The result is `docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json`
/// appended to whatever `workspace_root()` returns. Any error from
/// `workspace_root()` is propagated unchanged.
fn trace_replay_diagnostics_report_path() -> Result<PathBuf> {
    let root = workspace_root()?;
    let research_dir = root.join("docs").join("research");

    Ok(research_dir.join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json"))
}

/// Builds the path of the ELF/qmd trace replay diagnostics markdown report.
///
/// The relative location `docs/guide/benchmarking/<report>.md` is assembled
/// segment by segment and then appended to the value of `workspace_root()`;
/// an error from `workspace_root()` is returned to the caller.
fn trace_replay_diagnostics_markdown_path() -> Result<PathBuf> {
    let relative: PathBuf = [
        "docs",
        "guide",
        "benchmarking",
        "2026-06-11-elf-qmd-trace-replay-diagnostics-report.md",
    ]
    .iter()
    .collect();

    Ok(workspace_root()?.join(relative))
}

/// Builds the path of the competitor strength adoption markdown report.
///
/// Joins `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`
/// onto the value of `workspace_root()`, forwarding any error it reports.
fn competitor_strength_adoption_report_path() -> Result<PathBuf> {
    let root = workspace_root()?;
    let benchmarking_dir = root.join("docs").join("guide").join("benchmarking");
    let report = benchmarking_dir.join("2026-06-11-competitor-strength-adoption-report.md");

    Ok(report)
}

/// Builds the path of the competitor strength adoption JSON report.
///
/// The relative location `docs/research/<report>.json` is collected from its
/// segments and appended to the value of `workspace_root()`; an error from
/// `workspace_root()` is returned to the caller.
fn competitor_strength_adoption_report_json_path() -> Result<PathBuf> {
    let relative: PathBuf =
        ["docs", "research", "2026-06-11-competitor-strength-adoption-report.json"]
            .iter()
            .collect();

    Ok(workspace_root()?.join(relative))
}

fn competitor_strength_matrix_path() -> Result<PathBuf> {
Ok(workspace_root()?
.join("docs")
Expand Down Expand Up @@ -1404,6 +1434,151 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> {
Ok(())
}

/// Checks that the trace replay diagnostics report, its markdown rendering,
/// the README, the benchmarking index, and the competitor strength adoption
/// report all carry the expected claim-boundary text.
///
/// Every artifact is read from disk first; a read or JSON parse error makes
/// the test return `Err`, while a content mismatch panics via `assert!`.
#[test]
fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> {
    // Load all inputs up front, in a fixed order, before asserting anything.
    let report_text = fs::read_to_string(trace_replay_diagnostics_report_path()?)?;
    let report: Value = serde_json::from_str(&report_text)?;
    let markdown = fs::read_to_string(trace_replay_diagnostics_markdown_path()?)?;
    let readme = fs::read_to_string(readme_path()?)?;
    let index = fs::read_to_string(benchmarking_index_path()?)?;
    let adoption_markdown = fs::read_to_string(competitor_strength_adoption_report_path()?)?;
    let adoption_text = fs::read_to_string(competitor_strength_adoption_report_json_path()?)?;
    let adoption_json: Value = serde_json::from_str(&adoption_text)?;

    // The report itself: structured JSON first, then its markdown rendering.
    assert_trace_replay_diagnostics_json(&report)?;
    assert_trace_replay_diagnostics_markdown(&markdown);

    // The report must be linked from the README.
    let readme_links_report =
        readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026");
    assert!(readme_links_report);

    // The benchmarking index must reference the report and summarize both sides.
    let index_snippets = [
        "2026-06-11-elf-qmd-trace-replay-diagnostics-report.md",
        "qmd top-10/replay artifact",
        "ELF trace/admin surfaces",
    ];
    for snippet in index_snippets.iter() {
        assert!(index.contains(*snippet), "benchmarking index is missing {:?}", snippet);
    }

    // The adoption report records the loss but forbids over-claiming it.
    let records_loss =
        adoption_markdown.contains("| Retrieval quality and local debug UX | `loss` |");
    assert!(records_loss);
    let forbids_broad_claim = adoption_markdown
        .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF");
    assert!(forbids_broad_claim);

    assert_trace_replay_adoption_json(&adoption_json)?;

    Ok(())
}

fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> {
assert_eq!(
report.pointer("/schema").and_then(Value::as_str),
Some("elf.trace_replay_diagnostics_report/v1")
);
assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923"));
assert_eq!(
string_array_at(report, "/outcome_terms")?,
["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned)
);
assert_eq!(
report.pointer("/summary/retrieval_correctness").and_then(Value::as_str),
Some("tie")
);
assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2));
assert_eq!(
report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64),
Some(4)
);
assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1));

let scenarios = array_at(report, "/scenario_outcomes")?;
let retrieval = find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?;
let top10 = find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?;
let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?;
let trace_surface =
find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?;
let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?;
let dense_sparse =
find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?;
let fusion = find_by_field(scenarios, "/scenario_id", "fusion_attribution")?;
let rerank = find_by_field(scenarios, "/scenario_id", "rerank_attribution")?;
let candidate_drop = find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?;
let selected =
find_by_field(scenarios, "/scenario_id", "selected_but_not_narrated_wrong_results")?;
let tombstone =
find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?;

assert_eq!(scenarios.len(), 11);
assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie"));
assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss"));
assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss"));
assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie"));
assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal"));
assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), Some("non_goal"));
assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested"));
assert!(array_contains_str(candidate_drop, "/typed_non_pass_states", "retrieved_but_dropped")?);
assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result"));
assert!(array_contains_str(selected, "/typed_non_pass_states", "selected_but_not_narrated")?);
assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win"));
assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result"));
assert!(array_contains_str(
report,
"/wrong_result_diagnostics/qmd_missing_evidence",
"delete-tombstone"
)?);
assert!(array_contains_str(
report,
"/claim_boundaries",
"qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay."
)?);
assert!(array_contains_str(
report,
"/claim_boundaries",
"Do not claim qmd beats ELF as a memory system overall."
)?);

Ok(())
}

/// Asserts that the trace replay diagnostics markdown contains every
/// required snippet.
///
/// Snippets are checked in a fixed order and the function panics on the
/// first one that `markdown` does not contain; the panic message quotes the
/// missing snippet.
fn assert_trace_replay_diagnostics_markdown(markdown: &str) {
    let required_snippets: [&str; 10] = [
        "Retrieval correctness is still tied",
        "| Default top-10 candidate artifact |",
        "| Replay command locality |",
        "| Rerank attribution | `live_baseline_only` | `non_goal` |",
        "| Candidate-drop diagnostics | `research_gate` | `not_encoded` |",
        "`retrieved_but_dropped` | Defined but `not_tested`",
        "npx tsx src/cli/qmd.ts query",
        "cargo run -p elf-eval -- --config-a",
        "Do not claim qmd beats ELF as a memory system overall",
        "Do not score rerank superiority from a qmd `--no-rerank` run",
    ];

    for snippet in required_snippets.iter() {
        assert!(
            markdown.contains(*snippet),
            "assertion failed: markdown.contains({:?})",
            snippet
        );
    }
}

/// Asserts that the competitor strength adoption JSON records the
/// `local_debug_replay_ux` scenario as a `loss`, cites the trace replay
/// diagnostics report as a command artifact, and lists the claim that must
/// not be made.
///
/// A mismatch panics; an error from `array_at`, `find_by_field`, or
/// `array_contains_str` is returned to the caller.
fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> {
    let outcomes = array_at(adoption, "/scenario_outcomes")?;
    let debug_ux = find_by_field(outcomes, "/scenario_id", "local_debug_replay_ux")?;

    let outcome = debug_ux.pointer("/outcome").and_then(Value::as_str);
    assert_eq!(outcome, Some("loss"));

    // The measured claim must exist, be a string, and mention qmd's top-10 strength.
    let measured_claim = debug_ux.pointer("/measured_claim").and_then(Value::as_str);
    assert!(matches!(
        measured_claim,
        Some(claim) if claim.contains("qmd stronger on immediate top-10")
    ));

    let cites_report = array_contains_str(
        debug_ux,
        "/command_artifacts",
        "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md",
    )?;
    assert!(cites_report);

    let forbids_broad_claim = array_contains_str(
        adoption,
        "/claim_boundaries/not_allowed",
        "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.",
    )?;
    assert!(forbids_broad_claim);

    Ok(())
}

fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> {
let projects = array_at(matrix, "/project_matrix")?;
let qmd = find_by_field(projects, "/project", "qmd")?;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ The remaining caveats are material:
exists.
- Credentialed provider production-ops gates are blocked until explicit provider
setup exists.
- Several competitor strengths remain `not_tested`: qmd replay/debug UX,
mem0/OpenMemory history/UI, OpenViking trajectory, Letta core-vs-archival memory,
and graph/RAG navigation.
- Several competitor strengths remain `not_tested`: mem0/OpenMemory history/UI,
OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation.
The XY-923 follow-up now scores qmd's immediate top-10/replay artifact ergonomics
as stronger than ELF's default stress report, while expansion, fusion, rerank, and
candidate-drop diagnosis remain untested.

## Evidence Classes

Expand Down Expand Up @@ -68,6 +70,7 @@ results, or lifecycle failures into one aggregate leaderboard.
| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. |
| `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. |
| `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. |
| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. |

## Scenario Matrix

Expand All @@ -77,7 +80,7 @@ results, or lifecycle failures into one aggregate leaderboard.
| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 |
| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 |
| Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 |
| Retrieval quality and local debug UX | `not_tested` | `live_baseline_only`, `research_gate`, `not_encoded` | qmd remains the local retrieval-debug UX reference, but no scored rule compares qmd top-10/replay artifacts with ELF trace/admin bundle surfaces. | XY-923 |
| Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 |
| Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. | XY-905 |
| Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 |
| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 |
Expand Down Expand Up @@ -120,6 +123,8 @@ results, or lifecycle failures into one aggregate leaderboard.
## Claims Not Allowed

- Do not claim ELF broadly beats qmd.
- Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system
or retrieval-quality win.
- Do not claim ELF beats mem0/OpenMemory on history, UI/export, hosted behavior, or
graph memory.
- Do not claim ELF beats OpenViking on staged context trajectory.
Expand All @@ -128,4 +133,3 @@ results, or lifecycle failures into one aggregate leaderboard.
- Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`,
`research_gate`, `blocked`, `wrong_result`, `lifecycle_fail`, `unsupported`, or
`not_encoded` states into a generic pass/fail score.

Loading