From ecf78e731bc240a61237ead3e83a2fc57ad125f3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 17 Jun 2026 10:01:32 +0800 Subject: [PATCH 1/2] Add Dreaming competitor retest closeout --- README.md | 18 +- ...06-16-dreaming-readiness-stage-ledger.json | 22 +- ...ing-competitor-strength-retest-report.json | 504 ++++++++++++++++++ .../tests/real_world_job_benchmark.rs | 278 +++++++++- ...6-06-16-dreaming-readiness-stage-ledger.md | 10 +- ...aming-competitor-strength-retest-report.md | 131 +++++ docs/evidence/benchmarking/index.md | 1 + 7 files changed, 954 insertions(+), 10 deletions(-) create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json create mode 100644 docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md diff --git a/README.md b/README.md index 3628775b..aa49d41c 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,15 @@ provider-backed ELF evidence was required. does not create a managed-memory parity claim. The new `proactive_brief` fixture scores 5 jobs, with 4 pass and 1 blocked private-corpus case; it does not create Pulse or hosted managed-memory parity. +- Dreaming competitor-strength closeout after XY-955: the June 17 retest keeps ELF + locally and partially stronger only. The aggregate fixture retest remains 53 pass + and 7 typed blockers, the representative graph/RAG slice remains typed non-pass, + first-generation OSS fixture coverage remains 4 pass and 2 blocked, and the fresh + full live-adapter rerun reports ELF at 40 pass/0 wrong_result versus qmd at 17 + pass/13 wrong_result while preserving qmd's separate debug-ergonomics edge. This + rejects broad superiority claims and leaves qmd debug ergonomics, + OpenViking trajectory, Letta core/archive, graph/RAG quality, and XY-930 + private/provider gates as follow-up work. 
- Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original @@ -275,6 +284,7 @@ Detailed evidence and interpretation: - [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md) - [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: @@ -358,6 +368,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md) - [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md) @@ -369,9 +380,10 @@ Detailed comparison, 
mechanism-level analysis, and source map: - [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md) - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) -Latest real-world benchmark report: June 16, 2026. Latest external research refresh: -June 11, 2026; June 16 adds live temporal reconciliation, live consolidation -self-check evidence, and fixture-backed scheduled-memory task scoring. +Latest real-world benchmark report: June 17, 2026. Latest external research refresh: +June 11, 2026; June 17 adds the Dreaming competitor-strength closeout retest and +optimization queue after the June 16 temporal reconciliation, live consolidation +self-check, proactive-brief, and scheduled-memory scoring evidence. ## Documentation diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json index cbd7c1ed..bd5116e6 100644 --- a/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json @@ -534,10 +534,11 @@ } ], "evidence_files": [ - "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", - "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md", + "apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json" ], "baseline_counts": { "pass": 22, @@ -547,6 +548,23 @@ "not_encoded": 11 }, "baseline_basis": "ELF 
full live real-world sweep: 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The not_encoded jobs are represented as not_tested for this stage gate while preserving the raw not_encoded count.", + "post_stage_counts": { + "pass": 40, + "wrong_result": 0, + "blocked": 7, + "not_tested": 0, + "not_encoded": 19, + "incomplete": 0 + }, + "post_stage_basis": "XY-955 closeout retest: aggregate fixture retest passes 53/60 with 7 typed blockers; representative graph/RAG remains 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked; first-generation OSS fixture slice is 4 pass and 2 blocked; ELF live adapter materialization is 40 pass, 0 wrong_result, 7 blocked, and 19 not_encoded; qmd live adapter materialization is 17 pass, 13 wrong_result, 7 blocked, and 29 not_encoded; private/provider gates remain under XY-930.", + "qmd_post_stage_counts": { + "pass": 17, + "wrong_result": 13, + "blocked": 7, + "not_tested": 0, + "not_encoded": 29, + "incomplete": 0 + }, "comparison_judgment": "unchanged", "regression_rule": "Any higher wrong_result/blocked/not_tested count, missing typed blocker, or unsupported broad competitor win claim is a regression.", "improvement_rule": "An improvement requires reduced live wrong_result or not_tested counts with no weakened evidence-class boundary and no private/provider claim without inputs.", diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json new file mode 100644 index 00000000..13b4ec0d --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json @@ -0,0 +1,504 @@ +{ + "schema": "elf.dreaming_competitor_strength_retest_report/v1", + "report_id": "xy-955-dreaming-competitor-strength-retest-2026-06-17", + "authority": "XY-955", + "created_at": "2026-06-17T00:00:00Z", + "purpose": "Close out the Dreaming-readiness 
benchmark program pass by comparing the XY-951 baseline and downstream stage evidence against a fresh local/public retest without converting blockers or fixture-only evidence into wins.", + "source_evidence_cutoff": "2026-06-17", + "source_baseline": { + "stage_ledger": "apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json", + "competitor_strength_adoption_report": "apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json", + "competitor_strength_matrix": "apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-897-competitor-strength-matrix.json" + }, + "judgment_terms": [ + "improved", + "regressed", + "unchanged", + "blocked", + "not_tested" + ], + "status_terms": [ + "pass", + "wrong_result", + "blocked", + "not_tested", + "not_encoded", + "incomplete", + "typed_non_pass", + "non_goal", + "product_reference" + ], + "summary": { + "overall_judgment": "locally_and_partially_stronger_only", + "broader_superiority": "not_proven", + "improved_stage_count": 6, + "regressed_stage_count": 0, + "unchanged_stage_count": 2, + "blocked_stage_count": 0, + "not_tested_stage_count": 0, + "unsupported_claims_rejected": [ + "ELF does not broadly beat qmd from this retest.", + "ELF does not beat mem0/OpenMemory on UI/export, hosted Platform behavior, or optional graph memory.", + "ELF does not beat Letta on core/archive memory until a contained Letta export/readback runner exists.", + "ELF does not beat OpenViking on staged trajectory, hierarchy selection, or recursive context expansion.", + "ELF does not prove graph/RAG citation or navigation parity from representative typed non-pass fixtures.", + "ELF does not prove private-corpus or credentialed provider quality without XY-930 operator inputs." 
+ ] + }, + "commands": [ + { + "command": "cargo make real-world-memory", + "status": "pass", + "artifact": "tmp/real-world-memory/real-world-memory-report.json", + "summary": { + "job_count": 60, + "pass": 53, + "wrong_result": 0, + "blocked": 7, + "not_encoded": 0, + "mean_score": 0.883, + "evidence_coverage": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0 + } + }, + { + "command": "cargo make real-world-memory-graph-rag", + "status": "pass", + "artifact": "tmp/real-world-memory/graph-rag/report.json", + "summary": { + "job_count": 5, + "pass": 0, + "wrong_result": 1, + "incomplete": 1, + "blocked": 3, + "not_encoded": 0, + "evidence_coverage": 0.25 + } + }, + { + "command": "cargo make real-world-first-generation-oss", + "status": "pass", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "summary": { + "job_count": 6, + "pass": 4, + "wrong_result": 0, + "blocked": 2, + "not_encoded": 0, + "evidence_coverage": 1.0 + } + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "artifact": "tmp/real-world-memory/live-adapters/summary.json", + "partial_summary": { + "elf_live_real_world": { + "job_count": 66, + "pass": 40, + "wrong_result": 0, + "blocked": 7, + "not_encoded": 19, + "incomplete": 0, + "mean_score": 0.606, + "evidence_coverage": 0.571, + "memory_evolution_status": "pass", + "consolidation_status": "pass", + "knowledge_compilation_status": "pass", + "operator_debugging_ux_status": "pass", + "capture_integration_status": "pass", + "proactive_brief_status": "blocked", + "scheduled_memory_status": "blocked", + "production_ops_status": "blocked", + "context_trajectory_status": "blocked" + }, + "qmd_live_real_world": { + "job_count": 66, + "pass": 17, + "wrong_result": 13, + "blocked": 7, + "not_encoded": 29, + "incomplete": 0, + "mean_score": 0.352, + "evidence_coverage": 0.379, + "memory_evolution_status": "wrong_result", + "retrieval_status": "pass", + "work_resume_status": "pass", + 
"project_decisions_status": "pass", + "operator_debugging_ux_status": "wrong_result", + "production_ops_status": "blocked", + "context_trajectory_status": "blocked" + } + } + } + ], + "stage_closeout": [ + { + "stage_id": "current_vs_historical_correctness", + "baseline_counts": { + "pass": 1, + "wrong_result": 5, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "current_counts": { + "pass": 6, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json", + "tmp/real-world-memory/live-adapters/elf-report.json" + ], + "boundary": "Improved for the encoded ELF live memory_evolution slice; not a Graphiti/Zep, mem0/OpenMemory, Letta, private-corpus, or broad qmd superiority claim." + }, + { + "stage_id": "preference_evolution", + "baseline_counts": { + "pass": 0, + "wrong_result": 1, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "current_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json", + "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" + ], + "boundary": "ELF's encoded current-vs-historical preference case improved; mem0/OpenMemory local OSS history remains a measured strength and UI/export remains separately blocked or non-goal." 
+ }, + { + "stage_id": "deletion_ttl_tombstone_behavior", + "baseline_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "current_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "unchanged", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json" + ], + "boundary": "The single encoded tombstone/TTL job remains passing; broader update/delete/recreate history remains follow-up work." + }, + { + "stage_id": "reviewable_consolidation", + "baseline_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "current_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-consolidation-proposal-scoring-report.json", + "tmp/real-world-memory/live-adapters/elf-report.json" + ], + "boundary": "ELF has live service-backed self-check evidence; direct competitor consolidation runners remain untested or product-reference only." + }, + { + "stage_id": "memory_summary_top_of_mind_behavior", + "baseline_counts": { + "pass": 8, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "current_counts": { + "pass": 9, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "tmp/real-world-memory/real-world-memory-report.json", + "docs/spec/system_memory_summary_v1.md" + ], + "boundary": "Improved as fixture-backed source-trace contract evidence only; service-native top-of-mind behavior remains future work." 
+ }, + { + "stage_id": "proactive_brief_readiness", + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "current_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-proactive-brief-scoring-report.json", + "tmp/real-world-memory/real-world-memory-report.json" + ], + "boundary": "Improved as fixture-backed proactive brief scoring only; private-corpus refresh stays blocked under XY-930 and Pulse parity is not proven." + }, + { + "stage_id": "scheduled_memory_task_readiness", + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "current_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "judgment": "improved", + "evidence": [ + "apps/elf-eval/fixtures/report_snapshots/2026-06-16-scheduled-memory-task-scoring-report.json", + "tmp/real-world-memory/real-world-memory-report.json" + ], + "boundary": "Improved as fixture-backed scheduled task readback only; hosted scheduler, notification, provider-backed private-corpus, and silent-mutation parity are not proven." 
+ }, + { + "stage_id": "final_competitor_retest_status", + "baseline_counts": { + "pass": 22, + "wrong_result": 5, + "blocked": 2, + "not_tested": 11, + "not_encoded": 11 + }, + "current_counts": { + "pass": 40, + "wrong_result": 0, + "blocked": 7, + "not_tested": 0, + "not_encoded": 19, + "incomplete": 0 + }, + "judgment": "unchanged", + "evidence": [ + "tmp/real-world-memory/live-adapters/elf-report.json", + "tmp/real-world-memory/live-adapters/qmd-report.json", + "tmp/real-world-memory/live-adapters/summary.json", + "tmp/real-world-memory/graph-rag/report.json", + "tmp/real-world-memory/first-generation-oss/report.json" + ], + "qmd_current_counts": { + "pass": 17, + "wrong_result": 13, + "blocked": 7, + "not_tested": 0, + "not_encoded": 29, + "incomplete": 0 + }, + "boundary": "ELF live wrong_result count improved in the fresh local retest and qmd now has a fresh scored live report, but qmd debug ergonomics remains a measured ELF loss, Graph/RAG remains typed non-pass, OpenViking/Letta/private/provider gates remain blocked or not tested, and broader superiority is not proven." + } + ], + "scenario_retests": [ + { + "scenario_id": "qmd_debug_ergonomics", + "baseline_outcome": "loss", + "current_outcome": "unchanged", + "current_status": "pass", + "evidence_class": "live_baseline_only", + "evidence": "Fresh qmd live materialization produced a scored full-suite report with 17 pass, 13 wrong_result, 7 blocked, and 29 not_encoded jobs. 
That full-suite typed non-pass result does not retest or erase the prior qmd top-10/replay ergonomics advantage, which remains the authoritative debug-ergonomics evidence.", + "follow_up": "XY-923" + }, + { + "scenario_id": "mem0_openmemory_preference_history_export", + "baseline_outcome": "loss_for_preference_history_tie_for_scoped_personalization_blocked_for_ui_export", + "current_outcome": "unchanged", + "current_status": "blocked", + "evidence_class": "live_baseline_only", + "evidence": "ELF temporal preference case improved, but mem0/OpenMemory local OSS history/export-style readback evidence remains stronger for history and OpenMemory UI/export remains setup-blocked.", + "follow_up": "XY-930 for private/provider gates; dedicated OpenMemory UI/export runner remains a measured blocker." + }, + { + "scenario_id": "letta_core_archive", + "baseline_outcome": "blocked", + "current_outcome": "unchanged", + "current_status": "blocked", + "evidence_class": "fixture_backed", + "evidence": "ELF core_archival_memory fixture passes locally, but no contained Letta export/readback artifact maps core and archival source ids.", + "follow_up": "Decodex-ready issue brief: build a contained Letta core/archive export-readback adapter before win/tie/loss claims." + }, + { + "scenario_id": "graphiti_zep_temporal_graph_validity", + "baseline_outcome": "blocked", + "current_outcome": "unchanged", + "current_status": "blocked", + "evidence_class": "research_gate", + "evidence": "Graphiti/Zep remains blocked by provider/graph-store setup in the representative Graph/RAG slice; ELF temporal reconciliation improvement does not prove graph temporal validity parity.", + "follow_up": "Graph/RAG adapter follow-up; provider inputs remain explicit." 
+ }, + { + "scenario_id": "openviking_trajectory_hierarchy", + "baseline_outcome": "blocked", + "current_outcome": "unchanged", + "current_status": "blocked", + "evidence_class": "fixture_backed", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive expansion remain encoded blocked fixtures behind missing evidence-bearing staged artifacts.", + "follow_up": "XY-928" + }, + { + "scenario_id": "graph_rag_citation_navigation_knowledge_surfaces", + "baseline_outcome": "not_tested", + "current_outcome": "unchanged", + "current_status": "typed_non_pass", + "evidence_class": "fixture_backed", + "evidence": "The representative graph/RAG retest has 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked jobs; this is not graph/RAG quality parity.", + "follow_up": "XY-929" + }, + { + "scenario_id": "private_provider_production_gates", + "baseline_outcome": "blocked", + "current_outcome": "unchanged", + "current_status": "blocked", + "evidence_class": "blocked", + "evidence": "No operator-owned private manifest or explicit credentialed provider setup was supplied in this lane.", + "follow_up": "XY-930" + } + ], + "optimization_queue": [ + { + "priority": "P0", + "issue": "XY-923", + "status": "existing", + "brief": "Re-run qmd trace/replay diagnostics with comparable immediate top-k/replay, expansion, fusion, rerank, and candidate-drop artifacts; preserve the qmd debug ergonomics loss unless ELF produces comparable artifacts." + }, + { + "priority": "P1", + "issue": "XY-930", + "status": "existing", + "brief": "Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist; otherwise keep typed blockers." + }, + { + "priority": "P1", + "issue": "XY-928", + "status": "existing", + "brief": "Materialize OpenViking staged trajectory, hierarchy selection, and recursive expansion evidence before claiming ELF ties or beats those strengths." 
+ }, + { + "priority": "P1", + "issue": "letta-core-archive-adapter-brief", + "status": "proposed", + "brief": "Add a contained Letta core/archive export-readback adapter that emits source ids for core blocks and archival memories; non-goals are ELF product changes and broad Letta claims." + }, + { + "priority": "P2", + "issue": "XY-929", + "status": "existing", + "brief": "Promote Graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs; keep blocked, incomplete, wrong_result, and not_tested states typed." + }, + { + "priority": "P2", + "issue": "service-native-dreaming-outputs-brief", + "status": "proposed", + "brief": "Move fixture-backed memory summary, proactive brief, and scheduled task contracts into service-native readback/materialization with the same source-ref, freshness, rationale, trace, and no-source-mutation gates." + } + ], + "claim_boundaries": { + "allowed": [ + "ELF is locally and partially stronger after the Dreaming stages on encoded temporal reconciliation, reviewable consolidation self-checks, fixture-backed memory summary, proactive brief, and scheduled-memory task scoring.", + "The public/local aggregate fixture retest remains 53 pass, 0 wrong_result, and 7 typed blocked jobs across 60 jobs.", + "The representative graph/RAG slice remains typed non-pass.", + "Private/provider gates remain blocked under XY-930." + ], + "not_allowed": [ + "Do not claim broad ELF-over-qmd superiority.", + "Do not claim ELF beats managed Dreaming, Pulse, ChatGPT Tasks, mem0/OpenMemory, Letta, OpenViking, Graphiti/Zep, or graph/RAG systems from fixture-only, partial live, blocked, or smoke-only evidence.", + "Do not collapse scenario-level outcomes into a leaderboard.", + "Do not treat qmd full-suite wrong_result counts as a regression of qmd debug ergonomics." 
+ ] + }, + "follow_up_issue_briefs": { + "existing": [ + { + "issue": "XY-923", + "title": "qmd trace/replay diagnostics and debug ergonomics comparison", + "reason": "qmd remains the measured local retrieval-debug ergonomics loss; the fresh qmd full-suite live report does not provide comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop debug artifacts.", + "scope": "Re-run qmd trace/replay diagnostics with comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop artifacts.", + "non_goal": "Do not reinterpret qmd full-suite wrong_result counts as a regression of qmd debug ergonomics.", + "validation": "A scored qmd/ELF debug ergonomics artifact that preserves pass, wrong_result, blocked, and not_encoded states." + }, + { + "issue": "XY-930", + "title": "private-corpus and credentialed provider gates", + "reason": "operator-owned private manifest and explicit provider setup remain absent.", + "scope": "Run private-corpus and credentialed provider gates only when operator inputs exist; otherwise publish typed blockers.", + "non_goal": "Do not infer credentials or promote synthetic/provider smoke evidence into private-corpus pass evidence.", + "validation": "A public-safe report that states whether the private/provider caveats are removed or still blocked." + }, + { + "issue": "XY-928", + "title": "OpenViking trajectory and hierarchy evidence", + "reason": "OpenViking staged retrieval, hierarchy selection, and recursive expansion remain blocked by missing evidence-bearing staged artifacts.", + "scope": "Materialize same-corpus evidence ids and staged trajectory outputs before scoring hierarchy/recursive retrieval.", + "non_goal": "Do not claim ELF ties or beats OpenViking from fixture-only blocked rows.", + "validation": "Scored context-trajectory reports with typed pass, wrong_result, blocked, or incomplete outcomes." 
+ }, + { + "issue": "XY-929", + "title": "Graph/RAG citation and navigation adapter promotion", + "reason": "The representative graph/RAG retest remains 0 pass, 1 wrong_result, 1 incomplete, and 3 blocked.", + "scope": "Promote graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs.", + "non_goal": "Do not convert research gates, tiny smokes, blocked setup, or graphify wrong_result into graph/RAG parity evidence.", + "validation": "Representative graph/RAG reports that keep blocked, incomplete, wrong_result, not_tested, and non_goal states typed." + } + ], + "proposed": [ + { + "issue": "letta-core-archive-adapter-brief", + "title": "contained Letta core/archive export-readback adapter", + "reason": "ELF has fixture-backed core/archival memory evidence, but no contained Letta artifact maps core blocks, archival readback, and source ids.", + "scope": "Create a Docker-contained Letta export/readback adapter over benchmark-owned data and score only mapped core/archive evidence.", + "non_goal": "Do not change ELF product behavior or make broad Letta win/tie/loss claims before comparable evidence exists.", + "validation": "A scored artifact containing Letta core block JSON, archival search/readback JSON, source ids, and typed outcome states." 
+ }, + { + "issue": "service-native-dreaming-outputs-brief", + "title": "service-native memory summary, proactive brief, and scheduled task materialization", + "reason": "The Dreaming output improvements are currently fixture-backed contracts, not service-native generated/readback behavior.", + "scope": "Move memory summary, proactive brief, and scheduled task outputs into service-native materialization with source refs, freshness, rationale, trace, and no-source-mutation gates.", + "non_goal": "Do not build a polished hosted scheduler, Pulse clone, notification product, or private/provider path in this follow-up.", + "validation": "Service-native scored reports that preserve fixture boundaries and fail stale, tombstoned, unsupported, or untraced current claims." + } + ] + } +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 532add8b..e6aab322 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -206,6 +206,18 @@ fn dreaming_readiness_stage_ledger_markdown_path() -> Result { .join("2026-06-16-dreaming-readiness-stage-ledger.md")) } +fn dreaming_competitor_strength_retest_report_json_path() -> Result { + report_snapshot_path("2026-06-17-dreaming-competitor-strength-retest-report.json") +} + +fn dreaming_competitor_strength_retest_report_markdown_path() -> Result { + Ok(workspace_root()? 
+ .join("docs") + .join("evidence") + .join("benchmarking") + .join("2026-06-17-dreaming-competitor-strength-retest-report.md")) +} + fn live_temporal_reconciliation_report_json_path() -> Result { report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") } @@ -2817,6 +2829,241 @@ fn live_temporal_reconciliation_report_records_xy905_before_after() -> Result<() Ok(()) } +#[test] +fn dreaming_competitor_strength_retest_report_closes_xy955_without_overclaims() -> Result<()> { + let report = serde_json::from_str::<Value>(&fs::read_to_string( + dreaming_competitor_strength_retest_report_json_path()?, + )?)?; + let markdown = fs::read_to_string(dreaming_competitor_strength_retest_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.dreaming_competitor_strength_retest_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-955")); + assert_eq!( + report.pointer("/summary/overall_judgment").and_then(Value::as_str), + Some("locally_and_partially_stronger_only") + ); + assert_eq!( + report.pointer("/summary/broader_superiority").and_then(Value::as_str), + Some("not_proven") + ); + assert_eq!(report.pointer("/summary/regressed_stage_count").and_then(Value::as_u64), Some(0)); + assert!(array_contains_str(&report, "/status_terms", "typed_non_pass")?); + assert!(array_contains_str( + &report, + "/summary/unsupported_claims_rejected", + "ELF does not broadly beat qmd from this retest." 
+ )?); + + assert_xy955_commands(&report)?; + assert_xy955_stage_closeout(&report)?; + assert_xy955_scenario_retests(&report)?; + assert_xy955_optimization_queue(&report)?; + assert_xy955_follow_up_issue_briefs(&report)?; + + assert!(markdown.contains("ELF is locally and partially stronger")); + assert!( + markdown.contains("The full live-adapter command now has fresh ELF and qmd scored reports") + ); + assert!( + markdown.contains( + "Do not treat qmd full-suite wrong_result counts as a regression of qmd debug" + ) + ); + assert!(markdown.contains("## Follow-Up Issue Briefs")); + assert!(markdown.contains( + "| GraphRAG/LightRAG/RAGFlow/llm-wiki/gbrain/graphify citation/navigation/knowledge surfaces |" + )); + assert!( + benchmarking_index.contains("2026-06-17-dreaming-competitor-strength-retest-report.md") + ); + assert!(readme.contains("Dreaming Competitor-Strength Retest Report - June 17, 2026")); + assert!(readme.contains("Latest real-world benchmark report: June 17, 2026")); + + Ok(()) +} + +fn assert_xy955_commands(report: &Value) -> Result<()> { + let commands = array_at(report, "/commands")?; + let aggregate = find_by_field(commands, "/command", "cargo make real-world-memory")?; + let graph_rag = find_by_field(commands, "/command", "cargo make real-world-memory-graph-rag")?; + let first_generation = + find_by_field(commands, "/command", "cargo make real-world-first-generation-oss")?; + let live = find_by_field(commands, "/command", "cargo make real-world-memory-live-adapters")?; + + assert_eq!(aggregate.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(aggregate.pointer("/summary/pass").and_then(Value::as_u64), Some(53)); + assert_eq!(aggregate.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!(graph_rag.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(graph_rag.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + 
assert_eq!(graph_rag.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(graph_rag.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(first_generation.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(first_generation.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(live.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + live.pointer("/partial_summary/elf_live_real_world/pass").and_then(Value::as_u64), + Some(40) + ); + assert_eq!( + live.pointer("/partial_summary/elf_live_real_world/wrong_result").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + live.pointer("/partial_summary/qmd_live_real_world/pass").and_then(Value::as_u64), + Some(17) + ); + assert_eq!( + live.pointer("/partial_summary/qmd_live_real_world/wrong_result").and_then(Value::as_u64), + Some(13) + ); + + Ok(()) +} + +fn assert_xy955_stage_closeout(report: &Value) -> Result<()> { + let stages = array_at(report, "/stage_closeout")?; + + assert_eq!(stages.len(), 8); + + let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?; + let proactive = find_by_field(stages, "/stage_id", "proactive_brief_readiness")?; + let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; + let final_retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; + + assert_eq!(current.pointer("/judgment").and_then(Value::as_str), Some("improved")); + assert_eq!(current.pointer("/current_counts/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(current.pointer("/current_counts/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(proactive.pointer("/judgment").and_then(Value::as_str), Some("improved")); + assert_eq!(proactive.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(scheduled.pointer("/current_counts/pass").and_then(Value::as_u64), Some(4)); + 
assert_eq!(scheduled.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(final_retest.pointer("/judgment").and_then(Value::as_str), Some("unchanged")); + assert_eq!(final_retest.pointer("/current_counts/pass").and_then(Value::as_u64), Some(40)); + assert_eq!( + final_retest.pointer("/current_counts/wrong_result").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(final_retest.pointer("/current_counts/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!( + final_retest.pointer("/current_counts/not_encoded").and_then(Value::as_u64), + Some(19) + ); + assert!(final_retest.pointer("/boundary").and_then(Value::as_str).is_some_and(|boundary| { + boundary.contains("qmd now has a fresh scored live report") + && boundary.contains("broader superiority is not proven") + })); + assert_eq!(final_retest.pointer("/qmd_current_counts/pass").and_then(Value::as_u64), Some(17)); + assert_eq!( + final_retest.pointer("/qmd_current_counts/wrong_result").and_then(Value::as_u64), + Some(13) + ); + + Ok(()) +} + +fn assert_xy955_scenario_retests(report: &Value) -> Result<()> { + let scenarios = array_at(report, "/scenario_retests")?; + let qmd = find_by_field(scenarios, "/scenario_id", "qmd_debug_ergonomics")?; + let mem0 = + find_by_field(scenarios, "/scenario_id", "mem0_openmemory_preference_history_export")?; + let letta = find_by_field(scenarios, "/scenario_id", "letta_core_archive")?; + let graph_rag = find_by_field( + scenarios, + "/scenario_id", + "graph_rag_citation_navigation_knowledge_surfaces", + )?; + let private_provider = + find_by_field(scenarios, "/scenario_id", "private_provider_production_gates")?; + + assert_eq!(qmd.pointer("/current_outcome").and_then(Value::as_str), Some("unchanged")); + assert_eq!(qmd.pointer("/current_status").and_then(Value::as_str), Some("pass")); + assert!(qmd.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("17 pass") + && evidence.contains("13 wrong_result") + && 
evidence.contains("does not retest or erase") + })); + assert_eq!(mem0.pointer("/current_outcome").and_then(Value::as_str), Some("unchanged")); + assert!(mem0.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("mem0/OpenMemory local OSS history") + && evidence.contains("OpenMemory UI/export remains setup-blocked") + })); + assert_eq!(letta.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + graph_rag.pointer("/current_status").and_then(Value::as_str), + Some("typed_non_pass") + ); + assert!(graph_rag.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("0 pass") + && evidence.contains("1 wrong_result") + && evidence.contains("3 blocked") + })); + assert_eq!(private_provider.pointer("/follow_up").and_then(Value::as_str), Some("XY-930")); + + Ok(()) +} + +fn assert_xy955_optimization_queue(report: &Value) -> Result<()> { + let queue = array_at(report, "/optimization_queue")?; + let qmd = find_by_field(queue, "/issue", "XY-923")?; + let private_provider = find_by_field(queue, "/issue", "XY-930")?; + let openviking = find_by_field(queue, "/issue", "XY-928")?; + let letta = find_by_field(queue, "/issue", "letta-core-archive-adapter-brief")?; + let service_native = find_by_field(queue, "/issue", "service-native-dreaming-outputs-brief")?; + + assert_eq!(qmd.pointer("/status").and_then(Value::as_str), Some("existing")); + assert_eq!(private_provider.pointer("/status").and_then(Value::as_str), Some("existing")); + assert_eq!(openviking.pointer("/status").and_then(Value::as_str), Some("existing")); + assert_eq!(letta.pointer("/status").and_then(Value::as_str), Some("proposed")); + assert_eq!(service_native.pointer("/status").and_then(Value::as_str), Some("proposed")); + assert!(array_contains_str( + report, + "/claim_boundaries/not_allowed", + "Do not treat qmd full-suite wrong_result counts as a regression of qmd debug ergonomics." 
+ )?); + + Ok(()) +} + +fn assert_xy955_follow_up_issue_briefs(report: &Value) -> Result<()> { + let existing = array_at(report, "/follow_up_issue_briefs/existing")?; + let proposed = array_at(report, "/follow_up_issue_briefs/proposed")?; + let qmd = find_by_field(existing, "/issue", "XY-923")?; + let private_provider = find_by_field(existing, "/issue", "XY-930")?; + let letta = find_by_field(proposed, "/issue", "letta-core-archive-adapter-brief")?; + let service_native = + find_by_field(proposed, "/issue", "service-native-dreaming-outputs-brief")?; + + assert!(qmd.pointer("/scope").and_then(Value::as_str).is_some_and(|scope| { + scope.contains("immediate top-k") && scope.contains("candidate-drop artifacts") + })); + assert!(qmd.pointer("/non_goal").and_then(Value::as_str).is_some_and(|non_goal| { + non_goal.contains("qmd full-suite wrong_result counts") + && non_goal.contains("debug ergonomics") + })); + assert!( + private_provider + .pointer("/non_goal") + .and_then(Value::as_str) + .is_some_and(|non_goal| non_goal.contains("Do not infer credentials")) + ); + assert!(letta.pointer("/validation").and_then(Value::as_str).is_some_and(|validation| { + validation.contains("Letta core block JSON") && validation.contains("typed outcome states") + })); + assert!( + service_native + .pointer("/non_goal") + .and_then(Value::as_str) + .is_some_and(|non_goal| non_goal.contains("Pulse clone")) + ); + + Ok(()) +} + #[test] fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> { let report = serde_json::from_str::(&fs::read_to_string( @@ -4139,6 +4386,14 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - Some(0) ); + assert_dreaming_final_competitor_retest_stage(ledger, stages)?; + assert_dreaming_memory_summary_stage(stages)?; + assert_dreaming_proactive_brief_stage(stages)?; + + Ok(()) +} + +fn assert_dreaming_final_competitor_retest_stage(ledger: &Value, stages: &[Value]) -> Result<()> { let retest = 
find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22)); @@ -4146,6 +4401,24 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11)); assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); + assert_eq!(retest.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(40)); + assert_eq!(retest.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(retest.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!(retest.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), Some(19)); + assert_eq!(retest.pointer("/qmd_post_stage_counts/pass").and_then(Value::as_u64), Some(17)); + assert_eq!( + retest.pointer("/qmd_post_stage_counts/wrong_result").and_then(Value::as_u64), + Some(13) + ); + assert!(retest.pointer("/post_stage_basis").and_then(Value::as_str).is_some_and(|basis| { + basis.contains("XY-955 closeout retest") + && basis.contains("qmd live adapter materialization is 17 pass") + })); + + assert_dreaming_readiness_summary_buckets(ledger) +} + +fn assert_dreaming_readiness_summary_buckets(ledger: &Value) -> Result<()> { assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?); assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?); assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); @@ -4162,9 +4435,6 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - assert!(array_at(ledger, "/summary/blocked")?.is_empty()); assert!(array_at(ledger, "/summary/not_tested")?.is_empty()); - 
assert_dreaming_memory_summary_stage(stages)?; - assert_dreaming_proactive_brief_stage(stages)?; - Ok(()) } @@ -4253,7 +4523,9 @@ fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { assert!(markdown.contains("`regressed`: none")); assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); + assert!(markdown.contains("XY-955 closes the final competitor retest row")); assert!(markdown.contains("XY-905")); + assert!(markdown.contains("qmd live `pass=17`, `wrong_result=13`")); assert!( markdown .contains("Do not claim this ledger proves preference history against mem0/OpenMemory") diff --git a/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index e6e0e379..413332d5 100644 --- a/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -44,6 +44,12 @@ Current stage status: - `blocked`: none. - `not_tested`: none. +XY-955 closes the final competitor retest row for this program pass. The closeout +keeps the final competitor judgment `unchanged`: the fresh public/local retest +confirms ELF's encoded Dreaming-stage improvements, produces a fresh qmd full-suite +live report with typed non-pass states, keeps graph/RAG typed non-pass, and leaves +private/provider gates tied to XY-930. + The known live `memory_evolution` loss is now repaired for the encoded ELF live adapter slice: the XY-905 run passes all six memory-evolution jobs and reports current, historical, rationale, tombstone, invalidation, selected, dropped, and @@ -101,7 +107,7 @@ provider-backed private-corpus quality, or silent source mutation safety. 
| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. | | Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | `cargo make real-world-memory-proactive-brief`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/rationale coverage `1.000`; invalid-current and tombstone violations `0` | `improved` | Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind owned lanes and operator inputs. 
| | Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-scheduled`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/action/trace coverage `1.000`; invalid-current, unsupported-current, tombstone, and source-mutation violations `0` | `improved` | Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs. | -| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. 
| +| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | XY-955 closeout: aggregate fixture retest `pass=53`, `blocked=7`; graph/RAG `wrong_result=1`, `incomplete=1`, `blocked=3`; first-generation OSS `pass=4`, `blocked=2`; ELF live `pass=40`, `wrong_result=0`, `blocked=7`, `not_encoded=19`; qmd live `pass=17`, `wrong_result=13`, `blocked=7`, `not_encoded=29` | `unchanged` | Convert only measured losses and typed blockers into follow-up issues; the ELF live improvement and qmd full-suite non-pass do not replace qmd debug ergonomics, private/provider, OpenViking, Letta, or graph/RAG gates. | ## Evidence Anchors @@ -114,7 +120,7 @@ provider-backed private-corpus quality, or silent source mutation safety. 
| Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Proactive brief readiness | `docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/decisions/2026-06-08-agent-memory-selection.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | | Scheduled memory task readiness | `docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/decisions/2026-06-08-agent-memory-selection.md` | -| Final competitor retest status | `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | +| Final competitor retest status | `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md`; `docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md`; `apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json` | ## Report Shape For Downstream Issues diff --git a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md new file mode 100644 index 00000000..62396f6d --- /dev/null +++ 
b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md @@ -0,0 +1,131 @@ +# Dreaming Competitor-Strength Retest Report - June 17, 2026 + +Goal: Close out the XY-955 Dreaming-readiness benchmark program pass with a +baseline-vs-current competitor-strength retest and optimization queue. +Read this when: You need the final stage-ledger closeout after XY-905, XY-934, +XY-952, XY-953, and XY-954, or need to know which remaining losses and blockers are +ready for follow-up issue work. +Inputs: +`apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json`, +`apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json`, +`apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json`, +the June 16 stage reports, and the fresh `tmp/real-world-memory/` retest outputs. +Outputs: Scenario-level improved/regressed/unchanged/blocked/not-tested judgments, +claim boundaries, and the next optimization queue. + +## Executive Judgment + +ELF is locally and partially stronger after the Dreaming-readiness stages. It is not +broadly superior to the tracked competitors. + +The public/local retest supports these narrow improvements: + +- Live ELF `memory_evolution` moved from `pass=1`, `wrong_result=5` in the XY-951 + baseline to `pass=6`, `wrong_result=0` in the XY-905 report and the fresh partial + ELF live adapter output. +- Live ELF consolidation self-checks now pass for service-backed proposal + materialization, source lineage, confidence/usefulness, unsupported-claim flags, + review actions, and zero source mutations. +- Fixture-backed memory summary, proactive brief, and scheduled-memory task scoring + are now encoded and passing except for their explicit private/provider blockers. + +The broader competitor-strength outcome is unchanged: + +- qmd debug ergonomics remain a measured ELF loss from the existing trace/replay + report. 
The fresh qmd full-suite live report is typed non-pass, but that does not + retest or erase qmd's top-k/replay artifact advantage. +- mem0/OpenMemory preference-history and export-style local OSS readback remain + separate measured strengths; OpenMemory UI/export and hosted Platform behavior are + not proven by this retest. +- Letta core/archive, OpenViking trajectory/hierarchy, Graphiti/Zep temporal graph, + and broad graph/RAG citation/navigation quality remain blocked, incomplete, + wrong-result, or not-tested. +- Private-corpus and credentialed provider gates remain tied to XY-930. + +No scenario regressed in the checked-in local/public retest evidence. The remaining +work is issue-shaped only for measured losses or typed blockers. + +## Commands + +| Command | Status | Artifact | Result | +| --- | --- | --- | --- | +| `cargo make real-world-memory` | `pass` | `tmp/real-world-memory/real-world-memory-report.json` | 60 jobs, 53 pass, 0 wrong_result, 7 blocked, evidence/source-ref/quote coverage 1.000. | +| `cargo make real-world-memory-graph-rag` | `pass` | `tmp/real-world-memory/graph-rag/report.json` | 5 jobs, 0 pass, 1 wrong_result, 1 incomplete, 3 blocked. This is typed non-pass graph/RAG evidence. | +| `cargo make real-world-first-generation-oss` | `pass` | `tmp/real-world-memory/first-generation-oss/report.json` | 6 jobs, 4 pass, 2 blocked, evidence coverage 1.000. | +| `cargo make real-world-memory-live-adapters` | `pass` | `tmp/real-world-memory/live-adapters/summary.json` | ELF live: 66 jobs, 40 pass, 0 wrong_result, 7 blocked, 19 not_encoded. qmd live: 66 jobs, 17 pass, 13 wrong_result, 7 blocked, 29 not_encoded. | + +The full live-adapter command now has fresh ELF and qmd scored reports. The qmd +full-suite non-pass result is not a regression of qmd debug ergonomics and is not a +broad ELF-over-qmd win. 
+ +## Stage Closeout + +| Stage | Baseline | Current | Judgment | Boundary | +| --- | --- | --- | --- | --- | +| Current-vs-historical correctness | `pass=1`, `wrong_result=5` | `pass=6`, `wrong_result=0` | `improved` | Encoded ELF live `memory_evolution` only; no Graphiti/Zep, mem0/OpenMemory, Letta, private-corpus, or broad qmd claim. | +| Preference evolution | `wrong_result=1` | `pass=1`, `wrong_result=0` | `improved` | ELF current-vs-historical preference case improved; mem0/OpenMemory history remains separately stronger on the local OSS history surface. | +| Deletion, TTL, and tombstones | `pass=1` | `pass=1` | `unchanged` | Single encoded tombstone/TTL job remains passing; broader update/delete/recreate history is still follow-up work. | +| Reviewable consolidation | `pass=4`, `not_tested=1`, `not_encoded=1` | `pass=4`, `not_tested=0`, `not_encoded=0` | `improved` | ELF live self-check evidence only; direct competitor consolidation runners remain untested or product-reference only. | +| Memory summary/top-of-mind | `pass=8`, `not_tested=1`, `not_encoded=1` | `pass=9`, `not_tested=0`, `not_encoded=0` | `improved` | Fixture-backed `elf.memory_summary/v1` source-trace contract evidence only. | +| Proactive brief readiness | `pass=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `blocked=1` | `improved` | Fixture-backed proactive brief scoring only; private-corpus refresh stays blocked under XY-930 and Pulse parity is not proven. | +| Scheduled memory task readiness | `pass=0`, `blocked=1` | `pass=4`, `blocked=1` | `improved` | Fixture-backed scheduled task readback only; hosted scheduler, notification, provider-backed private-corpus, and silent-mutation parity are not proven. 
| +| Final competitor retest status | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | ELF live: `pass=40`, `wrong_result=0`, `blocked=7`, `not_encoded=19`; qmd live: `pass=17`, `wrong_result=13`, `blocked=7`, `not_encoded=29`; graph/RAG typed non-pass; first-generation OSS `pass=4`, `blocked=2` | `unchanged` | ELF live improvement and qmd full-suite non-pass do not remove qmd debug ergonomics, private/provider, OpenViking, Letta, or graph/RAG blockers. | + +## Scenario Retest Matrix + +| Scenario | Baseline outcome | Current outcome | Status | Follow-up | +| --- | --- | --- | --- | --- | +| qmd debug ergonomics | `loss` | `unchanged` | `pass` for fresh qmd full-suite materialization; debug ergonomics still a measured ELF loss | XY-923 | +| mem0/OpenMemory preference/history/export | ELF loss on correction history, tie on scoped personalization, UI/export blocked | `unchanged` | `blocked` for UI/export and private/provider inputs | XY-930 plus dedicated UI/export runner work | +| Letta core/archive | `blocked` | `unchanged` | `blocked` | Proposed Letta core/archive adapter brief | +| Graphiti/Zep temporal graph validity | `blocked` | `unchanged` | `blocked` | Graph/RAG adapter follow-up with explicit provider setup | +| OpenViking trajectory/hierarchy | `blocked` | `unchanged` | `blocked` | XY-928 | +| GraphRAG/LightRAG/RAGFlow/llm-wiki/gbrain/graphify citation/navigation/knowledge surfaces | `not_tested` | `unchanged` | typed non-pass: blocked, incomplete, wrong_result, not_tested, or non_goal | XY-929 | +| Private/provider production gates | `blocked` | `unchanged` | `blocked` | XY-930 | + +## Optimization Queue + +| Priority | Issue | Status | Brief | +| --- | --- | --- | --- | +| P0 | XY-923 | Existing | Re-run qmd trace/replay diagnostics with comparable immediate top-k/replay, expansion, fusion, rerank, and candidate-drop artifacts; preserve qmd's debug ergonomics edge unless ELF produces comparable artifacts. 
| +| P1 | XY-930 | Existing | Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist; otherwise keep typed blockers. | +| P1 | XY-928 | Existing | Materialize OpenViking staged trajectory, hierarchy selection, and recursive expansion evidence before claiming ELF ties or beats those strengths. | +| P1 | Letta core/archive adapter | Proposed | Add a contained Letta core/archive export-readback adapter that emits source ids for core blocks and archival memories. Non-goals: ELF product changes and broad Letta claims. | +| P2 | XY-929 | Existing | Promote Graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs. | +| P2 | Service-native Dreaming outputs | Proposed | Move fixture-backed memory summary, proactive brief, and scheduled task contracts into service-native readback/materialization with source-ref, freshness, rationale, trace, and no-source-mutation gates. | + +## Follow-Up Issue Briefs + +These are Decodex-ready follow-up shapes for the remaining measured losses or typed +blockers. Existing Linear issues should be linked rather than duplicated. + +| Issue | State | Brief | +| --- | --- | --- | +| XY-923 | Existing | Re-run qmd trace/replay diagnostics with comparable immediate top-k, replay, expansion, fusion, rerank, and candidate-drop artifacts. Non-goal: do not reinterpret qmd full-suite wrong_result counts as a regression of qmd debug ergonomics. Validation: a scored qmd/ELF debug ergonomics artifact with typed outcomes preserved. | +| XY-930 | Existing | Run private-corpus and credentialed provider gates only after operator-owned manifest and explicit provider setup exist. Non-goal: do not infer credentials or promote synthetic/provider smoke evidence into private-corpus pass evidence. Validation: a public-safe report that states whether the private/provider caveats are removed or still blocked. 
| +| XY-928 | Existing | Materialize OpenViking same-corpus evidence ids and staged trajectory outputs before scoring hierarchy or recursive retrieval. Non-goal: do not claim ELF ties or beats OpenViking from fixture-only blocked rows. Validation: scored context-trajectory reports with typed pass, wrong_result, blocked, or incomplete outcomes. | +| XY-929 | Existing | Promote graph/RAG citation, navigation, stale-source lint, and knowledge-surface cases only when adapters emit comparable evidence-linked outputs. Non-goal: do not convert research gates, tiny smokes, blocked setup, or graphify wrong_result into graph/RAG parity evidence. Validation: representative graph/RAG reports with typed non-pass states preserved. | +| Letta core/archive adapter | Proposed | Create a Docker-contained Letta export/readback adapter over benchmark-owned data and score only mapped core/archive evidence. Non-goal: no ELF product change or broad Letta claim before comparable evidence exists. Validation: a scored artifact containing Letta core block JSON, archival search/readback JSON, source ids, and typed outcomes. | +| Service-native Dreaming outputs | Proposed | Move memory summary, proactive brief, and scheduled task outputs into service-native materialization with source refs, freshness, rationale, trace, and no-source-mutation gates. Non-goal: no polished hosted scheduler, Pulse clone, notification product, or private/provider path in this follow-up. Validation: service-native scored reports that fail stale, tombstoned, unsupported, or untraced current claims. | + +## Claim Boundaries + +Allowed: + +- ELF is locally and partially stronger after the Dreaming stages on encoded temporal + reconciliation, reviewable consolidation self-checks, fixture-backed memory + summary, proactive brief, and scheduled-memory task scoring. +- The public/local aggregate fixture retest remains 53 pass, 0 wrong_result, and 7 + typed blocked jobs across 60 jobs. 
+- The representative graph/RAG slice remains typed non-pass. +- Private/provider gates remain blocked under XY-930. + +Not allowed: + +- Do not claim broad ELF-over-qmd superiority. +- Do not claim ELF beats managed Dreaming, Pulse, ChatGPT Tasks, mem0/OpenMemory, + Letta, OpenViking, Graphiti/Zep, or graph/RAG systems from fixture-only, partial + live, blocked, or smoke-only evidence. +- Do not collapse scenario-level outcomes into a leaderboard. +- Do not treat qmd full-suite wrong_result counts as a regression of qmd debug + ergonomics. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index e8f581b6..2f7c6428 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -35,3 +35,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-16-live-temporal-reconciliation-report.md`: Live Temporal Reconciliation Report - June 16, 2026. - `2026-06-16-proactive-brief-scoring-report.md`: Proactive Brief Scoring Report - June 16, 2026. - `2026-06-16-scheduled-memory-task-scoring-report.md`: Real-World Job Benchmark Report. +- `2026-06-17-dreaming-competitor-strength-retest-report.md`: Dreaming Competitor-Strength Retest Report - June 17, 2026. 
From 7bf44cca65fa6dbbe6197c021f6c30aaeddb0f21 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 19 Jun 2026 12:26:52 +0800 Subject: [PATCH 2/2] {"schema":"decodex/commit/1","summary":"Add Dreaming report OKF frontmatter","authority":"XY-955"} --- ...7-dreaming-competitor-strength-retest-report.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md index 62396f6d..1415b002 100644 --- a/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md +++ b/docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Dreaming Competitor-Strength Retest Report - June 17, 2026" +description: "Checked-in benchmark evidence record: Dreaming Competitor-Strength Retest Report - June 17, 2026." +resource: docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-19 +tags: + - docs + - evidence + - benchmarking +--- # Dreaming Competitor-Strength Retest Report - June 17, 2026 Goal: Close out the XY-955 Dreaming-readiness benchmark program pass with a