diff --git a/Makefile.toml b/Makefile.toml index 5c89f94d..6e8e6c56 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -418,6 +418,7 @@ args = [ # | real-world-memory-consolidation | composite | | # | real-world-memory-consolidation-json | command | | # | real-world-memory-consolidation-report | command | | +# | real-world-memory-live-consolidation | command | | # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | # | real-world-job-operator-ux-report | command | | @@ -830,6 +831,14 @@ args = [ "tmp/real-world-memory/consolidation/report.md", ] +[tasks.real-world-memory-live-consolidation] +workspace = false +command = "bash" +args = [ + "-lc", + "docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_CONSOLIDATION_LIVE_REPORT_DIR -e ELF_CONSOLIDATION_LIVE_FIXTURES baseline-runner bash scripts/real-world-consolidation-live-adapter.sh", +] + [tasks.real-world-memory-core-archival] workspace = false dependencies = [ diff --git a/README.md b/README.md index a4cae687..aa3b0350 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,14 @@ provider-backed ELF evidence was required. evidence fields. qmd remains `wrong_result` on the same slice, but this is not a broad qmd, Graphiti/Zep, mem0/OpenMemory, Letta, hosted-memory, or private-corpus superiority claim. +- Live consolidation proposal scoring after XY-934: `cargo make + real-world-memory-live-consolidation` runs the consolidation fixture slice through + `ElfService` consolidation run creation, worker proposal materialization, and + apply/defer/discard review audit transitions. ELF passes 4/4 live consolidation jobs + with complete lineage, one unsupported-claim flag preserved, and zero source + mutations. Managed dreaming and Always-On Memory Agent patterns remain product + references, not direct live competitors, because no contained runner emits comparable + artifacts. 
- Live operator-debugging slice after XY-932: `cargo make real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated `live_real_world` records for ELF and qmd over the operator-debugging fixtures. @@ -255,6 +263,7 @@ Detailed evidence and interpretation: - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) @@ -335,6 +344,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) - [First-Generation OSS Continuity and Source-Store Report - June 11, 
2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) @@ -347,7 +357,8 @@ Detailed comparison, mechanism-level analysis, and source map: - [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json) Latest real-world benchmark report: June 16, 2026. Latest external research refresh: -June 11, 2026. +June 11, 2026; June 16 adds live temporal reconciliation and live consolidation +self-check evidence. ## Documentation diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 9ff7a7f7..a7ea546b 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -175,6 +175,21 @@ fn capture_write_policy_live_markdown_path() -> Result { .join("2026-06-11-capture-write-policy-live-report.md")) } +fn live_consolidation_proposal_scoring_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-16-live-consolidation-proposal-scoring-report.json")) +} + +fn live_consolidation_proposal_scoring_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-16-live-consolidation-proposal-scoring-report.md")) +} + fn temporal_history_competitor_gap_json_path() -> Result { Ok(workspace_root()? 
.join("docs") @@ -2021,6 +2036,124 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result< Ok(()) } +#[test] +fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result<()> { + let workspace = workspace_root()?; + let report = serde_json::from_str::(&fs::read_to_string( + live_consolidation_proposal_scoring_report_path()?, + )?)?; + let markdown = fs::read_to_string(live_consolidation_proposal_scoring_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + let benchmark_guide = fs::read_to_string( + workspace + .join("docs") + .join("guide") + .join("benchmarking") + .join("real_world_agent_memory_benchmark.md"), + )?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let live_script = + fs::read_to_string(workspace.join("scripts/real-world-consolidation-live-adapter.sh"))?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.live_consolidation_proposal_scoring_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-934")); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/suite_status") + .and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/encoded_job_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/proposal_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/source_mutation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/review_event_count") + .and_then(Value::as_u64), 
+ Some(6) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/qmd_live_real_world/suite_status") + .and_then(Value::as_str), + Some("not_encoded") + ); + + let jobs = array_at(&report, "/jobs")?; + let project_summary = + find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; + let preference = + find_by_field(jobs, "/job_id", "consolidation-preference-candidate-defer-001")?; + let contradiction = + find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!( + project_summary.pointer("/final_review_state").and_then(Value::as_str), + Some("applied") + ); + assert_eq!(project_summary.pointer("/review_event_count").and_then(Value::as_u64), Some(2)); + assert_eq!(preference.pointer("/final_review_state").and_then(Value::as_str), Some("archived")); + assert_eq!( + contradiction.pointer("/final_review_state").and_then(Value::as_str), + Some("rejected") + ); + assert_eq!( + contradiction.pointer("/unsupported_claim_flag_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(contradiction.pointer("/source_lineage_count").and_then(Value::as_u64), Some(3)); + + let positions = array_at(&report, "/reference_positions")?; + let qmd = find_by_field(positions, "/project", "qmd")?; + let managed = find_by_field(positions, "/project", "managed_dreaming_memory_systems")?; + let always_on = find_by_field(positions, "/project", "always_on_memory_agent_patterns")?; + + assert_eq!(qmd.pointer("/position").and_then(Value::as_str), Some("untested")); + assert_eq!(managed.pointer("/position").and_then(Value::as_str), Some("product_reference")); + assert_eq!(always_on.pointer("/position").and_then(Value::as_str), Some("product_reference")); + assert!(markdown.contains("ELF now has service-backed live consolidation proposal scoring")); + assert!(markdown.contains("This is not scheduled production consolidation")); + assert!(markdown.contains("Source mutations")); + assert!(markdown.contains("Do not mix 
knowledge-page rebuild/lint scoring")); + assert!( + benchmarking_index.contains("2026-06-16-live-consolidation-proposal-scoring-report.md") + ); + assert!(readme.contains("Live Consolidation Proposal Scoring Report - June 16, 2026")); + assert!(readme.contains("real-world-memory-live-consolidation")); + assert!(benchmark_guide.contains("Current live consolidation increment")); + assert!(benchmark_guide.contains("tmp/real-world-memory/live-consolidation/summary.json")); + assert!(makefile.contains("[tasks.real-world-memory-live-consolidation]")); + assert!(makefile.contains("scripts/real-world-consolidation-live-adapter.sh")); + assert!(live_script.contains("elf.real_world_consolidation_live_adapter_sweep/v1")); + assert!(live_script.contains("real_world_live_adapter -- elf")); + assert!(!live_script.contains("real_world_live_adapter -- qmd")); + assert!(live_adapter.contains("fn materialize_elf_consolidation(")); + assert!(live_adapter.contains("ConsolidationProposalReviewRequest")); + + Ok(()) +} + fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { let suites = array_at(adapter, "/suites")?; let capabilities = array_at(adapter, "/capabilities")?; @@ -3016,6 +3149,7 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul let work_resume = find_by_field(scenarios, "/scenario_id", "work_resume")?; let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; + let consolidation = find_by_field(scenarios, "/scenario_id", "consolidation")?; assert!( retrieval_debug @@ -3051,6 +3185,20 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul .and_then(Value::as_str) .is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export")) ); + assert!( + consolidation + .pointer("/current_elf_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| 
claim.contains("XY-934 adds live_real_world") + && claim.contains("zero source mutations")) + ); + assert!( + consolidation + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd remains not_encoded") + && claim.contains("product references only")) + ); let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?; @@ -3927,12 +4075,24 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - assert_eq!( consolidation.pointer("/comparison_judgment").and_then(Value::as_str), - Some("not_tested") + Some("improved") ); assert_eq!( consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(1) ); + assert_eq!(consolidation.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + consolidation.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), + Some(0) + ); + assert!( + consolidation + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("apply/defer/discard audit") + && basis.contains("zero source mutations")) + ); let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; @@ -3948,6 +4108,7 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?); assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?); + assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); assert!(array_at(ledger, "/summary/regressed")?.is_empty()); assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?); assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?); @@ -3959,15 +4120,18 @@ fn 
assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { assert!( - markdown.contains("`improved`: current-vs-historical correctness and preference evolution") + markdown + .contains("`improved`: current-vs-historical correctness, preference evolution, and") + && markdown.contains("reviewable consolidation") ); assert!(markdown.contains("`regressed`: none")); assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); assert!(markdown.contains("XY-905")); assert!( markdown - .contains("Do not claim this ledger fixes preference history against mem0/OpenMemory") + .contains("Do not claim this ledger proves preference history against mem0/OpenMemory") ); + assert!(markdown.contains("Reviewable consolidation now has ELF live service-backed")); } #[test] diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index fee7cda8..686ed123 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -112,7 +112,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. 
| XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | -| Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | +| Consolidation/proposal review | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF fixture consolidation passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors. | XY-934 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 | | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. 
XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | @@ -131,7 +131,8 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | | XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. | -| XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. | +| XY-926 | P1 | Partial live suites encoded | ELF live knowledge-page scoring is encoded; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. 
| +| XY-934 | P1 | ELF live self-check encoded | Live consolidation proposal scoring is encoded for ELF with lineage, confidence/usefulness, unsupported-claim flags, and review-action audit; direct competitor runners remain untested or product-reference only. | | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | | XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | | XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index c1ca8dcf..0a956467 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -103,7 +103,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Project decisions | Fixture and live project_decisions pass; the ELF core-archival fixture also scores project-decision recovery through core routing plus archival rationale. | qmd, Letta. | qmd live project_decisions pass; Letta project-decision recovery is `research_gate` `not_tested` or `blocked` until the contained export path exists. | Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario. | | Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. 
| memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed. | Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim. | | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | -| Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | +| Consolidation | Fixture consolidation passes; XY-934 adds ELF live service-backed proposal scoring with lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit. | managed dreaming, Always-On Memory Agent patterns, agentmemory, llm-wiki. | No direct live competitor runner emits comparable consolidation artifacts; qmd remains `not_encoded`. | Keep competitor comparisons reference-only until a contained runner emits source ids, confidence, unsupported-claim flags, and review-action audit artifacts. | | Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. 
| llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | | Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory and claude-mem hook capture remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. 
| diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 6fa05a45..f5a2ad4b 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -79,11 +79,12 @@ Interpretation: - Both pass `trust_source_of_truth`, `work_resume`, `project_decisions`, `retrieval`, and `personalization`. - Both fail most `memory_evolution` live conflict evidence with `wrong_result`. -- ELF now passes live `capture_integration`; qmd keeps that suite `not_encoded`. - Both leave consolidation, knowledge compilation, and production-ops operator - boundaries as `not_encoded` or `blocked`. Operator debugging has a separate narrow - live slice: ELF passes it, while qmd remains `wrong_result` for trace hydration and - candidate-drop stage visibility. +- ELF now passes live `capture_integration`. A separate XY-934 narrow run adds live + consolidation proposal review evidence for ELF; qmd keeps consolidation + `not_encoded` in the live sweep. Knowledge compilation and production-ops operator + boundaries remain typed `not_encoded` or `blocked`. Operator debugging has a + separate narrow live slice: ELF passes it, while qmd remains `wrong_result` for + trace hydration and candidate-drop stage visibility. ### Production Evidence @@ -134,7 +135,7 @@ one misleading score. | Project decisions | ELF and qmd live project-decision suites pass; ELF fixture-backed `core_archival_memory` also scores project-decision recovery, while Letta remains blocked without export evidence. | Run the Letta core/archival export/readback contract before treating project-decision recovery as comparable. | | Source of truth | ELF has the strongest measured source-of-truth evidence. 
| Borrow memsearch's local canonical-store ergonomics without making files or vectors authoritative. | | Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. | -| Consolidation | ELF fixture passes, but live proposal generation is not encoded. | Build reviewable derived proposals with source refs, confidence, unsupported-claim flags, and apply/defer/discard audit. | +| Consolidation | ELF fixture passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit; direct competitor runners remain untested. | Keep derived proposal review as the safety boundary and add competitor/reference runners only when they emit comparable artifacts. | | Knowledge pages | ELF fixture pages pass; live knowledge generation is not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, and graphify reports behind rebuild/lint benchmarks. | | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | @@ -213,9 +214,13 @@ These improve day-to-day usefulness while preserving ELF's evidence-bound core. 2. Reviewable consolidation - Borrow from: managed memory dreaming and Always-On Memory Agent scheduling. 
+ - Current state: ELF now has live service-backed proposal scoring for the + consolidation fixture slice; direct competitor/reference runners are still + untested. - ELF shape: derived proposals only; source notes are not silently rewritten. - - Benchmark gate: consolidation proposals include lineage, confidence, - unsupported-claim flags, and apply/defer/discard audit. + - Benchmark gate: preserve lineage, confidence, unsupported-claim flags, + apply/defer/discard audit, and zero source mutations; do not add scheduling until + it can remain derived and reviewable. 3. Knowledge pages - Borrow from: llm-wiki, gbrain, graphify, and GraphRAG. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 470a89a7..841e945f 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -102,6 +102,11 @@ live adapter or competitor runtime can complete those jobs. `cargo make real-world-memory-live-adapters` produced: +XY-934 update: the June 11 consolidation row below is superseded for ELF by +`docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`. +ELF now has live service-backed consolidation proposal scoring for the 4 checked-in +consolidation jobs; qmd remains typed `not_encoded` for this suite. + | Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | Evidence recall | Evidence coverage | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | | ELF live service adapter | `40` | `22` | `5` | `2` | `11` | `0.599` | `6.980 ms` | `50/80` | `58/88` | @@ -167,7 +172,7 @@ records `unique_project_names: 17` for the full project list including ELF. 
| Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy, live consolidation proposal scoring, and narrow operator-debug slices pass. | Full live memory evolution, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | | qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. 
| | agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | | mem0/OpenMemory | `live_baseline_only` | Basic local smoke and local OSS history/readback pass; OpenMemory UI/export is blocked, hosted Platform export is a non-goal, and optional graph plus broader prompt coverage remain `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | @@ -194,7 +199,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Project decisions | ELF and qmd live pass; ELF fixture coverage also passes core routing plus archival rationale recovery. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory export and scoring. | | Source of truth | ELF and qmd live pass; ELF has stronger production restore/rebuild evidence. | ELF has strongest measured source-of-truth discipline. | memsearch source-of-truth reindex/reload evidence. | | Memory evolution | ELF live fails 5/6 jobs; qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence; fixture aggregate passes. | No broad live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | -| Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | +| Consolidation | Fixture aggregate passes; XY-934 adds ELF live service-backed proposal scoring, while qmd remains `not_encoded`. | ELF self-check claim only; no direct competitor win. 
| Contained competitor/reference runners only when they emit source ids, confidence, unsupported-claim flags, and review-action audit. | | Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | | Operator debugging | Fixture aggregate passes; narrow ELF/qmd live operator-debug slice is scored with ELF `pass` and qmd `wrong_result`. | Narrow ELF/qmd live claim only: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; replay-command and repair-action clarity are tied. | OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture aggregate passes; ELF live service adapter passes 4/4 capture jobs with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem hook/viewer capture is `blocked`. | ELF has live self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. Against agentmemory/claude-mem capture breadth, the comparison remains blocked until durable hook/viewer evidence exists. | Durable agentmemory and claude-mem capture-hook runners with evidence-bound output. | diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index 0239e21c..df37634e 100644 --- a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -18,15 +18,16 @@ This ledger does not claim a broad product win. It records the gate later produc lanes must pass before they can claim a Dreaming or competitor-inspired stage is done, and now includes the XY-905 post-stage result for live temporal reconciliation. 
-Current baseline: +Current stage status: -- `improved`: current-vs-historical correctness and preference evolution. +- `improved`: current-vs-historical correctness, preference evolution, and + reviewable consolidation. - `regressed`: none. - `unchanged`: deletion/TTL/tombstone behavior and the final competitor retest baseline. - `blocked`: scheduled-memory-task readiness. -- `not_tested`: reviewable consolidation beyond fixtures, memory-summary/top-of-mind - live behavior, and proactive brief readiness. +- `not_tested`: memory-summary/top-of-mind live behavior and proactive brief + readiness. The known live `memory_evolution` loss is now repaired for the encoded ELF live adapter slice: the XY-905 run passes all six memory-evolution jobs and reports @@ -34,6 +35,11 @@ current, historical, rationale, tombstone, invalidation, selected, dropped, and non-narrated evidence fields. This is not a private-corpus, hosted memory, or broad competitor-superiority claim. +Reviewable consolidation is also improved for the narrow ELF self-check: XY-934 adds +service-backed proposal materialization, source lineage, confidence/usefulness, +unsupported-claim flags, apply/defer/discard audit transitions, and zero source +mutations. Direct competitor runners remain untested or product-reference only. + ## Ledger Rules - Every downstream Dreaming or competitor-improvement stage must write a post-stage @@ -57,7 +63,7 @@ competitor-superiority claim. 
| Current-vs-historical correctness | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters` | Same commands; publish post-stage JSON and Markdown evidence | `pass=1`, `wrong_result=5`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=6`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from benchmark materialization into service-native temporal reconciliation APIs and compare against mem0/OpenMemory history and Graphiti/Zep temporal graph evidence without broad superiority claims. | | Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Measure preference correction against mem0/OpenMemory history and UI/export surfaces before making any broader history-quality claim. | | Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `unchanged` | Extend tombstone and TTL readback beyond the single encoded job into update/delete/recreate history cases. | -| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Keep Dreaming output derived and reviewable with lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and no source mutation. 
| +| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | | Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Build summaries as cited, rebuildable derived pages or core blocks; do not turn hidden summaries into authoritative memory. | | Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | Same commands plus `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | not run by XY-905 | `not_tested` | Add direct proactive-brief fixtures before any pass claim; briefs must be source-linked and repairable. | | Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | not run by XY-905 | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. | @@ -70,7 +76,7 @@ competitor-superiority claim. 
| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Preference evolution and correction history | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` | | Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | -| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | | Memory summary and top-of-mind behavior | `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Proactive brief readiness | 
`docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | | Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; `docs/research/2026-06-08-agent-memory-selection.json` | @@ -103,14 +109,16 @@ Allowed: files. - The current ledger preserves typed non-pass states and records the XY-905 live memory-evolution improvement. -- Fixture-backed consolidation, knowledge, and core/archival jobs can be used as - regression guards for report shape. +- Fixture-backed knowledge and core/archival jobs can be used as regression guards for + report shape. +- Reviewable consolidation now has ELF live service-backed proposal scoring evidence, + with direct competitor runners still untested. Not allowed: -- Do not claim this ledger fixes preference history against mem0/OpenMemory, - consolidation, proactive briefs, scheduled tasks, private-corpus gates, hosted - memory, or competitor adapters. +- Do not claim this ledger proves preference history against mem0/OpenMemory, + proactive briefs, scheduled tasks, private-corpus gates, hosted memory, broad + consolidation superiority, or competitor adapters. - Do not claim ELF has full-suite live real-world pass evidence. - Do not claim private-corpus or provider-backed production quality without the operator-owned inputs required by XY-930. diff --git a/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md b/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md new file mode 100644 index 00000000..4e7f8302 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md @@ -0,0 +1,86 @@ +# Live Consolidation Proposal Scoring Report - June 16, 2026 + +Goal: Record the XY-934 live consolidation proposal scoring evidence and product +reference boundaries. 
+Read this when: You need to know whether ELF has live evidence for reviewable +consolidation proposal generation, source lineage, confidence, unsupported-claim +flags, and apply/defer/discard review audit transitions. +Inputs: `cargo make real-world-memory-consolidation`, +`cargo make real-world-memory-live-consolidation`, +`apps/elf-eval/fixtures/real_world_memory/consolidation/`, +`apps/elf-eval/src/bin/real_world_live_adapter.rs`, and +`docs/spec/system_consolidation_proposals_v1.md`. +Outputs: Scenario-level consolidation results, live artifacts, and typed comparison +boundaries for managed dreaming and Always-On Memory Agent style references. + +## Verdict + +ELF now has service-backed live consolidation proposal scoring. The narrow live +command materializes all 4 `consolidation` jobs through `ElfService` consolidation +run creation, worker proposal materialization, and review-action audit transitions. + +This is not scheduled production consolidation and not live provider generation. The +run uses the deterministic fixture/manual proposal payload boundary required by +`elf.consolidation/v1`: source notes are immutable, proposals are derived outputs, and +review actions are explicit artifacts. 
+ +## Fresh Runs + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-memory-consolidation` | pass | `tmp/real-world-memory/consolidation/report.json` | +| `cargo make real-world-memory-live-consolidation` | pass | `tmp/real-world-memory/live-consolidation/summary.json` | + +## ELF Live Consolidation Results + +| Job | Live status | Source refs | Review action | Final review state | Unsupported claims | Source mutations | +| --- | --- | ---: | --- | --- | ---: | ---: | +| `consolidation-project-summary-apply-001` | `pass` | `2` | `apply` | `applied` | `0` | `0` | +| `consolidation-weekly-decision-summary-apply-001` | `pass` | `2` | `apply` | `applied` | `0` | `0` | +| `consolidation-preference-candidate-defer-001` | `pass` | `2` | `defer` | `archived` | `0` | `0` | +| `consolidation-contradiction-report-discard-001` | `pass` | `3` | `discard` | `rejected` | `1` | `0` | + +The generated benchmark report keeps the same consolidation metrics as the fixture +report: + +- `proposal_count = 4` +- `lineage_completeness = 1.0` +- `review_action_correctness = 1.0` +- `proposal_unsupported_claim_count = 1` +- `source_mutation_count = 0` +- `executable_gap_count = 0` + +The materialization artifact records service-backed run ids, proposal ids, source +lineage counts, unsupported-claim flag counts, review-event counts, review actions, +and final review states. It does not claim source memory rewrites. + +## Comparison Boundary + +| Compared target | Position | Reason | +| --- | --- | --- | +| qmd live real-world adapter | `untested` | qmd keeps consolidation jobs typed `not_encoded`; no qmd consolidation proposal generator or review-action audit runner exists in this benchmark. | +| Managed dreaming memory systems | `product_reference` | Managed dreaming motivates the proposal-review shape, but no contained runner emits comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. 
| +| Always-On Memory Agent patterns | `product_reference` | Always-on scheduling remains a reference only. XY-934 does not implement scheduled consolidation and does not allow silent source-of-truth rewrites. | + +## Claims Allowed + +- ELF live consolidation self-checks pass for proposal materialization, source + lineage, confidence/usefulness thresholds, unsupported-claim flags, and + apply/defer/discard audit transitions. +- Fixture consolidation passes and live service-backed consolidation evidence are + separate evidence classes. +- qmd and other tracked projects remain untested or reference-only for live + consolidation proposal scoring until a contained runner emits comparable artifacts. +- Derived-output safety claims are tied to source lineage, immutable source snapshots, + zero source mutations, and review-action artifacts. + +## Claims Not Allowed + +- Do not claim scheduled production consolidation exists. +- Do not claim live provider-generated consolidation quality; the accepted + `elf.consolidation/v1` service boundary is deterministic fixture/manual proposal + materialization. +- Do not claim ELF broadly beats managed dreaming, Always-On Memory Agent, + agentmemory, qmd, or llm-wiki on consolidation without comparable contained live + runners. +- Do not mix knowledge-page rebuild/lint scoring into the consolidation claim. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 21f9b7b8..c6d926a5 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -105,6 +105,10 @@ cleanup, use `docs/guide/single_user_production.md`. report that scores ELF redaction, exclusions, source ids, evidence binding, and no secret leakage while preserving typed blocked/untested boundaries for agentmemory and claude-mem capture breadth. 
+- `2026-06-16-live-consolidation-proposal-scoring-report.md`: XY-934 live + consolidation proposal scoring report that separates fixture-backed consolidation + passes from service-backed live proposal materialization, lineage, confidence, + unsupported-claim flags, and apply/defer/discard audit evidence. - `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 mem0/OpenMemory local OSS history, preference-correction, deletion-audit, personalization, and export-readback comparison with normalized diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 81693524..ce1bcc1d 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -402,6 +402,27 @@ These fixtures use the same reviewable proposal shape as the runtime manual/fixt consolidation service. They remain offline fixture responses and do not claim scheduled provider-backed proposal generation. +Current live consolidation increment: + +```sh +cargo make real-world-memory-live-consolidation +``` + +This runs only `apps/elf-eval/fixtures/real_world_memory/consolidation/` through the +ELF live service adapter and writes: + +```text +tmp/real-world-memory/live-consolidation/elf-materialization.json +tmp/real-world-memory/live-consolidation/elf-report.json +tmp/real-world-memory/live-consolidation/elf-report.md +tmp/real-world-memory/live-consolidation/summary.json +``` + +The live increment proves service-backed proposal materialization and review audit for +the current checked-in consolidation jobs. It does not implement scheduled production +consolidation, live provider-generated proposal quality, source-of-truth rewrites, or +knowledge-page rebuild/lint scoring. 
+ Current checked-in knowledge-compilation increment: ```sh diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index c918eab9..bc5761b4 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -229,16 +229,19 @@ "outcome": "not_tested", "evidence_classes": [ "fixture_backed", + "live_real_world", + "research_gate", "not_encoded" ], - "measured_claim": "ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded.", + "measured_claim": "ELF fixture consolidation passes, and XY-934 adds live service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json" ], "follow_up_issues": [ - "XY-926" + "XY-934" ], - "caveat": "Fixture evidence cannot be promoted into live proposal-quality proof." + "caveat": "The live evidence is an ELF self-check for deterministic fixture/manual proposal materialization; no direct managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki live competitor runner is claimed." 
}, { "scenario_id": "knowledge_page_compilation", diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 59e5a19f..3de690bd 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -478,11 +478,11 @@ { "scenario_id": "consolidation", "scenario": "consolidation", - "current_elf_evidence": "ELF fixture-backed consolidation passes, but live_real_world consolidation is not_encoded.", - "strongest_competitor_or_reference": "agentmemory, managed dreaming references, llm-wiki", - "current_competitor_evidence": "Manifest projects do not yet have live consolidation scoring; llm-wiki knowledge workflow is research_gate not_encoded.", - "current_state": "Fixture-only ELF evidence is useful, but no live proposal-generation parity claim is allowed.", - "next_measurement": "Run a reviewable consolidation-worker benchmark that emits proposals, source refs, unsupported-claim flags, and apply/discard/defer audit events." 
+ "current_elf_evidence": "ELF fixture-backed consolidation passes, and XY-934 adds live_real_world service-backed proposal scoring with source lineage, confidence/usefulness, unsupported-claim flags, apply/defer/discard audit, and zero source mutations.", + "strongest_competitor_or_reference": "managed dreaming, Always-On Memory Agent patterns, agentmemory, llm-wiki", + "current_competitor_evidence": "No direct live competitor runner emits comparable consolidation artifacts; qmd remains not_encoded and managed dreaming plus Always-On Memory Agent patterns are product references only.", + "current_state": "ELF has live consolidation self-check evidence, but no broad consolidation superiority or direct competitor parity claim is allowed without contained external runners.", + "next_measurement": "Add contained competitor/reference runners only if they can emit source ids, confidence, unsupported-claim flags, and review-action audit artifacts." }, { "scenario_id": "knowledge_pages", diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json index 596791e9..76104dc5 100644 --- a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -4,7 +4,7 @@ "authority": "XY-951", "created_at": "2026-06-16T00:00:00Z", "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", - "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run and XY-934 live consolidation proposal scoring run on 2026-06-16; no private-corpus or provider-backed production pass is claimed by 
this ledger.", "typed_status_terms": [ "pass", "wrong_result", @@ -36,12 +36,14 @@ "Typed non-pass states must remain typed; blocked, not_tested, not_encoded, incomplete, lifecycle_fail, unsupported, and wrong_result must not be collapsed into a generic fail or hidden under pass.", "Fixture-backed evidence may prove benchmark shape but must not be promoted into live_real_world product quality.", "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.", - "The XY-905 post-stage live memory_evolution result is a narrow temporal reconciliation improvement only; it must not be converted into private-corpus, hosted memory, or broad competitor superiority claims." + "The XY-905 post-stage live memory_evolution result is a narrow temporal reconciliation improvement only; it must not be converted into private-corpus, hosted memory, or broad competitor superiority claims.", + "The XY-934 live consolidation result is a narrow ELF self-check only; it must not be converted into broad managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki superiority claims without comparable contained runners." 
], "summary": { "improved": [ "current_vs_historical_correctness", - "preference_evolution" + "preference_evolution", + "reviewable_consolidation" ], "regressed": [], "unchanged": [ @@ -52,7 +54,6 @@ "scheduled_memory_task_readiness" ], "not_tested": [ - "reviewable_consolidation", "memory_summary_top_of_mind_behavior", "proactive_brief_readiness" ] @@ -234,8 +235,8 @@ { "stage_id": "reviewable_consolidation", "stage_name": "Reviewable consolidation", - "dependent_issue": "XY-926", - "evidence_class": "fixture_backed", + "dependent_issue": "XY-934", + "evidence_class": "live_real_world", "baseline_commands": [ { "command": "cargo make real-world-memory-consolidation", @@ -248,6 +249,10 @@ "command": "cargo make real-world-memory-consolidation", "required_artifact": "tmp/real-world-memory/consolidation/report.json" }, + { + "command": "cargo make real-world-memory-live-consolidation", + "required_artifact": "tmp/real-world-memory/live-consolidation/summary.json" + }, { "command": "cargo make real-world-memory-live-adapters", "required_artifact": "tmp/real-world-memory/live-adapters/" @@ -255,7 +260,8 @@ ], "evidence_files": [ "docs/spec/system_consolidation_proposals_v1.md", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json", "apps/elf-eval/fixtures/real_world_memory/consolidation/" ], "baseline_counts": { @@ -265,11 +271,19 @@ "not_tested": 1, "not_encoded": 1 }, - "baseline_basis": "Consolidation fixtures pass, but live consolidation proposal generation and review-action scoring are not encoded.", - "comparison_judgment": "not_tested", + "baseline_basis": "Before XY-934, consolidation fixtures passed but live consolidation proposal generation and review-action scoring were not encoded.", + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 
0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-934 adds ELF live service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, apply/defer/discard audit, and zero source mutations for 4 consolidation jobs.", + "comparison_judgment": "improved", "regression_rule": "Any source mutation, missing lineage, or collapse of review actions into an automatic rewrite is a regression.", - "improvement_rule": "An improvement requires live or service-backed consolidation scoring without provider hidden state and without mutating authoritative sources.", - "next_optimization_direction": "Keep Dreaming output derived and reviewable: proposal lineage, confidence, unsupported-claim flags, apply/defer/discard audit, and immutable source snapshots." + "improvement_rule": "The stage is improved when live or service-backed consolidation scoring exists without provider hidden state and without mutating authoritative sources.", + "next_optimization_direction": "Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts." 
}, { "stage_id": "memory_summary_top_of_mind_behavior", diff --git a/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json b/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json new file mode 100644 index 00000000..4f33fed9 --- /dev/null +++ b/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json @@ -0,0 +1,137 @@ +{ + "schema": "elf.live_consolidation_proposal_scoring_report/v1", + "report_id": "xy-934-live-consolidation-proposal-scoring-2026-06-16", + "authority": "XY-934", + "created_at": "2026-06-16T00:00:00Z", + "commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "status": "pass", + "artifact": "tmp/real-world-memory/consolidation/report.json" + }, + { + "command": "cargo make real-world-memory-live-consolidation", + "status": "pass", + "artifact": "tmp/real-world-memory/live-consolidation/summary.json" + } + ], + "fixture_aggregate": { + "suite_id": "consolidation", + "evidence_class": "fixture_backed", + "encoded_job_count": 4, + "suite_status": "pass", + "proposal_count": 4, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "executable_gap_count": 0 + }, + "live_consolidation_results": { + "elf_live_real_world": { + "evidence_class": "live_real_world", + "suite_status": "pass", + "encoded_job_count": 4, + "proposal_count": 4, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "review_event_count": 6, + "artifact": "tmp/real-world-memory/live-consolidation/elf-report.json", + "materialization_artifact": "tmp/real-world-memory/live-consolidation/elf-materialization.json" + }, + "qmd_live_real_world": { + "evidence_class": "live_real_world", + "suite_status": "not_encoded", + "encoded_job_count": 4, + "proposal_count": 0, + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + } + }, + 
"jobs": [ + { + "job_id": "consolidation-project-summary-apply-001", + "status": "pass", + "proposal_kind": "project_summary", + "source_lineage_count": 2, + "usefulness_score": 0.93, + "min_usefulness_score": 0.8, + "review_action": "apply", + "final_review_state": "applied", + "review_event_count": 2, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-weekly-decision-summary-apply-001", + "status": "pass", + "proposal_kind": "weekly_decision_summary", + "source_lineage_count": 2, + "usefulness_score": 0.91, + "min_usefulness_score": 0.8, + "review_action": "apply", + "final_review_state": "applied", + "review_event_count": 2, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-preference-candidate-defer-001", + "status": "pass", + "proposal_kind": "preference_candidate", + "source_lineage_count": 2, + "usefulness_score": 0.86, + "min_usefulness_score": 0.75, + "review_action": "defer", + "final_review_state": "archived", + "review_event_count": 1, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-contradiction-report-discard-001", + "status": "pass", + "proposal_kind": "contradiction_report", + "source_lineage_count": 3, + "usefulness_score": 0.9, + "min_usefulness_score": 0.8, + "review_action": "discard", + "final_review_state": "rejected", + "review_event_count": 1, + "unsupported_claim_flag_count": 1, + "source_mutation_count": 0 + } + ], + "reference_positions": [ + { + "project": "qmd", + "position": "untested", + "reason": "qmd keeps consolidation jobs typed not_encoded in the full live sweep; no proposal generation or review-action audit runner exists for qmd." 
+ }, + { + "project": "managed_dreaming_memory_systems", + "position": "product_reference", + "reason": "Managed dreaming motivates the derived proposal-review shape, but no contained runner emits comparable source ids, confidence, unsupported-claim flags, and review audit artifacts." + }, + { + "project": "always_on_memory_agent_patterns", + "position": "product_reference", + "reason": "Always-on scheduling remains a reference only; XY-934 does not implement scheduled consolidation and does not allow silent source-of-truth rewrites." + } + ], + "claim_boundary": { + "allowed": [ + "ELF live consolidation self-checks pass for proposal materialization, source lineage, confidence/usefulness thresholds, unsupported-claim flags, and apply/defer/discard audit transitions.", + "Fixture consolidation passes and live service-backed consolidation evidence are separate evidence classes.", + "qmd and other tracked projects remain untested or reference-only for live consolidation proposal scoring until a contained runner emits comparable artifacts.", + "Derived-output safety claims are tied to source lineage, immutable source snapshots, zero source mutations, and review-action artifacts." + ], + "not_allowed": [ + "Do not claim scheduled production consolidation exists.", + "Do not claim live provider-generated consolidation quality; the accepted elf.consolidation/v1 service boundary is deterministic fixture/manual proposal materialization.", + "Do not claim ELF broadly beats managed dreaming, Always-On Memory Agent, agentmemory, qmd, or llm-wiki on consolidation without comparable contained live runners.", + "Do not mix knowledge-page rebuild/lint scoring into the consolidation claim." 
+    ]
+  }
+}
diff --git a/scripts/real-world-consolidation-live-adapter.sh b/scripts/real-world-consolidation-live-adapter.sh
new file mode 100755
index 00000000..5d506134
--- /dev/null
+++ b/scripts/real-world-consolidation-live-adapter.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+#
+# Live consolidation adapter sweep.
+#
+# Runs real_world_live_adapter over the consolidation fixtures, feeds its output
+# through real_world_job_benchmark (run, then publish), and writes summary.json,
+# which bundles the materialization evidence with the report summary and suites.
+#
+# Environment:
+#   ELF_CONSOLIDATION_LIVE_REPORT_DIR  Output directory
+#       (default: <repo>/tmp/real-world-memory/live-consolidation).
+#   ELF_CONSOLIDATION_LIVE_FIXTURES    Input fixture directory
+#       (default: <repo>/apps/elf-eval/fixtures/real_world_memory/consolidation).
+#   ELF_CONSOLIDATION_LIVE_ALLOW_HOST  Set to 1 to permit running outside Docker.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+REPORT_DIR="${ELF_CONSOLIDATION_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-consolidation}"
+FIXTURE_DIR="${ELF_CONSOLIDATION_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory/consolidation}"
+
+# A missing /.dockerenv is treated as "running on the host"; refuse unless the
+# caller opted in explicitly.
+if [[ ! -f "/.dockerenv" && "${ELF_CONSOLIDATION_LIVE_ALLOW_HOST:-0}" != "1" ]]; then
+  echo "Refusing to run live consolidation adapter outside Docker. Use cargo make real-world-memory-live-consolidation." >&2
+  exit 1
+fi
+
+# Fail early when a required tool is not on PATH.
+for cmd in bash cargo jq; do
+  if ! command -v "${cmd}" >/dev/null 2>&1; then
+    echo "Missing ${cmd} in live consolidation runner." >&2
+    exit 1
+  fi
+done
+
+# Ensure the report directory exists, then drop artifacts left by an earlier run.
+# ${REPORT_DIR:?} aborts on an empty value so rm -rf never hits a root-level path.
+mkdir -p "${REPORT_DIR}"
+rm -rf "${REPORT_DIR:?}/elf-fixtures" \
+  "${REPORT_DIR:?}/elf-materialization.json" \
+  "${REPORT_DIR:?}/elf-report.json" \
+  "${REPORT_DIR:?}/elf-report.md" \
+  "${REPORT_DIR:?}/summary.json"
+
+cd "${ROOT_DIR}"
+
+# 1. Run the live adapter: writes the output fixtures and the evidence file.
+cargo run -p elf-eval --bin real_world_live_adapter -- elf \
+  --fixtures "${FIXTURE_DIR}" \
+  --out-fixtures "${REPORT_DIR}/elf-fixtures" \
+  --evidence-out "${REPORT_DIR}/elf-materialization.json" \
+  --config config/local/elf.docker.toml
+
+# 2. Run the job benchmark over the adapter's output fixtures.
+cargo run -p elf-eval --bin real_world_job_benchmark -- run \
+  --fixtures "${REPORT_DIR}/elf-fixtures" \
+  --out "${REPORT_DIR}/elf-report.json" \
+  --run-id real-world-memory-live-consolidation \
+  --adapter-id elf_live_real_world \
+  --adapter-name "ELF live consolidation service adapter" \
+  --adapter-behavior live_real_world_adapter \
+  --adapter-storage-status pass \
+  --adapter-runtime-status pass \
+  --adapter-notes "Materialized by real_world_live_adapter through ElfService consolidation_run_create, worker proposal materialization, and apply/defer/discard review audit transitions; source notes remain immutable derived-output evidence."
+
+# 3. Publish the JSON report as markdown.
+cargo run -p elf-eval --bin real_world_job_benchmark -- publish \
+  --report "${REPORT_DIR}/elf-report.json" \
+  --out "${REPORT_DIR}/elf-report.md"
+
+# 4. Write summary.json. As with the shell variables above, an unset or empty
+# environment variable falls back to the (repo-relative) default, and the report
+# paths are derived from artifact_dir so an overridden report directory is
+# reflected consistently instead of pointing at the default location.
+jq -n \
+  --slurpfile materialization "${REPORT_DIR}/elf-materialization.json" \
+  --slurpfile report "${REPORT_DIR}/elf-report.json" \
+  '((env.ELF_CONSOLIDATION_LIVE_REPORT_DIR | select(. != "")) // "tmp/real-world-memory/live-consolidation") as $artifact_dir
+  | ((env.ELF_CONSOLIDATION_LIVE_FIXTURES | select(. != "")) // "apps/elf-eval/fixtures/real_world_memory/consolidation") as $fixture_dir
+  | {
+    schema: "elf.real_world_consolidation_live_adapter_sweep/v1",
+    generated_at: (now | todateiso8601),
+    fixture_dir: $fixture_dir,
+    artifact_dir: $artifact_dir,
+    adapter: {
+      adapter_id: "elf_live_real_world",
+      evidence_class: "live_real_world",
+      materialization: $materialization[0],
+      report: {
+        json: ($artifact_dir + "/elf-report.json"),
+        markdown: ($artifact_dir + "/elf-report.md"),
+        summary: $report[0].summary,
+        suites: $report[0].suites
+      }
+    }
+  }' >"${REPORT_DIR}/summary.json"