From f78661a2d3d236fc8c11637ac1d4e01269a5597e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 00:58:32 +0800 Subject: [PATCH 1/7] {"schema":"decodex/commit/1","summary":"Expand first-generation OSS adapter benchmark coverage","authority":"XY-925"} --- Makefile.toml | 52 ++++ README.md | 12 +- ...ntmemory_durable_capture_path_blocked.json | 208 ++++++++++++++ .../claude_mem_hook_viewer_blocked.json | 208 ++++++++++++++ .../claude_mem_progressive_disclosure.json | 215 +++++++++++++++ .../claude_mem_retrieval_repair.json | 192 +++++++++++++ .../memsearch_markdown_rebuild_reload.json | 192 +++++++++++++ .../memsearch_retrieval_debug_prompt.json | 254 ++++++++++++++++++ .../memory_projects_manifest.json | 144 ++++++++-- .../tests/real_world_job_benchmark.rs | 175 ++++++++++-- ...-11-competitor-strength-adoption-report.md | 26 +- ...-11-competitor-strength-evidence-matrix.md | 16 +- ...tion-oss-continuity-source-store-report.md | 99 +++++++ .../2026-06-11-measurement-coverage-audit.md | 4 +- docs/guide/benchmarking/index.md | 5 + ...1-competitor-strength-adoption-report.json | 34 ++- ...on-oss-continuity-source-store-report.json | 140 ++++++++++ ...-11-xy-897-competitor-strength-matrix.json | 56 ++-- 18 files changed, 1920 insertions(+), 112 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json create mode 100644 
apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json create mode 100644 docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md create mode 100644 docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json diff --git a/Makefile.toml b/Makefile.toml index 42b2033c..9dcc099b 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -839,6 +839,9 @@ args = [ # | real-world-memory-knowledge | composite | | # | real-world-memory-knowledge-json | command | | # | real-world-memory-knowledge-report | command | | +# | real-world-first-generation-oss | composite | | +# | real-world-first-generation-oss-json | command | | +# | real-world-first-generation-oss-report | command | | # | ragflow-docker-smoke | command | | # | lightrag-docker-context-smoke | command | | # | graphrag-docker-smoke | command | | @@ -933,6 +936,55 @@ args = [ "tmp/real-world-memory/knowledge-report.md", ] +[tasks.real-world-first-generation-oss] +workspace = false +dependencies = [ + "real-world-first-generation-oss-report", +] + +[tasks.real-world-first-generation-oss-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss", + "--out", + "tmp/real-world-memory/first-generation-oss/report.json", + "--run-id", + "first-generation-oss-continuity-source-store", + "--adapter-id", + "fixture_first_generation_oss", + "--adapter-name", + "First-generation OSS fixture coverage", +] + +[tasks.real-world-first-generation-oss-report] +workspace = false +dependencies = [ + "real-world-first-generation-oss-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/first-generation-oss/report.json", + "--out", + 
"tmp/real-world-memory/first-generation-oss/report.md", +] + # External memory pattern radar # | task | type | cwd | diff --git a/README.md b/README.md index f9ef9e1b..11319c42 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,13 @@ provider-backed ELF evidence was required. command and repair-action clarity but is `wrong_result` for trace hydration and candidate-drop stage visibility. OpenMemory UI/export and claude-mem viewer flows remain blocked or not encoded, so this is not a broad viewer-product claim. +- First-generation OSS continuity/source-store follow-up after XY-925: `cargo make + real-world-first-generation-oss` emits a fixture-backed external-adapter slice for + agentmemory, memsearch, and claude-mem with 6 jobs, 4 pass, 2 blocked, and full + evidence/source-ref/quote coverage. It selects agentmemory's durable local path, + adds memsearch canonical Markdown source-store and retrieval-debug prompt coverage, + and records claude-mem progressive-disclosure/retrieval-repair coverage while + keeping hook and viewer/operator workflows blocked. - Expanded adapter-pack coverage after XY-834: the real-world external adapter manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and deeper @@ -208,7 +215,8 @@ provider-backed ELF evidence was required. `cargo make baseline-backfill-10k-docker`, `cargo make baseline-backfill-100k-docker`, `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, - `cargo make real-world-memory-live-adapters`, and + `cargo make real-world-memory-live-adapters`, + `cargo make real-world-first-generation-oss`, and `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles are opt-in and do not run in normal checks. 
@@ -225,6 +233,7 @@ Detailed evidence and interpretation: - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -303,6 +312,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) - [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json 
b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json new file mode 100644 index 00000000..68cc2395 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json @@ -0,0 +1,208 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-agentmemory-durable-capture-blocked-001", + "suite": "capture_integration", + "title": "Select the durable agentmemory capture path before scoring hooks", + "encoding": { + "status": "blocked", + "reason": "agentmemory's current Docker baseline still uses a process-local SDK/KV mock, so work-resume and write-policy hook capture cannot be scored until a persistent local session, KV, and index path survives a fresh process.", + "follow_up": { + "title": "Wire agentmemory durable local session capture for work-resume jobs", + "reason": "The fair path is a Docker-contained adapter that persists the agentmemory observation log, KV store, and searchable index between capture and replay processes." 
+ } + }, + "corpus": { + "corpus_id": "first-generation-oss-agentmemory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "agentmemory-selected-durable-path", + "kind": "adapter_plan", + "text": "Selected agentmemory path: run capture hooks into a Docker-local session directory, persist the SDK KV store and searchable index, restart a fresh process, then score work_resume and write-policy prompts against that recovered store.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-selected-durable-path" + }, + "locator": { + "quote": "persist the SDK KV store and searchable index" + } + }, + "created_at": "2026-06-11T10:00:00Z" + }, + { + "evidence_id": "agentmemory-mock-boundary", + "kind": "adapter_blocker", + "text": "Current blocker: the live-baseline adapter registers agentmemory functions against a process-local StateKV Map and in-memory index, so it cannot prove cold-start recovery or hook capture durability.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-mock-boundary" + }, + "locator": { + "quote": "process-local StateKV Map and in-memory index" + } + }, + "created_at": "2026-06-11T10:01:00Z" + }, + { + "evidence_id": "agentmemory-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: agentmemory same-corpus retrieval passing through the mock proves durable coding-agent continuity and write-policy capture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-pass-decoy" + } + }, + "created_at": "2026-06-11T09:59:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + 
"content": "agentmemory remains blocked for durable work-resume and write-policy hook capture. The selected local path is a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; the current StateKV Map and in-memory index cannot prove that.", + "claims": [ + { + "claim_id": "selected_durable_path", + "text": "The selected local path persists the SDK KV store and searchable index across a fresh process.", + "evidence_ids": ["agentmemory-selected-durable-path"], + "confidence": "high" + }, + { + "claim_id": "current_mock_blocker", + "text": "The current StateKV Map and in-memory index cannot prove durable continuity.", + "evidence_ids": ["agentmemory-mock-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["agentmemory-selected-durable-path", "agentmemory-mock-boundary"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + }, + "capture_behaviors": { + "blocked": [ + "agentmemory durable hook capture waits for a persistent Docker-local session, KV, and index path." + ], + "notes": [ + "Same-corpus mock retrieval is not promoted into work-resume or capture integration pass evidence." + ] + } + }, + "timeline": [ + { + "event_id": "agentmemory-durable-path-selected", + "ts": "2026-06-11T10:00:00Z", + "actor": "benchmark", + "action": "selected_durable_adapter_path", + "evidence_ids": ["agentmemory-selected-durable-path"], + "summary": "The next fair agentmemory path must persist capture state across a fresh process." + }, + { + "event_id": "agentmemory-mock-blocker-preserved", + "ts": "2026-06-11T10:01:00Z", + "actor": "benchmark", + "action": "kept_blocked_state", + "evidence_ids": ["agentmemory-mock-boundary"], + "summary": "The current in-memory adapter remains blocked for durable continuity." 
+ } + ], + "prompt": { + "role": "user", + "content": "What local agentmemory path should be used for work-resume and write-policy capture, and can the current mock be scored?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "do_not_promote_mock_smoke"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "selected_durable_path", + "text": "The selected local path persists the SDK KV store and searchable index across a fresh process." + }, + { + "claim_id": "current_mock_blocker", + "text": "The current StateKV Map and in-memory index cannot prove durable continuity." + } + ], + "must_not_include": [ + "same-corpus retrieval passing through the mock proves durable coding-agent continuity" + ], + "evidence_links": { + "selected_durable_path": ["agentmemory-selected-durable-path"], + "current_mock_blocker": ["agentmemory-mock-boundary"] + }, + "answer_type": "blocked_plan", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "agentmemory-selected-durable-path", + "claim_id": "selected_durable_path", + "requirement": "cite", + "quote": "persist the SDK KV store and searchable index" + }, + { + "evidence_id": "agentmemory-mock-boundary", + "claim_id": "current_mock_blocker", + "requirement": "cite", + "quote": "process-local StateKV Map and in-memory index" + } + ], + "negative_traps": [ + { + "trap_id": "mock-smoke-durable-pass", + "type": "unsupported_prior", + "evidence_ids": ["agentmemory-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Keeps the durable path blocked until persistent state is proven." + }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the concrete local path needed for the next adapter." 
+ }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the selected path and the current mock boundary." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not promote the mock same-corpus smoke into durable continuity proof." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "agentmemory", "capture_integration", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json new file mode 100644 index 00000000..49d0dc92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json @@ -0,0 +1,208 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-claude-mem-hook-viewer-blocked-001", + "suite": "capture_integration", + "title": "Keep claude-mem hook and viewer workflows blocked until Docker-contained", + "encoding": { + "status": "blocked", + "reason": "The current claude-mem Docker baseline exercises repository classes and durable SQLite only; it does not launch hooks, timeline capture, the local viewer, or an operator workflow over the same corpus.", + "follow_up": { + "title": "Encode claude-mem hook capture and viewer workflow in Docker", + "reason": "A fair UX comparison requires hook observations, timeline/viewer readback, and retrieval repair artifacts produced inside the same containerized run." 
+ } + }, + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-hook-viewer-blocker", + "kind": "adapter_blocker", + "text": "claude-mem hook/viewer blocker: the current Docker runner uses repository classes only and does not execute hook capture, local viewer timeline readback, or operator repair workflows.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-hook-viewer-blocker" + }, + "locator": { + "quote": "does not execute hook capture, local viewer timeline readback" + } + }, + "created_at": "2026-06-11T10:50:00Z" + }, + { + "evidence_id": "claude-mem-needed-docker-path", + "kind": "adapter_plan", + "text": "Needed claude-mem path: run hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus, then emit timeline, detail hydration, and repair-command artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-needed-docker-path" + }, + "locator": { + "quote": "run hook capture and viewer/operator readback inside Docker" + } + }, + "created_at": "2026-06-11T10:51:00Z" + }, + { + "evidence_id": "claude-mem-hook-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: repository class tests prove claude-mem hook capture and viewer workflows pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-hook-pass-decoy" + } + }, + "created_at": "2026-06-11T10:49:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem hook capture and viewer/operator workflows remain blocked. 
The current runner uses repository classes only; the next comparable path must run hook capture plus viewer/operator readback inside Docker against the same durable SQLite corpus and emit timeline, hydration, and repair-command artifacts.", + "claims": [ + { + "claim_id": "hook_viewer_blocked", + "text": "The current runner does not execute hook capture or local viewer timeline readback.", + "evidence_ids": ["claude-mem-hook-viewer-blocker"], + "confidence": "high" + }, + { + "claim_id": "needed_docker_path", + "text": "The needed path is hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus.", + "evidence_ids": ["claude-mem-needed-docker-path"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-hook-viewer-blocker", "claude-mem-needed-docker-path"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + }, + "capture_behaviors": { + "blocked": [ + "claude-mem hook capture and viewer/operator readback are not Docker-contained yet." + ], + "notes": [ + "Repository class lifecycle and hydration evidence must not be reused as hook or viewer workflow proof." + ] + } + }, + "timeline": [ + { + "event_id": "claude-mem-hook-viewer-blocker-recorded", + "ts": "2026-06-11T10:50:00Z", + "actor": "benchmark", + "action": "recorded_blocker", + "evidence_ids": ["claude-mem-hook-viewer-blocker"], + "summary": "Hook capture and local viewer readback are outside the current Docker runner." + }, + { + "event_id": "claude-mem-needed-path-recorded", + "ts": "2026-06-11T10:51:00Z", + "actor": "benchmark", + "action": "selected_next_path", + "evidence_ids": ["claude-mem-needed-docker-path"], + "summary": "The next fair path must run hook capture and viewer/operator readback inside Docker." 
+ } + ], + "prompt": { + "role": "user", + "content": "Can claude-mem hook capture and viewer workflows be scored from the current Docker baseline?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "avoid_repository_overclaim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "hook_viewer_blocked", + "text": "The current runner does not execute hook capture or local viewer timeline readback." + }, + { + "claim_id": "needed_docker_path", + "text": "The needed path is hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus." + } + ], + "must_not_include": [ + "repository class tests prove claude-mem hook capture and viewer workflows pass" + ], + "evidence_links": { + "hook_viewer_blocked": ["claude-mem-hook-viewer-blocker"], + "needed_docker_path": ["claude-mem-needed-docker-path"] + }, + "answer_type": "blocked_plan", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-hook-viewer-blocker", + "claim_id": "hook_viewer_blocked", + "requirement": "cite", + "quote": "does not execute hook capture, local viewer timeline readback" + }, + { + "evidence_id": "claude-mem-needed-docker-path", + "claim_id": "needed_docker_path", + "requirement": "explain", + "quote": "run hook capture and viewer/operator readback inside Docker" + } + ], + "negative_traps": [ + { + "trap_id": "repository-class-hook-viewer-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-hook-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Keeps hook/viewer workflow blocked until a Docker-contained run exists." + }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the next comparable Docker path." 
+ }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current blocker and needed path." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not reuse repository class checks as hook/viewer proof." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "claude-mem", "capture_integration", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json new file mode 100644 index 00000000..48bd8092 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json @@ -0,0 +1,215 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-claude-mem-progressive-disclosure-001", + "suite": "operator_debugging_ux", + "title": "Preserve claude-mem progressive-disclosure evidence boundary", + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-detail-hydration", + "kind": "adapter_artifact", + "text": "claude-mem progressive evidence: the Docker repository path verified search result to getById detail hydration plus listSources source evidence on a durable SQLite repository.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-detail-hydration" + }, + "locator": { + "quote": "getById detail hydration plus 
listSources source evidence" + } + }, + "created_at": "2026-06-11T10:30:00Z" + }, + { + "evidence_id": "claude-mem-progressive-boundary", + "kind": "claim_boundary", + "text": "claude-mem boundary: repository search-to-detail hydration is useful progressive-disclosure evidence, but it does not execute hooks, timeline capture, viewer workflows, or real-world prompt scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-progressive-boundary" + }, + "locator": { + "quote": "does not execute hooks, timeline capture, viewer workflows" + } + }, + "created_at": "2026-06-11T10:31:00Z" + }, + { + "evidence_id": "claude-mem-viewer-decoy", + "kind": "adapter_state", + "text": "Decoy: repository detail hydration proves claude-mem viewer and hook workflows pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-viewer-decoy" + } + }, + "created_at": "2026-06-11T10:29:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem has Docker-contained progressive-disclosure evidence at the repository layer: search results can be hydrated through getById and listSources on durable SQLite. 
That should stay separate from hook, timeline, viewer, and real-world prompt scoring, which are not executed by the current runner.", + "claims": [ + { + "claim_id": "repository_progressive_evidence", + "text": "claude-mem search results can be hydrated through getById and listSources on durable SQLite.", + "evidence_ids": ["claude-mem-detail-hydration"], + "confidence": "high" + }, + { + "claim_id": "viewer_hook_boundary", + "text": "Hook, timeline, viewer, and real-world prompt scoring are not executed by the current runner.", + "evidence_ids": ["claude-mem-progressive-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-detail-hydration", "claude-mem-progressive-boundary"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "claude-mem-detail-hydration-recorded", + "ts": "2026-06-11T10:30:00Z", + "actor": "benchmark", + "action": "recorded_progressive_disclosure_evidence", + "evidence_ids": ["claude-mem-detail-hydration"], + "summary": "The Docker repository path exposes search-to-detail/source hydration." + }, + { + "event_id": "claude-mem-viewer-boundary-recorded", + "ts": "2026-06-11T10:31:00Z", + "actor": "benchmark", + "action": "preserved_viewer_hook_boundary", + "evidence_ids": ["claude-mem-progressive-boundary"], + "summary": "Repository hydration is not promoted into hook or viewer pass evidence." + } + ], + "prompt": { + "role": "user", + "content": "What claude-mem progressive-disclosure evidence is measured, and what remains outside the Docker-contained path?", + "job_mode": "debug", + "constraints": ["cite_evidence", "separate_repository_from_viewer", "avoid_hook_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "repository_progressive_evidence", + "text": "claude-mem search results can be hydrated through getById and listSources on durable SQLite." 
+ }, + { + "claim_id": "viewer_hook_boundary", + "text": "Hook, timeline, viewer, and real-world prompt scoring are not executed by the current runner." + } + ], + "must_not_include": [ + "repository detail hydration proves claude-mem viewer and hook workflows pass" + ], + "evidence_links": { + "repository_progressive_evidence": ["claude-mem-detail-hydration"], + "viewer_hook_boundary": ["claude-mem-progressive-boundary"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-detail-hydration", + "claim_id": "repository_progressive_evidence", + "requirement": "cite", + "quote": "getById detail hydration plus listSources source evidence" + }, + { + "evidence_id": "claude-mem-progressive-boundary", + "claim_id": "viewer_hook_boundary", + "requirement": "cite", + "quote": "does not execute hooks, timeline capture, viewer workflows" + } + ], + "negative_traps": [ + { + "trap_id": "repository-hydration-viewer-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-viewer-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Explains the measured progressive-disclosure path." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites detail hydration and boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Separates repository evidence from viewer/hook follow-up." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not promote repository hydration into viewer or hook claims." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "viewer_hook_workflow_not_encoded", + "trace_id": "claude-mem-repository-detail", + "root_cause": "The Docker-contained evidence stops at repository detail/source hydration and does not run the product viewer or hooks.", + "steps_to_root_cause": 2, + "raw_sql_needed": false, + "dropped_candidate_visibility": "repository search result can be hydrated to detail and source rows", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": true, + "replay_command_available": true, + "replay_command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "replay_artifact": "tmp/live-baseline/claude-mem.log", + "viewer_panels": ["Repository Search Result", "Memory Item Detail", "Source List"], + "cli_steps": [ + "run the claude-mem Docker baseline", + "inspect getById detail hydration", + "inspect listSources evidence", + "keep hook and viewer workflows blocked until separately encoded" + ], + "trace_evidence": ["claude-mem-detail-hydration", "claude-mem-progressive-boundary"], + "ux_gaps": [] + }, + "tags": ["external_adapter", "claude-mem", "operator_debugging_ux", "progressive_disclosure", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json new file mode 100644 index 00000000..4fb20191 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + 
"job_id": "first-gen-claude-mem-retrieval-repair-001", + "suite": "retrieval", + "title": "Preserve claude-mem retrieval repair evidence after same-corpus miss", + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-same-corpus-miss", + "kind": "adapter_artifact", + "text": "claude-mem retrieval repair evidence: the Docker baseline built the durable SQLite repository but same-corpus retrieval returned 0 of 3 expected query checks, so retrieval quality remains wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-same-corpus-miss" + }, + "locator": { + "quote": "same-corpus retrieval returned 0 of 3 expected query checks" + } + }, + "created_at": "2026-06-11T10:40:00Z" + }, + { + "evidence_id": "claude-mem-repair-command", + "kind": "debug_command", + "text": "claude-mem repair command: rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker, then inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json before changing retrieval scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-repair-command" + }, + "locator": { + "quote": "inspect tmp/live-baseline/claude-mem.log" + } + }, + "created_at": "2026-06-11T10:41:00Z" + }, + { + "evidence_id": "claude-mem-retrieval-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: because claude-mem repository lifecycle passed, same-corpus retrieval also passed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-retrieval-pass-decoy" + } + }, + "created_at": "2026-06-11T10:39:00Z" + } + ], + 
"adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem retrieval remains wrong_result: the durable SQLite repository built, but same-corpus retrieval returned 0 of 3 expected query checks. The repair path is to rerun the claude-mem baseline, inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json, then fix retrieval before any pass claim.", + "claims": [ + { + "claim_id": "retrieval_wrong_result", + "text": "claude-mem same-corpus retrieval returned 0 of 3 expected query checks.", + "evidence_ids": ["claude-mem-same-corpus-miss"], + "confidence": "high" + }, + { + "claim_id": "repair_artifact_path", + "text": "The repair path is to inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json.", + "evidence_ids": ["claude-mem-repair-command"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-same-corpus-miss", "claude-mem-repair-command"], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "claude-mem-wrong-result-recorded", + "ts": "2026-06-11T10:40:00Z", + "actor": "benchmark", + "action": "recorded_same_corpus_wrong_result", + "evidence_ids": ["claude-mem-same-corpus-miss"], + "summary": "The same-corpus result remains wrong_result despite durable repository lifecycle evidence." + }, + { + "event_id": "claude-mem-repair-artifact-recorded", + "ts": "2026-06-11T10:41:00Z", + "actor": "benchmark", + "action": "recorded_repair_artifact_path", + "evidence_ids": ["claude-mem-repair-command"], + "summary": "The repair path points at the reproducible Docker baseline and logs." 
+ } + ], + "prompt": { + "role": "user", + "content": "Did claude-mem retrieval pass, and what artifact should I inspect to repair the miss?", + "job_mode": "debug", + "constraints": ["cite_evidence", "preserve_wrong_result", "name_repair_artifact"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "retrieval_wrong_result", + "text": "claude-mem same-corpus retrieval returned 0 of 3 expected query checks." + }, + { + "claim_id": "repair_artifact_path", + "text": "The repair path is to inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + } + ], + "must_not_include": [ + "same-corpus retrieval also passed" + ], + "evidence_links": { + "retrieval_wrong_result": ["claude-mem-same-corpus-miss"], + "repair_artifact_path": ["claude-mem-repair-command"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-same-corpus-miss", + "claim_id": "retrieval_wrong_result", + "requirement": "cite", + "quote": "same-corpus retrieval returned 0 of 3 expected query checks" + }, + { + "evidence_id": "claude-mem-repair-command", + "claim_id": "repair_artifact_path", + "requirement": "explain", + "quote": "inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json" + } + ], + "negative_traps": [ + { + "trap_id": "lifecycle-pass-implies-retrieval-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-retrieval-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Keeps same-corpus retrieval as wrong_result." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the wrong-result artifact and repair command." 
+ }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the concrete artifact path for repair." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not infer retrieval pass from lifecycle pass." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "claude-mem", "retrieval", "wrong_result", "repair"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json new file mode 100644 index 00000000..c94b9486 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-memsearch-markdown-rebuild-reload-001", + "suite": "trust_source_of_truth", + "title": "Verify memsearch canonical Markdown rebuild and reload boundary", + "corpus": { + "corpus_id": "first-generation-oss-memsearch-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "memsearch-canonical-markdown-store", + "kind": "source_store", + "text": "memsearch source-store evidence: the canonical Markdown corpus file is the source of truth, and the index is rebuilt by rerunning memsearch index over the file tree.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-canonical-markdown-store" + }, + "locator": { + "quote": "canonical Markdown corpus file is the source of 
truth" + } + }, + "created_at": "2026-06-11T10:10:00Z" + }, + { + "evidence_id": "memsearch-reload-proof", + "kind": "adapter_artifact", + "text": "memsearch reload proof: the Docker baseline rewrote auth-memory.md, deleted another corpus file, reran memsearch index, and a fresh memsearch search process retrieved the replacement marker while suppressing deleted evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-reload-proof" + }, + "locator": { + "quote": "a fresh memsearch search process retrieved the replacement marker" + } + }, + "created_at": "2026-06-11T10:11:00Z" + }, + { + "evidence_id": "memsearch-suite-pass-decoy", + "kind": "claim_boundary", + "text": "Decoy: because memsearch reload passed a Docker smoke, memsearch has passed the full real-world source-of-truth suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-suite-pass-decoy" + } + }, + "created_at": "2026-06-11T10:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "memsearch's comparable source-store path is the canonical Markdown corpus file, with the derived index rebuilt by rerunning memsearch index. 
The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior, but it must not be promoted to a full real-world suite pass.", + "claims": [ + { + "claim_id": "markdown_is_source_store", + "text": "The canonical Markdown corpus file is the source of truth for memsearch.", + "evidence_ids": ["memsearch-canonical-markdown-store"], + "confidence": "high" + }, + { + "claim_id": "rebuild_reload_smoke", + "text": "The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior.", + "evidence_ids": ["memsearch-reload-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["memsearch-canonical-markdown-store", "memsearch-reload-proof"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "memsearch-markdown-store-selected", + "ts": "2026-06-11T10:10:00Z", + "actor": "benchmark", + "action": "selected_canonical_markdown_store", + "evidence_ids": ["memsearch-canonical-markdown-store"], + "summary": "The memsearch comparable source-store job uses the Markdown corpus as authoritative state." + }, + { + "event_id": "memsearch-reload-artifact-recorded", + "ts": "2026-06-11T10:11:00Z", + "actor": "benchmark", + "action": "recorded_reindex_reload_smoke", + "evidence_ids": ["memsearch-reload-proof"], + "summary": "The Docker smoke supplies command-level reindex/reload evidence." + } + ], + "prompt": { + "role": "user", + "content": "What is the comparable memsearch source-of-truth path, and what does the rebuild/reload evidence prove?", + "job_mode": "answer", + "constraints": ["cite_evidence", "state_claim_boundary", "avoid_suite_promotion"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "markdown_is_source_store", + "text": "The canonical Markdown corpus file is the source of truth for memsearch." 
+ }, + { + "claim_id": "rebuild_reload_smoke", + "text": "The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior." + } + ], + "must_not_include": [ + "memsearch has passed the full real-world source-of-truth suite" + ], + "evidence_links": { + "markdown_is_source_store": ["memsearch-canonical-markdown-store"], + "rebuild_reload_smoke": ["memsearch-reload-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "memsearch-canonical-markdown-store", + "claim_id": "markdown_is_source_store", + "requirement": "cite", + "quote": "canonical Markdown corpus file is the source of truth" + }, + { + "evidence_id": "memsearch-reload-proof", + "claim_id": "rebuild_reload_smoke", + "requirement": "cite", + "quote": "a fresh memsearch search process retrieved the replacement marker" + } + ], + "negative_traps": [ + { + "trap_id": "memsearch-smoke-suite-pass", + "type": "unsupported_prior", + "evidence_ids": ["memsearch-suite-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Identifies Markdown as source store and index as rebuildable derived state." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites source-store and reload proof evidence." + }, + "lifecycle_behavior": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Explains rewrite, delete, reindex, and fresh-process reload behavior." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not promote smoke evidence into full suite pass evidence." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "memsearch", "source_store", "markdown", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json new file mode 100644 index 00000000..e3dbacdc --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json @@ -0,0 +1,254 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-memsearch-retrieval-debug-001", + "suite": "operator_debugging_ux", + "title": "Debug memsearch retrieval through Markdown file and index artifacts", + "corpus": { + "corpus_id": "first-generation-oss-memsearch-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "memsearch-debug-command", + "kind": "debug_command", + "text": "memsearch retrieval-debug evidence: rerun memsearch search with --top-k, inspect the matching Markdown file, and rerun memsearch index after any file rewrite or delete.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-debug-command" + }, + "locator": { + "quote": "inspect the matching Markdown file" + } + }, + "created_at": "2026-06-11T10:20:00Z" + }, + { + "evidence_id": "memsearch-debug-boundary", + "kind": "claim_boundary", + "text": "memsearch debug boundary: the current adapter exposes CLI search output and canonical Markdown files, but it does not emit staged query-expansion, fusion, 
rerank, or candidate-drop trace bundles.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-debug-boundary" + }, + "locator": { + "quote": "does not emit staged query-expansion, fusion, rerank, or candidate-drop trace bundles" + } + }, + "created_at": "2026-06-11T10:21:00Z" + }, + { + "evidence_id": "memsearch-trace-decoy", + "kind": "adapter_state", + "text": "Decoy: memsearch exposes the same staged retrieval trajectory and candidate-drop trace bundle as ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-trace-decoy" + } + }, + "created_at": "2026-06-11T10:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "For memsearch retrieval debugging, rerun memsearch search with --top-k, inspect the matching Markdown file, and rerun memsearch index after file changes. 
The useful debug surface is source-file transparency plus CLI replay; staged expansion, fusion, rerank, and candidate-drop trace bundles are not emitted by the current adapter.", + "claims": [ + { + "claim_id": "debug_replay_path", + "text": "Rerun memsearch search with --top-k and inspect the matching Markdown file.", + "evidence_ids": ["memsearch-debug-command"], + "confidence": "high" + }, + { + "claim_id": "trace_boundary", + "text": "The current adapter does not emit staged expansion, fusion, rerank, or candidate-drop trace bundles.", + "evidence_ids": ["memsearch-debug-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["memsearch-debug-command", "memsearch-debug-boundary"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "memsearch-cli-debug", + "failure_stage": "trace_bundle", + "failure_reason": "memsearch exposes CLI replay and source Markdown inspection, not staged retrieval trace bundles.", + "stages": [ + { + "stage_name": "cli.search", + "kept_evidence": ["memsearch-debug-command"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["memsearch-trace-decoy"], + "notes": "CLI replay can reproduce the visible result set." + }, + { + "stage_name": "source.markdown", + "kept_evidence": ["memsearch-debug-command"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The Markdown file remains inspectable as canonical source." + }, + { + "stage_name": "trace_bundle", + "kept_evidence": ["memsearch-debug-boundary"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["memsearch-trace-decoy"], + "notes": "Candidate-drop trace bundles are not encoded for memsearch." 
+ } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "memsearch-debug-path-recorded", + "ts": "2026-06-11T10:20:00Z", + "actor": "benchmark", + "action": "recorded_debug_path", + "evidence_ids": ["memsearch-debug-command"], + "summary": "The retrieval-debug job points at CLI replay and source Markdown inspection." + }, + { + "event_id": "memsearch-trace-boundary-recorded", + "ts": "2026-06-11T10:21:00Z", + "actor": "benchmark", + "action": "recorded_trace_gap", + "evidence_ids": ["memsearch-debug-boundary"], + "summary": "The job keeps staged trace bundles as not encoded for memsearch." + } + ], + "prompt": { + "role": "user", + "content": "How should I debug a wrong memsearch retrieval result, and what trace visibility is not available?", + "job_mode": "debug", + "constraints": ["cite_evidence", "identify_debug_surface", "avoid_trace_overclaim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "debug_replay_path", + "text": "Rerun memsearch search with --top-k and inspect the matching Markdown file." + }, + { + "claim_id": "trace_boundary", + "text": "The current adapter does not emit staged expansion, fusion, rerank, or candidate-drop trace bundles." 
+ } + ], + "must_not_include": [ + "memsearch exposes the same staged retrieval trajectory and candidate-drop trace bundle as ELF" + ], + "evidence_links": { + "debug_replay_path": ["memsearch-debug-command"], + "trace_boundary": ["memsearch-debug-boundary"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "memsearch-debug-command", + "claim_id": "debug_replay_path", + "requirement": "explain", + "quote": "inspect the matching Markdown file" + }, + { + "evidence_id": "memsearch-debug-boundary", + "claim_id": "trace_boundary", + "requirement": "explain", + "quote": "does not emit staged query-expansion, fusion, rerank, or candidate-drop trace bundles" + } + ], + "negative_traps": [ + { + "trap_id": "memsearch-full-trace-decoy", + "type": "unsupported_prior", + "evidence_ids": ["memsearch-trace-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Names the available memsearch debug path." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites CLI/source debug and trace-boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Provides a concrete replay and reindex sequence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not overclaim staged trace visibility." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "memsearch_trace_bundle_not_encoded", + "trace_id": "memsearch-cli-debug", + "root_cause": "memsearch debugging is available through CLI replay and canonical Markdown inspection, while staged candidate-drop trace bundles are not encoded.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not encoded; inspect CLI search output and Markdown source instead", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": false, + "replay_command_available": true, + "replay_command": "memsearch search '' --top-k 10 && memsearch index ", + "replay_artifact": "tmp/live-baseline/memsearch.log", + "viewer_panels": ["CLI Search Output", "Markdown Source File", "Index Rebuild Log"], + "cli_steps": [ + "rerun memsearch search with --top-k", + "open the matching Markdown file", + "edit or delete the canonical file if needed", + "rerun memsearch index", + "rerun search from a fresh process" + ], + "trace_evidence": ["memsearch-debug-command", "memsearch-debug-boundary"], + "ux_gaps": [ + { + "gap_id": "staged-trace-bundle-not-encoded", + "severity": "medium", + "description": "No staged expansion/fusion/rerank/candidate-drop bundle is emitted by the current memsearch adapter.", + "follow_up_issue": "XY-925" + } + ] + }, + "tags": ["external_adapter", "memsearch", "operator_debugging_ux", "retrieval_debug", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index c6074d60..33cbf264 100644 --- 
a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1,6 +1,6 @@ { "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": "real-world-memory-project-adapters-2026-06-11-openmemory-ui-export", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", "docker_isolation": { "default": true, "compose_file": "docker-compose.baseline.yml", @@ -806,10 +806,20 @@ "status": "blocked", "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." + }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, { "capability": "real_world_job_adapter", - "status": "not_encoded", - "evidence": "No agentmemory adapter currently executes real_world_job prompts and answer scoring." + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." 
} ], "suites": [ @@ -835,6 +845,7 @@ "suite_id": "retrieval", "status": "pass", "elf_position": "untested", + "comparison_outcome": "not_tested", "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" @@ -844,6 +855,7 @@ "suite_id": "memory_evolution", "status": "lifecycle_fail", "elf_position": "wins", + "comparison_outcome": "win", "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" @@ -853,8 +865,20 @@ "suite_id": "work_resume", "status": "blocked", "elf_position": "untested", - "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. Keep work_resume and capture claims blocked until a durable local adapter path exists.", - "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. 
XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" }, { "scenario_id": "capture_write_policy_hooks", @@ -862,8 +886,9 @@ "status": "blocked", "elf_position": "untested", "comparison_outcome": "blocked", - "evidence": "agentmemory capture breadth is blocked for comparison because the current Docker baseline uses a process-local StateKV Map and in-memory index; no durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound capture output.", - "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. 
The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" } ], "evidence": [ @@ -1120,19 +1145,24 @@ { "capability": "real_world_job_adapter", "status": "not_encoded", - "evidence": "No memsearch adapter currently executes real_world_job prompts and answer scoring." + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." } ], "suites": [ { "suite_id": "trust_source_of_truth", - "status": "not_encoded", - "evidence": "The Markdown-first source model passed the local reindex/reload smoke, but no real_world_job source-of-truth prompt run is encoded." + "status": "pass", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet." }, { "suite_id": "retrieval", - "status": "not_encoded", - "evidence": "The Docker same-corpus check now passes, but no job-level real_world retrieval run is encoded for memsearch." + "status": "pass", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet." 
}, { "suite_id": "memory_evolution", @@ -1146,15 +1176,37 @@ "suite_id": "trust_source_of_truth", "status": "pass", "elf_position": "untested", + "comparison_outcome": "not_tested", "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. 
The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, { "scenario_id": "ttl_expiry_lifecycle", "suite_id": "memory_evolution", "status": "unsupported", "elf_position": "untested", + "comparison_outcome": "non_goal", "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, @@ -1163,7 +1215,8 @@ "suite_id": "retrieval", "status": "not_encoded", "elf_position": "untested", - "evidence": "No memsearch adapter currently executes real_world_job prompts and answer scoring; baseline retrieval/reindex evidence must stay separate from suite pass claims.", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" } ], @@ -1325,25 +1378,35 @@ }, { "capability": "progressive_disclosure_real_world_job", - "status": "not_encoded", - "evidence": "Hook, timeline, viewer, and observation workflows are not encoded against real_world_job prompts." 
+ "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." } ], "suites": [ { "suite_id": "work_resume", - "status": "wrong_result", + "status": "not_encoded", "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." }, { "suite_id": "operator_debugging_ux", - "status": "not_encoded", - "evidence": "Local viewer/operator workflow is not encoded in the benchmark runner." + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." }, { "suite_id": "capture_integration", - "status": "not_encoded", - "evidence": "claude-mem hooks are not executed by this runner." + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." 
} ], "scenarios": [ @@ -1352,15 +1415,27 @@ "suite_id": "retrieval", "status": "wrong_result", "elf_position": "wins", + "comparison_outcome": "win", "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, { "scenario_id": "repository_lifecycle_reload", "suite_id": "memory_evolution", "status": "pass", "elf_position": "ties", + "comparison_outcome": "tie", "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. 
This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" @@ -1370,17 +1445,40 @@ "suite_id": "operator_debugging_ux", "status": "pass", "elf_position": "untested", + "comparison_outcome": "not_tested", "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "tmp/live-baseline/live-baseline-report.json" }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, { "scenario_id": "hook_capture_viewer_workflow", "suite_id": "capture_integration", - "status": "not_encoded", + "status": "blocked", "elf_position": "untested", - "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, timeline, observations, viewer capture, and automatic capture review workflows are not executed by the runner, so capture breadth remains untested rather than an ELF win/loss.", - "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. 
That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" } ], "evidence": [ diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 9b39fd6a..d1ac86e5 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -393,6 +393,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; set_json_pointer(adapter, "/scenarios/0/elf_position", serde_json::json!("loses"))?; + set_json_pointer(adapter, "/scenarios/0/comparison_outcome", serde_json::json!("loss"))?; let temp_dir = env::temp_dir().join(format!("elf-real-world-loss-manifest-test-{}", process::id())); @@ -429,7 +430,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(11) + Some(16) ); assert_eq!( report @@ -462,7 +463,9 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), - Some("real-world-memory-project-adapters-2026-06-11-openmemory-ui-export") + Some( + "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store" + ) ); assert_eq!( report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), @@ -500,6 +503,12 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), Some(11) ); + + assert_external_adapter_manifest_status_summary(report); + assert_external_adapter_manifest_scenario_summary(report); +} + +fn 
assert_external_adapter_manifest_status_summary(report: &Value) { assert_eq!( report .pointer("/external_adapters/summary/overall_status_counts/pass") @@ -552,7 +561,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(16) + Some(18) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/pass") + .and_then(Value::as_u64), + Some(24) ); assert_eq!( report @@ -560,8 +575,12 @@ fn assert_external_adapter_manifest_summary(report: &Value) { .and_then(Value::as_u64), Some(0) ); - - assert_external_adapter_manifest_scenario_summary(report); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(38) + ); } fn assert_external_adapter_manifest_scenario_summary(report: &Value) { @@ -587,7 +606,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(3) + Some(6) ); assert_eq!( report @@ -599,7 +618,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") .and_then(Value::as_u64), - Some(4) + Some(5) ); assert_eq!( report @@ -611,19 +630,19 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/pass") .and_then(Value::as_u64), - Some(17) + Some(20) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(3) + Some(2) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/wins") .and_then(Value::as_u64), - Some(8) + Some(9) ); assert_eq!( report @@ -641,13 +660,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report 
.pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(12) + Some(17) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/win") .and_then(Value::as_u64), - Some(8) + Some(9) ); assert_eq!( report @@ -671,13 +690,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(2) + Some(6) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") .and_then(Value::as_u64), - Some(2) + Some(3) ); } @@ -964,6 +983,13 @@ fn assert_first_generation_adapter_records( memsearch: &Value, claude_mem: &Value, ) { + assert_agentmemory_first_generation_records(agentmemory); + assert_mem0_first_generation_records(mem0); + assert_memsearch_first_generation_records(memsearch); + assert_claude_mem_first_generation_records(claude_mem); +} + +fn assert_agentmemory_first_generation_records(agentmemory: &Value) { assert_eq!( agentmemory.pointer("/scenarios/1/status").and_then(Value::as_str), Some("lifecycle_fail") @@ -973,6 +999,9 @@ fn assert_first_generation_adapter_records( Some("wins") ); assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); +} + +fn assert_mem0_first_generation_records(mem0: &Value) { assert_eq!( mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), Some("local_lifecycle_update_delete_reload") @@ -1027,6 +1056,9 @@ fn assert_first_generation_adapter_records( mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), Some("non_goal") ); +} + +fn assert_memsearch_first_generation_records(memsearch: &Value) { assert_eq!( memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), Some("reindex_update_delete_reload") @@ -1040,28 +1072,83 @@ fn assert_first_generation_adapter_records( 
memsearch.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("untested") ); + assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert!(memsearch.pointer("/suites/0/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed source-of-truth prompt coverage") + && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet.") + )); + assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("pass")); + assert!(memsearch.pointer("/suites/1/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed retrieval-debug prompt coverage") + && evidence.contains( + "No live memsearch runtime adapter executes retrieval prompt scoring yet." + ) + )); + assert_eq!(memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memsearch.pointer("/scenarios/1/elf_position").and_then(Value::as_str), + Some("untested") + ); assert_eq!( - memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), + memsearch.pointer("/scenarios/3/status").and_then(Value::as_str), Some("unsupported") ); assert_eq!( - memsearch.pointer("/scenarios/1/elf_position").and_then(Value::as_str), - Some("untested") + memsearch.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("markdown_source_store_prompt_jobs") ); + assert_eq!(memsearch.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); +} + +fn assert_claude_mem_first_generation_records(claude_mem: &Value) { assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real")); assert_eq!( claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str), Some("repository_progressive_disclosure") ); + assert_eq!(claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); assert_eq!( - claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), 
- Some("not_encoded") + claude_mem.pointer("/capabilities/6/status").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(claude_mem.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(claude_mem.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("fixture-backed progressive-disclosure") + && evidence.contains("viewer/operator workflow remains blocked")) + ); + assert_eq!(claude_mem.pointer("/suites/2/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/2/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("hook capture remains blocked")) ); assert_eq!( claude_mem.pointer("/scenarios/0/status").and_then(Value::as_str), Some("wrong_result") ); - assert_eq!(claude_mem.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + claude_mem.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("retrieval_repair_artifact_path") + ); + assert_eq!( + claude_mem.pointer("/scenarios/1/status").and_then(Value::as_str), + Some("wrong_result") + ); + assert!( + claude_mem + .pointer("/scenarios/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("rerun/inspection targets") + && evidence.contains("tmp/live-baseline/claude-mem-checks.json")) + ); + assert_eq!(claude_mem.pointer("/scenarios/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/4/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); } fn assert_graphiti_zep_adapter(adapter: &Value) { @@ -1901,6 +1988,8 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { competitor_matrix .contains("broader live suites remain `wrong_result`, 
`blocked`, or `not_encoded`") ); + assert!(competitor_matrix.contains("claude-mem work_resume remains `not_encoded`")); + assert!(!competitor_matrix.contains("claude-mem `wrong_result`, OpenViking work_resume")); assert!(external_manifest.contains( "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible." )); @@ -2195,15 +2284,20 @@ fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { let projects = array_at(matrix, "/project_matrix")?; - let qmd = find_by_field(projects, "/project", "qmd")?; - let mem0 = find_by_field(projects, "/project", "mem0/OpenMemory")?; - let openviking = find_by_field(projects, "/project", "OpenViking")?; let scenarios = array_at(matrix, "/scenario_matrix")?; - let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; - let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; - let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; assert_competitor_strength_matrix_manifest_counts(matrix); + assert_competitor_strength_matrix_project_json(projects)?; + assert_competitor_strength_matrix_scenario_json(scenarios)?; + + Ok(()) +} + +fn assert_competitor_strength_matrix_project_json(projects: &[Value]) -> Result<()> { + let qmd = find_by_field(projects, "/project", "qmd")?; + let mem0 = find_by_field(projects, "/project", "mem0/OpenMemory")?; + let claude_mem = find_by_field(projects, "/project", "claude-mem")?; + let openviking = find_by_field(projects, "/project", "OpenViking")?; assert_eq!( qmd.pointer("/current_evidence_class").and_then(Value::as_str), @@ -2237,6 +2331,13 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { .and_then(Value::as_str) .is_some_and(|claim| claim.contains("OpenMemory product app import/export")) ); + assert!( + claude_mem + 
.pointer("/unsupported_or_blocked_status/details") + .and_then(Value::as_str) + .is_some_and(|details| details.contains("rerun/inspection targets") + && details.contains("tmp/live-baseline/claude-mem-checks.json")) + ); assert_eq!( openviking.pointer("/current_evidence_class").and_then(Value::as_str), Some("live_baseline_only") @@ -2261,6 +2362,16 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { .and_then(Value::as_str) .is_some_and(|claim| claim.contains("evidence-bearing same-corpus output pass")) ); + + Ok(()) +} + +fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Result<()> { + let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; + let work_resume = find_by_field(scenarios, "/scenario_id", "work_resume")?; + let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; + let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; + assert!( retrieval_debug .pointer("/current_state") @@ -2270,6 +2381,13 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { assert!(retrieval_debug.pointer("/current_state").and_then(Value::as_str).is_some_and( |state| state.contains("qmd remains stronger on local debug ergonomics not fully scored") )); + assert!( + work_resume + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("claude-mem work_resume remains not_encoded") + && !claim.contains("claude-mem is wrong_result")) + ); assert!( operator_debug .pointer("/current_elf_evidence") @@ -2792,9 +2910,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=8, ties=9, loses=1, untested=12`")); + 
assert!(markdown.contains("ELF scenario positions: `wins=9, ties=9, loses=1, untested=17`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=8, tie=9, loss=1, not_tested=8, blocked=2, non_goal=2`" + "Scenario comparison outcomes: `win=9, tie=9, loss=1, not_tested=8, blocked=6, non_goal=3`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -2818,6 +2936,7 @@ fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; set_json_pointer(adapter, "/scenarios/0/elf_position", serde_json::json!("loses"))?; + set_json_pointer(adapter, "/scenarios/0/comparison_outcome", serde_json::json!("loss"))?; set_json_pointer( &mut report, "/external_adapters/summary/scenario_position_counts", diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 000e7dd1..07ef05ad 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -9,7 +9,8 @@ Inputs: `2026-06-11-measurement-coverage-audit.md`, `2026-06-11-qmd-openviking-strength-profile-report.md`, `2026-06-11-temporal-history-competitor-gap-report.md`, `2026-06-11-graph-rag-scored-smoke-adapter-report.md`, -`2026-06-11-mem0-openmemory-history-ui-export-report.md`, and +`2026-06-11-mem0-openmemory-history-ui-export-report.md`, +`2026-06-11-first-generation-oss-continuity-source-store-report.md`, and `2026-06-10-production-adoption-refresh.md`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md` and the current external adapter manifest. 
@@ -47,10 +48,14 @@ The remaining caveats are material: ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory - UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 - adds an ELF live capture/write-policy self-check, but agentmemory capture breadth - is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains - untested. + UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-925 + now adds fixture-backed first-generation OSS prompt coverage and typed blockers for + agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and + claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator + surfaces; those rows still do not create live external real-world suite passes. + XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture + breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture + remains blocked until Docker-contained hook/viewer evidence exists. ## Evidence Classes @@ -80,6 +85,7 @@ results, or lifecycle failures into one aggregate leaderboard. | `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth. | | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. 
| | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | +| `cargo make real-world-first-generation-oss` | `2026-06-11-first-generation-oss-continuity-source-store-report.md` | First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes. | | `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | | `cargo make graphify-docker-graph-report-smoke` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | @@ -91,15 +97,15 @@ results, or lifecycle failures into one aggregate leaderboard. 
| Scenario | ELF outcome | Evidence classes | Measured claim | Follow-up | | --- | --- | --- | --- | --- | | Source-of-truth rebuild and evidence-bound writes | `win` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust-source jobs pass, and production restore/rebuild proof exists. | None | -| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs; agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded. | XY-925, XY-928 | +| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs. XY-925 selects agentmemory's next durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem and OpenViking continuity strengths remain blocked or not encoded. | XY-928 | | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. | XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. 
| XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | -| Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. OpenMemory UI/export remains blocked and claude-mem UI remains not encoded, so this is not a broad viewer-product superiority claim. | XY-926 | -| Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory comparison is `blocked`; claude-mem capture breadth is `not_encoded`, so no broad capture-hook superiority claim is allowed. 
| XY-933, XY-925 | +| Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 | +| Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | | Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. 
| XY-927 | @@ -114,9 +120,9 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-905 | P0 | Backlog | Live temporal reconciliation answer and trace contract. | | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | -| XY-925 | P1 | Backlog | First-generation OSS continuity and source-store adapters. | +| XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. | | XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. | -| XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked or untested. | +| XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | | XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | | XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | | XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. 
| diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index c2cdc983..4fb3b15e 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -77,9 +77,9 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass; the narrow operator-debug slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | | agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start, capture-hook persistence, and real-world adapter coverage are missing; current Docker baseline uses a process-local StateKV Map and in-memory index. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. 
| | mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. | Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | -| memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. | `not_encoded`: real-world source-of-truth, retrieval, and memory-evolution prompt adapters are not encoded; TTL/expiry is unsupported by the current CLI path. | Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | +| memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`; XY-925 `fixture_backed`. 
| `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. XY-925 adds fixture-backed source-store and retrieval-debug prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. | `not_encoded`: no live memsearch runtime adapter executes real-world prompt scoring; memory-evolution prompt adapters remain not encoded; TTL/expiry is unsupported by the current CLI path. | Promote the fixture-backed source-store and retrieval-debug prompts into a live memsearch real-world adapter before any suite-level win/loss claim; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | | OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `fixture_backed` and `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`; `blocked`: checked-in `context_trajectory` fixtures cover staged retrieval, hierarchy selection, and recursive/context expansion gates. | `blocked`: hierarchical context trajectory is encoded but blocked until same-corpus evidence ids match and staged artifacts are materialized. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | -| claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. 
| `not_encoded`: progressive-disclosure and hook/viewer capture real-world jobs are not encoded. | Durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | +| claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`; XY-925 `fixture_backed`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. | `blocked`: hook capture and viewer/operator workflows still lack a Docker-contained runner; retrieval remains `wrong_result`, and the repair prompt lists rerun/inspection targets `tmp/live-baseline/claude-mem.log` and `tmp/live-baseline/claude-mem-checks.json`. | Promote durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure prompts into a live claude-mem adapter before any broader UX claim. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | | RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. | `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | | LightRAG | Lightweight graph/RAG context export with source file-path citation shape. 
| `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | | GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | @@ -96,14 +96,14 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Scenario | Current ELF evidence | Strongest competitor/reference | Current competitor evidence | Next measurement before claim | | --- | --- | --- | --- | --- | | Retrieval/debug | Fixture retrieval passes; live retrieval passes. | qmd. | qmd live retrieval passes and live baseline passes, but full-suite live status is `wrong_result`. | Run qmd deep profile and ELF/qmd trace-level replay with expansion, fusion, rerank, and candidate-drop diagnostics. | -| Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`, claude-mem `wrong_result`, OpenViking work_resume `not_encoded`. | Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | +| Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`; claude-mem work_resume remains `not_encoded` pending a durable repository-backed adapter; OpenViking work_resume is `not_encoded`. 
| Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | | Project decisions | Fixture and live project_decisions pass. | qmd, Letta. | qmd live project_decisions pass; Letta is `research_gate` `not_encoded`. | Add Letta core/archival decision jobs only after a contained export path exists. | -| Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke now passes, but source-of-truth real_world_job prompts are `not_encoded`. | Score memsearch source-of-truth rebuild/reload jobs before any suite-level win/loss claim. | +| Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed. | Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim. | | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. 
| | Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. | llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | -| Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence; claude-mem and OpenMemory UX remain `not_encoded` or blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | -| Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory capture is `blocked` by mocked/in-memory storage; claude-mem hook/viewer capture is `not_encoded`. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | +| Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence. 
XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | +| Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory and claude-mem hook capture remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and staged/hierarchy/recursive trajectory jobs are encoded as `blocked`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | @@ -121,9 +121,9 @@ now explicit: | agentmemory durable lifecycle adapter | `[ELF benchmark P0] Make external adapters lifecycle-durable and fail-typed` | yes | Durable local adapter path selection. 
| Update, delete, cold-start reload, work_resume, and capture/write-policy jobs. | | agentmemory/claude-mem capture-hook breadth | Follow-up after XY-933 | yes | Docker-contained hook/viewer capture path with durable artifacts. | Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting. | | mem0/OpenMemory history and UI coverage | New adapter repair issue | yes | Comparable local OSS path for history/UI/readback evidence. | Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs. | -| memsearch source-of-truth real-world coverage | New adapter repair issue | yes | Real-world prompt adapter over the canonical Markdown store. | Source-of-truth rebuild/reload jobs and retrieval-debug jobs that preserve baseline reindex/update/delete evidence without converting it into suite pass claims. | +| memsearch source-of-truth live adapter coverage | New adapter repair issue | yes | Fixture-backed source-store and retrieval-debug prompts are encoded by XY-925; live prompt execution remains missing. | Runtime adapter execution for the existing source-of-truth rebuild/reload and retrieval-debug prompt jobs without converting baseline smoke into suite pass claims. | | OpenViking context trajectory | XY-928 encoded blocked fixtures | yes | Evidence-bearing same-corpus retrieval output and staged artifacts. | Hierarchical expansion, staged trajectory, recursive/context expansion, and comparable ELF trace/session evidence jobs. | -| claude-mem progressive disclosure | New adapter issue | yes | Durable repository path and progressive-disclosure output contract. | Work resume, operator debugging, capture/write-policy, and progressive disclosure jobs. | +| claude-mem hook/viewer runtime coverage | New adapter issue | yes | Fixture-backed progressive-disclosure and retrieval-repair prompts are encoded by XY-925; hook capture and viewer/operator workflows remain blocked. 
| Work resume, operator debugging, capture/write-policy, viewer/operator, and live progressive-disclosure adapter execution. | | RAGFlow evidence smoke | XY-885 | yes | Resource envelope accepted for tiny Docker smoke. | `reference.chunks` to benchmark evidence mapping. | | LightRAG context export | XY-886 | yes | Docker service setup and explicit provider config. | Retrieved context export and source file-path citations. | | GraphRAG cost-bounded adapter | XY-887 | yes | Tiny corpus cost/resource envelope. | Document, text-unit, graph-summary, and citation output tables. | diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md new file mode 100644 index 00000000..1484abcf --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md @@ -0,0 +1,99 @@ +# First-Generation OSS Continuity and Source-Store Report - June 11, 2026 + +Goal: Expand first-generation OSS adapter coverage for durable continuity, +canonical source-store, retrieval-debug, progressive-disclosure, hook capture, and +viewer/operator surfaces without promoting smoke evidence into real-world suite pass +evidence. +Read this when: You need the XY-925 result for agentmemory, memsearch, and +claude-mem after the XY-898 first-generation adapter promotion. +Inputs: `cargo make real-world-first-generation-oss`, the external adapter manifest, +and the June 11 first-generation OSS adapter promotion report. +Outputs: Fixture-backed prompt coverage, scenario-level comparison outcomes, typed +blockers, and updated claim boundaries. + +## Scope Boundary + +This is benchmark/report coverage only. It does not change ELF retrieval behavior, +external project code, or baseline adapter runtime behavior. 
+ +The new first-generation fixture slice lives outside +`apps/elf-eval/fixtures/real_world_memory/`, so it is not counted in the aggregate ELF +real-world suite. The slice exists to encode comparable prompt shapes and blockers for +external OSS adapter surfaces while the external adapter manifest keeps evidence +classes explicit. + +## Fresh Run + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-first-generation-oss` | pass | `tmp/real-world-memory/first-generation-oss/report.json` | + +Generated report summary: + +| Metric | Value | +| --- | ---: | +| Jobs | 6 | +| Encoded suites | 4 | +| Pass | 4 | +| Blocked | 2 | +| Evidence coverage | 12/12 | +| Source-ref coverage | 12/12 | +| Quote coverage | 12/12 | +| Operator-debug jobs | 2 | +| Raw SQL needed | 0 | + +External adapter manifest scenario outcomes now preserve every normalized outcome: + +| Outcome | Count | +| --- | ---: | +| win | 9 | +| tie | 8 | +| loss | 1 | +| not_tested | 8 | +| blocked | 6 | +| non_goal | 3 | + +## Scenario Additions + +| Project | Scenario | Status | Outcome | Evidence | +| --- | --- | --- | --- | --- | +| agentmemory | `durable_work_resume_local_path` | `blocked` | `blocked` | The selected comparable path is a Docker-local session directory that persists the SDK KV/index and observation log across a fresh process. | +| agentmemory | `capture_write_policy_hooks` | `blocked` | `blocked` | Live hook observations and write-policy audit evidence are required before scoring capture/write-policy jobs. | +| memsearch | `markdown_source_store_rebuild_reload_prompt` | `pass` | `not_tested` | The prompt fixture covers canonical Markdown as source of truth and `memsearch index` as derived rebuild/reload behavior. | +| memsearch | `markdown_retrieval_debug_prompt` | `pass` | `not_tested` | The prompt fixture covers CLI replay plus Markdown source inspection while keeping staged trace bundles not encoded.
| +| claude-mem | `retrieval_repair_artifact_path` | `wrong_result` | `win` | The repair prompt preserves the same-corpus retrieval miss and names rerun/inspection targets `tmp/live-baseline/claude-mem.log` and `tmp/live-baseline/claude-mem-checks.json`. | +| claude-mem | `progressive_disclosure_prompt` | `pass` | `not_tested` | The prompt fixture covers repository search-to-detail/source hydration on durable SQLite. | +| claude-mem | `hook_capture_viewer_workflow` | `blocked` | `blocked` | The current Docker baseline uses repository classes only and does not execute hooks, timeline capture, or viewer workflows. | +| claude-mem | `viewer_operator_workflow` | `blocked` | `blocked` | A fair viewer/operator comparison needs Docker-contained readback over the same durable SQLite corpus. | + +## Claim Boundaries + +Allowed: + +- agentmemory has a selected durable local path for future work-resume and + capture/write-policy scoring. +- memsearch now has checked-in source-store and retrieval-debug prompt coverage over + the canonical Markdown store. +- claude-mem has checked-in progressive-disclosure and retrieval-repair prompt + coverage for the Docker-contained repository path. +- claude-mem hook capture and viewer/operator workflows remain typed blockers. + +Not allowed: + +- Do not claim agentmemory durable continuity from the in-memory same-corpus smoke. +- Do not claim memsearch full real-world suite parity from Markdown reindex/reload + smoke or fixture-backed prompt coverage. +- Do not claim claude-mem retrieval passed; same-corpus retrieval remains + `wrong_result`. +- Do not claim claude-mem hooks or viewer workflows pass from repository + class-level hydration evidence. + +## Touched Artifacts + +- `Makefile.toml`: adds `cargo make real-world-first-generation-oss`. +- `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/`: + checked-in prompt and blocker fixtures. 
+- `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`: + updated scenario rows and explicit `comparison_outcome` values. +- `docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json`: + machine-readable companion report. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index efd546a1..0974dcb6 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -165,9 +165,9 @@ records `unique_project_names: 17` for the full project list including ELF. | qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | | agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | | mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. 
| -| memsearch | `live_baseline_only` | Basic canonical Markdown reindex/reload smoke now passes; real-world prompt coverage remains `not_encoded`. | Markdown canonical store and local reindex clarity. | Source-of-truth and retrieval-debug real-world adapter report. | +| memsearch | `live_baseline_only`; XY-925 `fixture_backed` | Basic canonical Markdown reindex/reload smoke passes, and XY-925 adds fixture-backed source-store and retrieval-debug prompts without claiming a live memsearch adapter pass. | Markdown canonical store and local reindex clarity. | Runtime source-of-truth and retrieval-debug adapter execution over the existing prompt jobs. | | OpenViking | `live_baseline_only` plus `fixture_backed` and `research_gate` | Same-corpus retrieval is `wrong_result`; staged retrieval, hierarchy selection, and recursive/context expansion are encoded as blocked fixtures. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then materialized staged trajectory report. | -| claude-mem | `live_baseline_only` | `wrong_result`; capture breadth is `not_encoded` because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. | +| claude-mem | `live_baseline_only`; XY-925 `fixture_backed` | Same-corpus retrieval remains `wrong_result`; XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts, with hook capture and viewer/operator workflows still blocked. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, capture/write-policy, and viewer/operator runtime report. | | RAGFlow | `research_gate` | `blocked`. | RAG app workflow with document/chunk references. | Tiny Docker evidence-smoke with `reference.chunks` mapped to evidence ids. | | LightRAG | `research_gate` | `blocked`. 
| Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | | GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 34fbe8b1..1668aa31 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -84,6 +84,11 @@ cleanup, use `docs/guide/single_user_production.md`. mem0/OpenMemory, memsearch, and claude-mem with fresh scenario-level baseline evidence and ELF win/tie/loss/untested positions without converting baseline-only evidence into real-world suite wins. +- `2026-06-11-first-generation-oss-continuity-source-store-report.md`: XY-925 + follow-up report that adds first-generation OSS fixture-backed prompt coverage and + typed blockers for agentmemory durable continuity, memsearch canonical Markdown + source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, + hook, and viewer/operator surfaces. 
- `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: XY-900 graph/RAG scored-smoke adapter report that promotes RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify smoke contracts into scored or typed non-pass diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 5426b5cb..689132a6 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." 
+ "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and Letta core-vs-archival memory plus graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." ] }, "evidence_class_terms": [ @@ -61,6 +61,11 @@ "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", "claim": "mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result on same-corpus retrieval."
}, + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "claim": "First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes." + }, { "command": "cargo make openmemory-ui-export-readback", "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", @@ -103,7 +108,7 @@ "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" ], "follow_up_issues": [], - "caveat": "memsearch canonical Markdown reindex/reload is a useful ergonomics reference, but real-world source-of-truth prompts are not encoded." + "caveat": "XY-925 encodes fixture-backed memsearch canonical Markdown source-store prompts, but no live memsearch real_world_job runtime adapter pass is claimed." }, { "scenario_id": "work_resume_coding_agent_continuity", @@ -116,13 +121,13 @@ "blocked", "not_encoded" ], - "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded.", + "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. 
XY-925 selects agentmemory's durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem and OpenViking continuity strengths remain blocked or not encoded.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md" + "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ - "XY-925", "XY-928" ], "caveat": "The tie is only for encoded live work_resume behavior, not for broad capture hooks or staged context." @@ -256,17 +261,18 @@ "blocked", "not_encoded" ], - "measured_claim": "ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. OpenMemory UI/export remains blocked and claude-mem UI remains not encoded, so this is not a broad viewer-product superiority claim.", + "measured_claim": "ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. 
XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim.", "command_artifacts": [ "tmp/real-world-job/operator-ux-live-adapters/summary.json", "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", - "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ "XY-926" ], - "caveat": "The live slice compares ELF and qmd only; OpenMemory UI/export and claude-mem viewer workflows remain typed blocked or not_encoded until a bounded local runner exists." + "caveat": "The live slice compares ELF and qmd only; OpenMemory UI/export and claude-mem viewer workflows remain typed blocked until a bounded local runner exists." }, { "scenario_id": "capture_write_policy_redaction", @@ -279,15 +285,17 @@ "blocked", "not_encoded" ], - "measured_claim": "ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains not_encoded; agentmemory comparison is blocked by mocked/in-memory storage; claude-mem capture breadth is not_encoded because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs.", + "measured_claim": "ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. 
qmd remains not_encoded; XY-925 records agentmemory and claude-mem hook capture as typed blockers until Docker-contained hook observations and write-policy/viewer readback artifacts exist.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md" + "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ "XY-933", - "XY-925" + "XY-925", + "XY-926" ], "caveat": "This is an ELF self-check and qmd not_encoded delta, not a broad capture-breadth win over agentmemory or claude-mem." }, @@ -427,8 +435,8 @@ { "issue": "XY-925", "priority": "P1", - "state": "Backlog", - "gap": "First-generation OSS continuity and source-store adapters." + "state": "Fixture slice encoded; runtime paths still blocked", + "gap": "First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters." 
}, { "issue": "XY-926", diff --git a/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json new file mode 100644 index 00000000..f69909b6 --- /dev/null +++ b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json @@ -0,0 +1,140 @@ +{ + "schema": "elf.first_generation_oss_continuity_source_store_report/v1", + "report_id": "xy-925-first-generation-oss-continuity-source-store-2026-06-11", + "authority": "XY-925", + "created_at": "2026-06-11T00:00:00Z", + "scope": "Fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory, memsearch, and claude-mem without promoting smoke evidence into real-world suite pass evidence.", + "validation": { + "command": "cargo make real-world-first-generation-oss", + "status": "pass", + "json_artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "markdown_artifact": "tmp/real-world-memory/first-generation-oss/report.md", + "summary": { + "job_count": 6, + "encoded_suite_count": 4, + "pass": 4, + "blocked": 2, + "evidence_coverage": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0, + "operator_debug_job_count": 2, + "raw_sql_needed_count": 0 + } + }, + "manifest": { + "path": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "scenario_outcome_counts": { + "win": 9, + "tie": 8, + "loss": 1, + "not_tested": 8, + "blocked": 6, + "non_goal": 3 + }, + "scenario_status_counts": { + "unsupported": 2, + "blocked": 6, + "wrong_result": 5, + "lifecycle_fail": 1, + "pass": 19, + "not_encoded": 2 + } + }, + "scenario_judgments": [ + { + "project": "agentmemory", + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "The 
selected local path is a Docker-contained session directory that persists the SDK KV/index and observation log across a fresh process.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "project": "agentmemory", + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "Live agentmemory hook observations and persisted write-policy audit evidence are required before capture/write-policy scoring.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "project": "memsearch", + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers canonical Markdown files as source of truth and memsearch index as derived rebuild/reload behavior.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "project": "memsearch", + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers CLI replay, Markdown source inspection, and reindexing while keeping staged trace bundles not encoded.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "project": "claude-mem", + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": 
"retrieval", + "status": "wrong_result", + "comparison_outcome": "win", + "evidence": "The prompt fixture preserves claude-mem same-corpus retrieval as wrong_result and names rerun/inspection targets tmp/live-baseline/claude-mem.log plus tmp/live-baseline/claude-mem-checks.json.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "project": "claude-mem", + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers repository search-to-detail/source hydration on durable SQLite and separates it from hook/viewer claims.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "project": "claude-mem", + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "The current Docker baseline uses repository classes only and does not execute hooks, timeline capture, or viewer workflows.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "project": "claude-mem", + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "A fair viewer/operator comparison needs Docker-contained readback over the same durable SQLite corpus.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "claim_boundaries": 
{ + "allowed": [ + "agentmemory has a selected durable local path for future work-resume and capture/write-policy scoring.", + "memsearch has checked-in source-store and retrieval-debug prompt coverage over the canonical Markdown store.", + "claude-mem has checked-in progressive-disclosure and retrieval-repair prompt coverage for the Docker-contained repository path.", + "claude-mem hook capture and viewer/operator workflows remain typed blockers." + ], + "not_allowed": [ + "Do not claim agentmemory durable continuity from the in-memory same-corpus smoke.", + "Do not claim memsearch full real-world suite parity from Markdown reindex/reload smoke or fixture-backed prompt coverage.", + "Do not claim claude-mem retrieval passed; same-corpus retrieval remains wrong_result.", + "Do not claim claude-mem hooks or viewer workflows pass from repository class-level hydration evidence." + ] + } +} diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index b2760325..82ac877e 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -167,19 +167,20 @@ "strongest_user_facing_scenario": "Markdown-first canonical store with rebuildable local index and practical hybrid retrieval.", "current_evidence_class": "live_baseline_only", "supporting_evidence_classes": [ - "live_baseline_only" + "live_baseline_only", + "fixture_backed" ], "measured_status": "pass", "proof": { - "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", - "artifact": "tmp/live-baseline/live-baseline-report.json" + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker; cargo make real-world-first-generation-oss", + "artifact": "tmp/live-baseline/live-baseline-report.json; tmp/real-world-memory/first-generation-oss/report.json" 
}, "unsupported_or_blocked_status": { "state": "not_encoded", - "typed_reason": "source_of_truth_and_reindex_real_world_jobs_not_encoded", - "details": "Basic canonical Markdown same-corpus/reindex/update/delete/reload smoke now passes, but source-of-truth, retrieval-debug, and memory-evolution real-world prompt adapters are not encoded." + "typed_reason": "live_prompt_runtime_adapter_not_encoded", + "details": "Basic canonical Markdown same-corpus/reindex/update/delete/reload smoke passes, and XY-925 adds fixture-backed source-store and retrieval-debug prompts. No live memsearch runtime adapter executes prompt scoring yet; memory-evolution prompt adapters remain not encoded and TTL/expiry is unsupported by the current CLI path." }, - "benchmark_before_claim": "Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry unsupported unless a comparable path exists.", + "benchmark_before_claim": "Promote the fixture-backed source-store and retrieval-debug prompts into a live memsearch real-world adapter before any suite-level win/loss claim; keep TTL/expiry unsupported unless a comparable path exists.", "borrow_if_stronger": "Borrow the canonical markdown-store ergonomics, local reindex clarity, and user-inspectable source files." 
}, { @@ -209,19 +210,20 @@ "strongest_user_facing_scenario": "Progressive disclosure, automatic capture loop, repository-local lifecycle, and practical local viewer workflow.", "current_evidence_class": "live_baseline_only", "supporting_evidence_classes": [ - "live_baseline_only" + "live_baseline_only", + "fixture_backed" ], "measured_status": "wrong_result", "proof": { - "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", - "artifact": "tmp/live-baseline/live-baseline-report.json" + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker; cargo make real-world-first-generation-oss", + "artifact": "tmp/live-baseline/live-baseline-report.json; tmp/real-world-memory/first-generation-oss/report.json" }, "unsupported_or_blocked_status": { - "state": "not_encoded", - "typed_reason": "progressive_disclosure_and_capture_real_world_jobs_not_encoded", - "details": "Current Docker evidence is not a clean retrieval pass, and progressive-disclosure plus hook/viewer capture jobs are not encoded." + "state": "blocked", + "typed_reason": "hook_viewer_runtime_paths_blocked", + "details": "Same-corpus retrieval remains wrong_result; XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts. Hook capture and viewer/operator workflows still lack a Docker-contained runner, and the repair prompt lists rerun/inspection targets tmp/live-baseline/claude-mem.log plus tmp/live-baseline/claude-mem-checks.json." }, - "benchmark_before_claim": "Add durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs.", + "benchmark_before_claim": "Promote durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, viewer/operator, and progressive-disclosure prompts into a live claude-mem adapter before any broader UX claim.", "borrow_if_stronger": "Borrow progressive disclosure, automatic capture review loops, and local viewer/operator comfort." 
}, { @@ -440,7 +442,7 @@ "scenario": "work resume", "current_elf_evidence": "ELF fixture-backed work_resume passes and ELF live_real_world work_resume passes.", "strongest_competitor_or_reference": "agentmemory, claude-mem, OpenViking", - "current_competitor_evidence": "agentmemory is live_baseline_only with lifecycle_fail; claude-mem is wrong_result; OpenViking work_resume is not_encoded.", + "current_competitor_evidence": "agentmemory is live_baseline_only with lifecycle_fail; claude-mem work_resume remains not_encoded pending a durable repository-backed adapter; OpenViking work_resume is not_encoded.", "current_state": "ELF and qmd have current encoded live pass evidence, but continuity-oriented competitors remain undermeasured.", "next_measurement": "Encode durable agentmemory, claude-mem, and OpenViking work_resume adapters or declare each blocked with lifecycle/setup evidence." }, @@ -458,9 +460,9 @@ "scenario": "source-of-truth", "current_elf_evidence": "ELF fixture-backed trust_source_of_truth passes and ELF live_real_world trust_source_of_truth passes.", "strongest_competitor_or_reference": "memsearch", - "current_competitor_evidence": "memsearch has live_baseline_only canonical store evidence and now passes same-corpus retrieval, reindex/update/delete, and cold-start reload smoke, but trust_source_of_truth real-world prompts are not_encoded.", - "current_state": "ELF has stronger measured real-world source-of-truth evidence; memsearch now ties the local canonical-store reindex/reload smoke and remains a local-store ergonomics reference.", - "next_measurement": "Run memsearch source-of-truth rebuild and reload real_world_job prompts before any suite-level win/loss claim." 
+ "current_competitor_evidence": "memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed.", + "current_state": "ELF has stronger measured live real-world source-of-truth evidence; memsearch now ties the local canonical-store reindex/reload smoke and has fixture-backed prompt coverage as a local-store ergonomics reference.", + "next_measurement": "Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim." }, { "scenario_id": "temporal_current_historical", @@ -494,8 +496,8 @@ "scenario": "operator debugging", "current_elf_evidence": "ELF fixture-backed operator_debugging_ux passes, and the narrow live_real_world operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity.", "strongest_competitor_or_reference": "qmd, claude-mem, OpenMemory", - "current_competitor_evidence": "qmd now has a narrow live_real_world operator-debug slice: replay-command availability and repair-action clarity pass, but trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence are wrong_result. claude-mem and OpenMemory UX remain not_encoded or blocked.", - "current_state": "ELF has a narrow comparable live win over qmd for trace hydration and candidate-drop visibility, while OpenMemory and claude-mem UI workflows remain unmeasured.", + "current_competitor_evidence": "qmd now has a narrow live_real_world operator-debug slice: replay-command availability and repair-action clarity pass, but trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence are wrong_result. 
XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked.", + "current_state": "ELF has a narrow comparable live win over qmd for trace hydration and candidate-drop visibility, while OpenMemory and claude-mem viewer/operator workflows remain blocked for broad UX claims.", "next_measurement": "Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim." }, { @@ -503,8 +505,8 @@ "scenario": "capture/write policy", "current_elf_evidence": "ELF fixture-backed capture_integration passes, and ELF live_real_world capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding.", "strongest_competitor_or_reference": "agentmemory, claude-mem", - "current_competitor_evidence": "agentmemory capture_integration is blocked by mocked/in-memory storage and claude-mem hook/viewer capture is not_encoded.", - "current_state": "ELF has live capture/write-policy self-check evidence, but agentmemory and claude-mem capture-breadth comparisons remain blocked or untested.", + "current_competitor_evidence": "agentmemory and claude-mem hook capture remain blocked until Docker-contained hook observations and write-policy/viewer readback artifacts exist.", + "current_state": "ELF has live capture/write-policy self-check evidence, but agentmemory and claude-mem capture-breadth comparisons remain blocked.", "next_measurement": "Run durable agentmemory and claude-mem capture-hook jobs that prove redaction, exclusion, evidence binding, source ids, and no secret leakage." }, { @@ -583,11 +585,11 @@ "measurement": "Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs." 
}, { - "workstream": "memsearch source-of-truth real-world coverage", + "workstream": "memsearch source-of-truth live adapter coverage", "issue_or_candidate": "new adapter repair issue", "parallelizable": true, - "blocked_by": "Real-world prompt adapter over the canonical Markdown store.", - "measurement": "Source-of-truth rebuild/reload jobs and retrieval-debug jobs that preserve baseline reindex/update/delete evidence without converting it into suite pass claims." + "blocked_by": "Fixture-backed source-store and retrieval-debug prompts are encoded by XY-925; live prompt execution remains missing.", + "measurement": "Runtime adapter execution for the existing source-of-truth rebuild/reload and retrieval-debug prompt jobs without converting baseline smoke into suite pass claims." }, { "workstream": "OpenViking context trajectory", @@ -597,11 +599,11 @@ "measurement": "Hierarchical expansion, staged trajectory, and resume/retrieval evidence jobs." }, { - "workstream": "claude-mem progressive disclosure", + "workstream": "claude-mem hook/viewer runtime coverage", "issue_or_candidate": "new adapter issue", "parallelizable": true, - "blocked_by": "Durable repository path and progressive-disclosure output contract.", - "measurement": "Work resume, operator debugging, capture/write-policy, and progressive disclosure jobs." + "blocked_by": "Fixture-backed progressive-disclosure and retrieval-repair prompts are encoded by XY-925; hook capture and viewer/operator workflows remain blocked.", + "measurement": "Work resume, operator debugging, capture/write-policy, viewer/operator, and live progressive-disclosure adapter execution." 
}, { "workstream": "RAGFlow evidence smoke", From 1b893f6ab2be291834989075276195080df45c5d Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 01:29:07 +0800 Subject: [PATCH 2/7] {"schema":"decodex/commit/1","summary":"Align first-generation OSS benchmark assertions","authority":"XY-925"} --- apps/elf-eval/tests/real_world_job_benchmark.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index d1ac86e5..46b4a2e1 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -1465,8 +1465,9 @@ fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() assert!(manifest.contains("\"scenario_id\": \"capture_write_policy_hooks\"")); assert!(manifest.contains("\"comparison_outcome\": \"blocked\"")); assert!(manifest.contains("Four redaction, exclusion, source-id, evidence-binding")); - assert!(manifest.contains("no durable local session/capture path stores source ids")); - assert!(manifest.contains("hooks, timeline, observations, viewer capture")); + assert!(manifest.contains("durable upstream agentmemory session/capture path")); + assert!(manifest.contains("Docker-contained session directory")); + assert!(manifest.contains("claude-mem hooks, viewer, timeline, and observation workflows")); Ok(()) } From 38ded160ac97bf40cb4b53e425f891716b51e37a Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 01:41:41 +0800 Subject: [PATCH 3/7] {"schema":"decodex/commit/1","summary":"Align first-generation OSS report counts","authority":"XY-925"} --- ...-11-first-generation-oss-continuity-source-store-report.md | 2 +- ...1-first-generation-oss-continuity-source-store-report.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md 
b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md index 1484abcf..80e944cc 100644 --- a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md +++ b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md @@ -47,7 +47,7 @@ External adapter manifest scenario outcomes now preserve every normalized outcom | Outcome | Count | | --- | ---: | | win | 9 | -| tie | 8 | +| tie | 9 | | loss | 1 | | not_tested | 8 | | blocked | 6 | diff --git a/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json index f69909b6..f5d38617 100644 --- a/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json +++ b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json @@ -26,7 +26,7 @@ "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", "scenario_outcome_counts": { "win": 9, - "tie": 8, + "tie": 9, "loss": 1, "not_tested": 8, "blocked": 6, @@ -37,7 +37,7 @@ "blocked": 6, "wrong_result": 5, "lifecycle_fail": 1, - "pass": 19, + "pass": 20, "not_encoded": 2 } }, From 6b742038426089ea8c61973f82ebd9966659e899 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 02:19:57 +0800 Subject: [PATCH 4/7] {"schema":"decodex/commit/1","summary":"Constrain first-generation suite evidence claims","authority":"XY-925"} --- .../memory_projects_manifest.json | 8 ++++---- apps/elf-eval/tests/real_world_job_benchmark.rs | 15 ++++++++------- ...6-06-11-competitor-strength-adoption-report.md | 2 +- ...06-11-competitor-strength-adoption-report.json | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json 
b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 33cbf264..61fbcf7f 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -1156,13 +1156,13 @@ "suites": [ { "suite_id": "trust_source_of_truth", - "status": "pass", - "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet." + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." }, { "suite_id": "retrieval", - "status": "pass", - "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet." + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." 
}, { "suite_id": "memory_evolution", diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 46b4a2e1..99aca745 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -567,7 +567,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/pass") .and_then(Value::as_u64), - Some(24) + Some(22) ); assert_eq!( report @@ -579,7 +579,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/not_encoded") .and_then(Value::as_u64), - Some(38) + Some(40) ); } @@ -1072,17 +1072,18 @@ fn assert_memsearch_first_generation_records(memsearch: &Value) { memsearch.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("untested") ); - assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); assert!(memsearch.pointer("/suites/0/evidence").and_then(Value::as_str).is_some_and( |evidence| evidence.contains("fixture-backed source-of-truth prompt coverage") - && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet.") + && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet") + && evidence.contains("not a suite pass") )); - assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); assert!(memsearch.pointer("/suites/1/evidence").and_then(Value::as_str).is_some_and( |evidence| evidence.contains("fixture-backed retrieval-debug prompt coverage") && evidence.contains( - "No live memsearch runtime adapter executes retrieval prompt scoring yet." 
- ) + "No live memsearch runtime adapter executes retrieval prompt scoring yet" + ) && evidence.contains("not a suite pass") )); assert_eq!(memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); assert_eq!( diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 07ef05ad..6a63a1e1 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -82,7 +82,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | | `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 43 jobs across 12 suites with 38 pass and 5 blocked production-ops or OpenViking context-trajectory measurement gates. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | -| `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth. | +| `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists. 
| | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | | `cargo make real-world-first-generation-oss` | `2026-06-11-first-generation-oss-continuity-source-store-report.md` | First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes. | diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 689132a6..cb69967b 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -49,7 +49,7 @@ { "command": "cargo make real-world-memory-live-adapters", "artifact": "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", - "claim": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth." 
+ "claim": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists." }, { "command": "cargo make real-world-job-operator-ux-live-adapters", From b5c18e80f648570fb34f01f53809d722bf4cf99e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 02:34:50 +0800 Subject: [PATCH 5/7] {"schema":"decodex/commit/1","summary":"Normalize first-generation evidence summaries","authority":"XY-925"} --- README.md | 3 +- .../tests/real_world_job_benchmark.rs | 52 ++++++++++++++++++- ...-06-11-capture-write-policy-live-report.md | 4 +- ...-11-competitor-strength-adoption-report.md | 2 +- ...-11-competitor-strength-evidence-matrix.md | 4 +- .../2026-06-11-measurement-coverage-audit.md | 8 +-- ...6-11-capture-write-policy-live-report.json | 6 +-- ...1-competitor-strength-adoption-report.json | 4 +- ...-11-xy-897-competitor-strength-matrix.json | 6 +-- 9 files changed, 69 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 11319c42..22df99ec 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,8 @@ provider-backed ELF evidence was required. source refs, write-policy redaction audit counts, evidence binding, and no secret leakage. qmd remains `not_encoded` for this suite. agentmemory capture comparison is blocked by mocked/in-memory storage, and claude-mem hook/viewer capture remains - untested, so no broad capture-breadth superiority claim is allowed. + blocked until Docker-contained hook/viewer capture evidence exists, so no broad + capture-breadth superiority claim is allowed. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 99aca745..792ffef4 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -1565,20 +1565,30 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result< assert!(agentmemory.pointer("/reason").and_then(Value::as_str).is_some_and(|reason| { reason.contains("process-local StateKV Map") && reason.contains("in-memory index") })); - assert_eq!(claude_mem.pointer("/position").and_then(Value::as_str), Some("untested")); + assert_eq!(claude_mem.pointer("/position").and_then(Value::as_str), Some("blocked")); assert!( claude_mem .pointer("/reason") .and_then(Value::as_str) - .is_some_and(|reason| reason.contains("hooks, timeline, observations")) + .is_some_and(|reason| reason.contains("hooks, timeline, observations") + && reason.contains("Docker-contained hook/viewer runner")) ); assert!(markdown.contains("ELF now has live capture/write-policy self-check evidence")); assert!(markdown.contains("not an ELF-over-qmd win")); + assert!(markdown.contains("| claude-mem capture/viewer flows | `blocked` |")); + assert!(!markdown.contains("claude-mem capture breadth is untested")); assert!(markdown.contains("runtime `source_ref` metadata returned by search")); assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); + let readme_normalized = readme.split_whitespace().collect::>().join(" "); + + assert!( + readme_normalized + .contains("claude-mem hook/viewer capture remains blocked until 
Docker-contained") + ); + Ok(()) } @@ -1985,6 +1995,7 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { ); assert_measurement_audit_adapter_status_counts(&measurement_audit); + assert_first_generation_current_summary_boundaries(&measurement_audit, &competitor_matrix); assert!( competitor_matrix @@ -2069,6 +2080,26 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { Ok(()) } +fn assert_first_generation_current_summary_boundaries( + measurement_audit: &str, + competitor_matrix: &str, +) { + assert!(measurement_audit.contains("claude-mem hook/viewer capture is `blocked`")); + assert!(!measurement_audit.contains("claude-mem hook/viewer capture remains untested")); + assert!(!measurement_audit.contains("blocked or untested")); + assert!(competitor_matrix.contains( + "Overall adapter-status counts: 4 `pass`,\n6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`." + )); + assert!(!competitor_matrix.contains("5 `blocked`, and 7 `not_encoded`")); + assert!( + competitor_matrix + .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") + ); + assert!( + !competitor_matrix.contains("mem0/OpenMemory and Letta personalization are `not_encoded`") + ); +} + #[test] fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> { let report = serde_json::from_str::(&fs::read_to_string( @@ -2408,6 +2439,23 @@ fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Resul .and_then(Value::as_str) .is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export")) ); + + let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?; + + assert!( + personalization + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim + .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") + && claim.contains("Letta personalization is research_gate not_encoded")) + ); 
+ assert!( + personalization + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("scoped personalization is a tie")) + ); assert!( context_trajectory .pointer("/current_state") diff --git a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md index cb6ff281..185ab65b 100644 --- a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md +++ b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md @@ -53,7 +53,7 @@ The ELF materialization artifact records: | --- | --- | --- | | qmd live real-world adapter | `untested` | ELF executes and passes 4/4 live capture jobs; qmd keeps the same jobs typed `not_encoded`, so this remains an ELF self-check rather than a qmd comparison result. | | agentmemory capture hooks | `blocked` | The current Docker baseline uses a process-local StateKV Map and in-memory index. No durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound output. | -| claude-mem capture/viewer flows | `untested` | The checked evidence exercises repository storage, lifecycle, progressive disclosure, and same-corpus retrieval only. Hooks, timeline, observations, viewer capture, and automatic capture review are not run against real-world jobs. | +| claude-mem capture/viewer flows | `blocked` | The checked evidence exercises repository storage, lifecycle, progressive disclosure, and same-corpus retrieval only. Hooks, timeline, observations, viewer capture, and automatic capture review need a Docker-contained hook/viewer runner before scoring. | ## Claims Allowed @@ -62,7 +62,7 @@ The ELF materialization artifact records: - qmd remains `not_encoded` for capture/write-policy jobs in the full live sweep. - agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a durable local capture artifact. 
-- claude-mem capture breadth is untested until a Docker-contained hook/viewer capture +- claude-mem capture breadth is blocked until a Docker-contained hook/viewer capture runner exists. ## Claims Not Allowed diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 6a63a1e1..4aa963e4 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -158,7 +158,7 @@ results, or lifecycle failures into one aggregate leaderboard. - Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice. - Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the - current comparison is blocked or untested for their hook/viewer capture paths. + current comparison is blocked for their hook/viewer capture paths. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not claim graph/RAG parity from smoke-only evidence. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 4fb3b15e..40c4c53a 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -46,7 +46,7 @@ Current boundary: The current manifest has 23 adapter records across 16 external projects plus ELF. Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 `live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, -6 `wrong_result`, 1 `lifecycle_fail`, 5 `blocked`, and 7 `not_encoded`. +6 `wrong_result`, 1 `lifecycle_fail`, 6 `blocked`, and 6 `not_encoded`. 
## State Taxonomy @@ -105,7 +105,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory and claude-mem hook capture remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | -| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. 
| Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | +| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory local OSS entity-scoped personalization now passes, so scoped preference behavior is a measured tie; OpenMemory UI/export remains blocked, hosted Platform export is non-goal, optional graph memory remains outside local OSS scoring, and Letta personalization is `research_gate` `not_encoded`. | Add OpenMemory product app import/export and contained Letta scoped-preference readback before broader personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and staged/hierarchy/recursive trajectory jobs are encoded as `blocked`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | | Core-vs-archival memory | ELF core-block semantics exist in the service contract, but comparative benchmark coverage is not encoded here. | Letta. | Letta is `research_gate` `not_encoded` until contained export proof exists. | Add ELF core-block versus archival-search jobs; compare Letta only after contained export proof. | | Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. 
| diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 0974dcb6..3174aeed 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -34,8 +34,8 @@ What is proven today: - ELF now has live capture/write-policy self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is not a broad capture-hook win over agentmemory or claude-mem: agentmemory comparison is blocked - by mocked/in-memory storage, and claude-mem hook/viewer capture remains untested in - the Docker real-world job runner. + by mocked/in-memory storage, and claude-mem hook/viewer capture remains blocked + until Docker-contained hook/viewer evidence exists. - ELF is ahead on production-operation evidence among tracked systems because it has checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild evidence. @@ -191,7 +191,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | | Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | | Operator debugging | Fixture aggregate passes; narrow ELF/qmd live operator-debug slice is scored with ELF `pass` and qmd `wrong_result`. | Narrow ELF/qmd live claim only: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; replay-command and repair-action clarity are tied. | OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. 
| -| Capture/write policy | Fixture aggregate passes; ELF live service adapter passes 4/4 capture jobs with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | ELF has live self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. Against agentmemory/claude-mem capture breadth, the comparison remains blocked or untested. | Durable agentmemory and claude-mem capture-hook runners with evidence-bound output. | +| Capture/write policy | Fixture aggregate passes; ELF live service adapter passes 4/4 capture jobs with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem hook/viewer capture is `blocked`. | ELF has live self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. Against agentmemory/claude-mem capture breadth, the comparison remains blocked until durable hook/viewer evidence exists. | Durable agentmemory and claude-mem capture-hook runners with evidence-bound output. | | Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | | Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | @@ -216,7 +216,7 @@ Order these by decision value, not implementation convenience: 3. External capture-hook report for agentmemory and claude-mem - Why: ELF now has a live capture/write-policy self-check, but the strongest - agentmemory and claude-mem capture-breadth claims are still blocked or untested. + agentmemory and claude-mem capture-breadth claims are still blocked. 
- Output: durable local capture artifacts, source ids, redaction/exclusion audit, and typed blocker reasons when hooks or viewer capture cannot run in Docker. diff --git a/docs/research/2026-06-11-capture-write-policy-live-report.json b/docs/research/2026-06-11-capture-write-policy-live-report.json index a00e9a5e..574e1cc1 100644 --- a/docs/research/2026-06-11-capture-write-policy-live-report.json +++ b/docs/research/2026-06-11-capture-write-policy-live-report.json @@ -199,8 +199,8 @@ }, { "project": "claude-mem", - "position": "untested", - "reason": "Repository storage, lifecycle, progressive disclosure, and same-corpus retrieval are checked; hooks, timeline, observations, viewer capture, and automatic capture review are not run against real-world jobs." + "position": "blocked", + "reason": "Repository storage, lifecycle, progressive disclosure, and same-corpus retrieval are checked; hooks, timeline, observations, viewer capture, and automatic capture review need a Docker-contained hook/viewer runner before scoring." } ], "claim_boundary": { @@ -208,7 +208,7 @@ "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage.", "qmd remains not_encoded for capture/write-policy jobs in the full live sweep.", "agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a durable local capture artifact.", - "claude-mem capture breadth is untested until a Docker-contained hook/viewer capture runner exists." + "claude-mem capture breadth is blocked until a Docker-contained hook/viewer capture runner exists." 
], "not_allowed": [ "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth.", diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index cb69967b..149bb854 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -448,7 +448,7 @@ "issue": "XY-933", "priority": "P1", "state": "Live ELF self-check encoded", - "gap": "Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked or untested." + "gap": "Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked until Docker-contained hook/viewer evidence exists." }, { "issue": "XY-927", @@ -500,7 +500,7 @@ "Do not claim graph/RAG parity from smoke-only evidence.", "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice.", - "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the current comparison is blocked or untested for their hook/viewer capture paths." + "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the current comparison is blocked for their hook/viewer capture paths." 
] } } diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index 82ac877e..7233bf66 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -523,9 +523,9 @@ "scenario": "personalization", "current_elf_evidence": "ELF fixture-backed personalization passes and ELF live_real_world personalization passes.", "strongest_competitor_or_reference": "mem0/OpenMemory, Letta", - "current_competitor_evidence": "mem0/OpenMemory personalization is not_encoded and Letta personalization is research_gate not_encoded.", - "current_state": "ELF and qmd have live encoded evidence; personalization-specialized competitors are not yet comparable.", - "next_measurement": "Encode mem0/OpenMemory and Letta scoped-preference readback jobs before making personalization superiority claims." + "current_competitor_evidence": "mem0/OpenMemory local OSS entity-scoped personalization now passes; OpenMemory UI/export remains blocked, hosted Platform export is non-goal, optional graph memory remains outside local OSS scoring, and Letta personalization is research_gate not_encoded.", + "current_state": "ELF, qmd, and mem0 local OSS have measured scoped-preference evidence, so scoped personalization is a tie on the current surface; mem0 preference-correction history remains a separate ELF loss.", + "next_measurement": "Add OpenMemory product app import/export and contained Letta scoped-preference readback before making broader personalization superiority claims." 
}, { "scenario_id": "context_trajectory", From fc59da9bc56186b844608bccc4e1bf65a77c98f9 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 02:47:29 +0800 Subject: [PATCH 6/7] {"schema":"decodex/commit/1","summary":"Type first-generation viewer blockers","authority":"XY-925"} --- README.md | 5 +- .../memory_projects_manifest.json | 2 +- .../tests/real_world_job_benchmark.rs | 54 +++++++++++++++++-- ...-11-competitor-strength-adoption-report.md | 5 +- ...elf-qmd-trace-replay-diagnostics-report.md | 6 ++- ...1-competitor-strength-adoption-report.json | 4 +- ...f-qmd-trace-replay-diagnostics-report.json | 4 +- 7 files changed, 65 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 22df99ec..3e7ec848 100644 --- a/README.md +++ b/README.md @@ -170,8 +170,9 @@ provider-backed ELF evidence was required. ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. qmd ties replay command and repair-action clarity but is `wrong_result` for trace hydration and - candidate-drop stage visibility. OpenMemory UI/export and claude-mem viewer flows - remain blocked or not encoded, so this is not a broad viewer-product claim. + candidate-drop stage visibility. OpenMemory UI/export remains blocked, and + claude-mem viewer flows remain blocked until Docker-contained hook/viewer evidence + exists, so this is not a broad viewer-product claim. 
- First-generation OSS continuity/source-store follow-up after XY-925: `cargo make real-world-first-generation-oss` emits a fixture-backed external-adapter slice for agentmemory, memsearch, and claude-mem with 6 jobs, 4 pass, 2 blocked, and full diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 61fbcf7f..1189ec5f 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -595,7 +595,7 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory and claude-mem UI repair paths remain blocked or not encoded.", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", "command": "cargo make real-world-job-operator-ux-live-adapters", "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" }, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 792ffef4..2ee9d46a 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -78,6 +78,10 @@ fn workspace_root() -> Result { Ok(root.to_path_buf()) } +fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") +} + fn strength_profile_report_path() -> Result { Ok(workspace_root()? 
.join("docs") @@ -1581,11 +1585,8 @@ fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result< assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); - - let readme_normalized = readme.split_whitespace().collect::>().join(" "); - assert!( - readme_normalized + collapse_whitespace(&readme) .contains("claude-mem hook/viewer capture remains blocked until Docker-contained") ); @@ -2017,6 +2018,7 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { "wrong_result, incomplete, blocked, and not_encoded states remain visible", "broader live suites remain `wrong_result`, `incomplete`, or `not_encoded`", "The qmd live real-world slice covers representative jobs only", + "blocked or not encoded", ] { assert!(!measurement_audit.contains(stale_phrase)); assert!(!competitor_matrix.contains(stale_phrase)); @@ -2121,6 +2123,15 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() assert!(benchmarking_index.contains("qmd top-10/replay artifact")); assert!(benchmarking_index.contains("ELF trace/admin surfaces")); assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); + + assert_trace_replay_viewer_blocker_boundaries( + &readme, + &markdown, + &adoption_report, + &report, + &adoption_json, + )?; + assert!( adoption_report .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") @@ -2265,6 +2276,41 @@ fn assert_trace_replay_diagnostics_markdown(markdown: &str) { assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); } +fn assert_trace_replay_viewer_blocker_boundaries( + readme: &str, + markdown: &str, + adoption_report: &str, + report: &Value, + adoption_json: &Value, +) -> Result<()> { + let checked_surfaces = [ + 
collapse_whitespace(readme), + collapse_whitespace(markdown), + collapse_whitespace(adoption_report), + report.to_string(), + adoption_json.to_string(), + ]; + + for surface in checked_surfaces { + assert!(!surface.contains("blocked or not encoded")); + } + + assert!( + collapse_whitespace(readme) + .contains("claude-mem viewer flows remain blocked until Docker-contained") + ); + assert!( + collapse_whitespace(markdown) + .contains("claude-mem UI repair paths remain blocked until Docker-contained") + ); + assert!( + collapse_whitespace(adoption_report) + .contains("claude-mem viewer workflows remain blocked until Docker-contained") + ); + + Ok(()) +} + fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { let local_debug = find_by_field( array_at(adoption, "/scenario_outcomes")?, diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 4aa963e4..5636fc71 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -48,7 +48,8 @@ The remaining caveats are material: ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory - UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-925 + UI/export remains blocked and claude-mem viewer workflows remain blocked until + Docker-contained hook/viewer evidence exists. 
XY-925 now adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator @@ -97,7 +98,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Scenario | ELF outcome | Evidence classes | Measured claim | Follow-up | | --- | --- | --- | --- | --- | | Source-of-truth rebuild and evidence-bound writes | `win` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust-source jobs pass, and production restore/rebuild proof exists. | None | -| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs. XY-925 selects agentmemory's next durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem and OpenViking continuity strengths remain blocked or not encoded. | XY-928 | +| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs. XY-925 selects agentmemory's next durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains `not_encoded`, and OpenViking continuity trajectory remains `blocked`. | XY-928 | | Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs; Letta-style core/archival decision memory is not tested. 
| XY-927 | | Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md index aa6213ae..189566c2 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md @@ -69,7 +69,7 @@ This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. | Operator-debug trace hydration | `live_real_world` | `pass` | `win` | ELF live operator-debug jobs generate trace ids, viewer URLs, admin trace-bundle URLs, and `trace_available=true`; qmd generates local replay commands but no service trace hydration surface. | | Operator-debug replay command availability | `live_real_world` | `pass` | `tie` | ELF emits admin trace-bundle curl commands and qmd emits local CLI query replay commands for the same operator-debugging scenarios; this scores command availability, not equivalent UI quality. | | Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` | ELF exposes dropped-candidate visibility through generated operator-debug metadata without direct SQL assumptions; qmd exposes top-k replay rows but no intermediate candidate-drop stages in this slice. 
| -| Operator-debug repair-action clarity | `live_real_world` | `pass` | `tie` | Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory and claude-mem UI repair paths remain blocked or not encoded. | +| Operator-debug repair-action clarity | `live_real_world` | `pass` | `tie` | Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists. | | Operator-debug selected-but-not-narrated evidence | `live_real_world` | `pass` | `win` | The operator-debug slice now scores selected-but-not-narrated evidence as a trace/answer-composition repair surface without direct database inspection. | | Query expansion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows expansion variants or dynamic expansion decisions for both systems. | | Dense/sparse channel attribution | `research_gate` | `not_encoded` | `not_tested` | ELF uses dense plus BM25 and qmd uses structured `lex:` plus `vec:`, but the scored artifacts do not expose comparable per-channel contribution. | @@ -139,7 +139,9 @@ Not allowed: - Do not score rerank superiority from a qmd `--no-rerank` run. - Do not collapse `not_tested`, `non_goal`, or `wrong_result` into pass evidence. - Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win - over OpenMemory or claude-mem; those UI paths remain blocked or not encoded. + over OpenMemory or claude-mem; OpenMemory UI/export remains blocked, and + claude-mem UI repair paths remain blocked until Docker-contained hook/viewer + evidence exists. 
## Follow-Up Gate diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 149bb854..7bb448bd 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and Letta core-vs-archival memory plus graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. 
XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and Letta core-vs-archival memory plus graph/RAG navigation remain unproven. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." ] }, "evidence_class_terms": [ @@ -121,7 +121,7 @@ "blocked", "not_encoded" ], - "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. 
XY-925 selects agentmemory's durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem and OpenViking continuity strengths remain blocked or not encoded.", + "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. XY-925 selects agentmemory's durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains not_encoded, and OpenViking continuity trajectory remains blocked.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json index 42c22615..84a38938 100644 --- a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json +++ b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json @@ -199,7 +199,7 @@ "elf_status": "pass", "qmd_status": "pass", "outcome": "tie", - "diagnostic_judgment": "Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory and claude-mem UI repair paths remain blocked or not encoded.", + "diagnostic_judgment": "Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", "artifacts": [ "tmp/real-world-job/operator-ux-live-adapters/summary.json" ] @@ -364,6 +364,6 @@ "Do not collapse not_tested, non_goal, or wrong_result into pass evidence.", "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity.", "Expansion, dense/sparse contribution, fusion, rerank-on quality, 
and broad retrieved-but-dropped diagnosis outside the operator-debug slice remain unproven.", - "Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win over OpenMemory or claude-mem; those UI paths remain blocked or not encoded." + "Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win over OpenMemory or claude-mem; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists." ] } From eeb5595e3d3a3d84bb6f5d1ab590e44441ade6f0 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 12 Jun 2026 03:13:11 +0800 Subject: [PATCH 7/7] {"schema":"decodex/commit/1","summary":"Use local tokenizer for context e2e harness","authority":"XY-925"} --- scripts/context-misranking-harness.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/context-misranking-harness.sh b/scripts/context-misranking-harness.sh index 3290fdef..578f09a5 100755 --- a/scripts/context-misranking-harness.sh +++ b/scripts/context-misranking-harness.sh @@ -205,7 +205,7 @@ min_importance = 0.0 enabled = true max_tokens = 512 overlap_tokens = 128 -tokenizer_repo = "gpt2" +tokenizer_repo = "config/local/tokenizer.wordlevel.json" [search.expansion] include_original = true