From c19aa478c199eded5dbdc7ea6344c1149cae3229 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 23:48:27 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add live capture write-policy scoring","authority":"XY-933"} --- Cargo.lock | 1 + README.md | 28 +- apps/elf-eval/Cargo.toml | 1 + .../memory_projects_manifest.json | 45 +- .../redaction_exclusion.json | 33 + .../source_id_evidence_binding.json | 187 ++++ .../write_policy_redaction.json | 203 +++++ .../capture_integration_boundaries.json | 5 + .../src/bin/real_world_live_adapter.rs | 833 +++++++++++++++--- .../tests/real_world_job_benchmark.rs | 247 +++++- ...-06-11-capture-write-policy-live-report.md | 75 ++ ...-11-competitor-strength-adoption-report.md | 19 +- ...-11-competitor-strength-evidence-matrix.md | 19 +- ...on-direction-from-competitor-benchmarks.md | 61 +- .../2026-06-11-measurement-coverage-audit.md | 84 +- docs/guide/benchmarking/index.md | 4 + .../real_world_agent_memory_benchmark.md | 37 +- ...6-11-capture-write-policy-live-report.json | 220 +++++ ...1-competitor-strength-adoption-report.json | 35 +- ...2026-06-11-measurement-coverage-audit.json | 67 +- ...-11-xy-897-competitor-strength-matrix.json | 29 +- .../real_world_agent_memory_benchmark_v1.md | 12 + 22 files changed, 1945 insertions(+), 300 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json create mode 100644 docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md create mode 100644 docs/research/2026-06-11-capture-write-policy-live-report.json diff --git a/Cargo.lock b/Cargo.lock index 512b2d80..5c820659 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1035,6 +1035,7 @@ dependencies = [ "elf-chunking", "elf-cli", "elf-config", + "elf-domain", "elf-service", "elf-storage", "elf-testkit", diff --git a/README.md 
b/README.md index 8261bf13..414723df 100644 --- a/README.md +++ b/README.md @@ -149,19 +149,20 @@ provider-backed ELF evidence was required. mem0, OpenViking, and claude-mem remained typed non-pass states. OpenViking now reaches its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; setup failures remain `incomplete`. -- Real-world agent memory aggregate after the P1 benchmark batch: 38 fixture-backed - jobs across 11 suites, 36 pass, 0 incomplete, 2 blocked, 0 wrong-result, +- Real-world agent memory aggregate after the P1 benchmark batch: 40 fixture-backed + jobs across 11 suites, 38 pass, 0 incomplete, 2 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries, not hidden benchmark wins. - Full-suite live real-world adapter sweep after XY-899: ELF and qmd emit - Docker-isolated `live_real_world` records for all 38 encoded jobs across 11 suites + Docker-isolated `live_real_world` records for all 40 encoded jobs across 11 suites through `cargo make real-world-memory-live-adapters`. Both keep the original targeted `work_resume`, `retrieval`, and `project_decisions` slice passing, but the - full sweep is not a full-suite pass. The fresh ELF sweep reports 18 pass, - 5 wrong_result, 2 blocked, and 13 not_encoded jobs. The fresh qmd sweep reports - 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. The difference is the - delete/TTL tombstone case; qmd remains the local retrieval-debug UX reference, and - no broad ELF-over-qmd claim is allowed. + full sweep is not a full-suite pass. The fresh ELF sweep reports 22 pass, + 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The fresh qmd sweep reports + 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. 
The differences are + the delete/TTL tombstone case plus ELF-only capture/write-policy live self-checks; + qmd remains the local retrieval-debug UX reference, and no broad ELF-over-qmd claim + is allowed. - Live operator-debugging slice after XY-932: `cargo make real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated `live_real_world` records for ELF and qmd over the operator-debugging fixtures. @@ -194,6 +195,12 @@ provider-backed ELF evidence was required. for local SDK export-style parity, `blocked` for OpenMemory UI/export, and `non_goal` for hosted Platform export and optional graph memory in the local OSS lane. +- Capture/write-policy live follow-up after XY-933: ELF now passes 4/4 live + `capture_integration` jobs with zero redaction leaks, source ids preserved in + source refs, write-policy redaction audit counts, evidence binding, and no secret + leakage. qmd remains `not_encoded` for this suite. agentmemory capture comparison is + blocked by mocked/in-memory storage, and claude-mem hook/viewer capture remains + untested, so no broad capture-breadth superiority claim is allowed. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, `cargo make baseline-production-private-addendum`, @@ -216,6 +223,7 @@ Detailed evidence and interpretation: - [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -238,7 +246,8 @@ Evidence-backed position after the June 11 real-world reports: typed non-pass states, while ELF has the stronger service and provenance contract. - ELF is still behind or not yet proven on full-suite live real-world pass parity, private-corpus production quality, credentialed production-ops gates, - qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style continuity UX, + qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style capture and + continuity UX, OpenViking-style context trajectory, and hosted managed memory. Quick comparison snapshot (objective/high-level). 
@@ -292,6 +301,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) - [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) - [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) diff --git a/apps/elf-eval/Cargo.toml b/apps/elf-eval/Cargo.toml index 6f676ad9..5e0d8baa 100644 --- a/apps/elf-eval/Cargo.toml +++ b/apps/elf-eval/Cargo.toml @@ -22,6 +22,7 @@ uuid = { workspace = true } elf-chunking = { workspace = true } elf-cli = { workspace = true } elf-config = { workspace = true } +elf-domain = { workspace = true } elf-service = { workspace = true } elf-storage = { workspace = true } elf-testkit = { workspace = true } diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 2832b202..10acb39e 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 38 jobs, 36 pass, 0 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", + "evidence": "The current fixture set reports 40 jobs, 
38 pass, 0 incomplete, 2 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -99,7 +99,7 @@ { "suite_id": "capture_integration", "status": "pass", - "evidence": "The redaction and capture-boundary fixture is encoded and passing." + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." }, { "suite_id": "production_ops", @@ -146,13 +146,13 @@ }, "run": { "status": "wrong_result", - "evidence": "ELF materializes 38 real_world_job adapter_response objects through ElfService, worker indexing, and search_raw before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "evidence": "ELF materializes 40 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, and live capture/write-policy ingestion before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" }, "result": { "status": "wrong_result", - "evidence": "The fresh full live sweep scores 38 jobs across all 11 encoded suites: 18 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 13 not_encoded. This is not a full-suite live pass.", + "evidence": "The fresh full live sweep scores 40 jobs across all 11 encoded suites: 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded. 
This is not a full-suite live pass.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" }, @@ -175,7 +175,7 @@ { "capability": "full_suite_live_sweep", "status": "wrong_result", - "evidence": "The runner now emits per-job and per-suite live records for all 38 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + "evidence": "The runner now emits per-job and per-suite live records for all 40 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." }, { "capability": "full_suite_live_pass", @@ -231,8 +231,8 @@ }, { "suite_id": "capture_integration", - "status": "not_encoded", - "evidence": "The live adapter sweep does not exercise capture integrations or write-policy redaction boundaries." + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." }, { "suite_id": "production_ops", @@ -245,6 +245,18 @@ "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." } ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. 
This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], "evidence": [ { "kind": "fixture_dir", @@ -359,13 +371,13 @@ }, "run": { "status": "wrong_result", - "evidence": "qmd materializes 38 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "evidence": "qmd materializes 40 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" }, "result": { "status": "wrong_result", - "evidence": "The fresh full qmd live sweep scores 38 jobs across all 11 encoded suites: 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 13 not_encoded. This is not a full-suite live pass.", + "evidence": "The fresh full qmd live sweep scores 40 jobs across all 11 encoded suites: 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. This is not a full-suite live pass.", "command": "cargo make real-world-memory-live-adapters", "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" }, @@ -388,7 +400,7 @@ { "capability": "full_suite_live_sweep", "status": "wrong_result", - "evidence": "The runner now emits per-job and per-suite live records for all 38 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." + "evidence": "The runner now emits per-job and per-suite live records for all 40 encoded jobs, but memory_evolution is wrong_result and several non-answer-generation suites remain typed non-pass." 
}, { "capability": "full_suite_live_pass", @@ -445,7 +457,7 @@ { "suite_id": "capture_integration", "status": "not_encoded", - "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries." + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." }, { "suite_id": "production_ops", @@ -838,6 +850,15 @@ "elf_position": "untested", "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. Keep work_resume and capture claims blocked until a durable local adapter path exists.", "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture breadth is blocked for comparison because the current Docker baseline uses a process-local StateKV Map and in-memory index; no durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound capture output.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" } ], "evidence": [ @@ -1353,7 +1374,7 @@ "suite_id": "capture_integration", "status": "not_encoded", "elf_position": "untested", - "evidence": "The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner.", + "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, timeline, observations, viewer capture, and automatic capture review workflows are not executed by the runner, so capture breadth remains untested rather than an ELF win/loss.", "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" } ], diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json index 1d06cb0a..6e5f0e9b 100644 --- a/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json @@ -6,11 +6,34 @@ "corpus": { "corpus_id": "real-world-memory-capture-2026-06-09", "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF live add_note capture can persist public evidence with source ids and skip excluded evidence ids through the Docker live adapter." + ], + "fixture_backed": [ + "The fixture encodes public capture, write-policy audit evidence, and a private excluded span as a negative trap." + ], + "blocked": [ + "agentmemory hook breadth remains blocked until a durable local session/capture adapter replaces the in-memory mock.", + "claude-mem hook and viewer capture remain not encoded because the Docker baseline does not execute hooks, timeline, or viewer workflows." + ], + "not_encoded": [ + "Host-global capture hooks, Slack/browser capture, and credentialed tool capture are outside this Docker-scoped benchmark." + ], + "notes": [ + "Live ELF scoring must not store or retrieve the private-excluded-text evidence id." 
+ ] + }, "items": [ { "evidence_id": "public-captured-decision", "kind": "message", "text": "Public memory candidate: benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count.", + "capture": { + "action": "store", + "source_id": "capture:linear-comment-933", + "evidence_binding": "source_ref" + }, "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -28,6 +51,11 @@ "evidence_id": "write-policy-audit", "kind": "trace", "text": "Write policy audit: private spans were excluded before evidence binding, and the stored note only retained the public benchmark-report requirement.", + "capture": { + "action": "store", + "source_id": "capture:write-policy-audit-933", + "evidence_binding": "source_ref" + }, "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", @@ -45,6 +73,11 @@ "evidence_id": "private-excluded-text", "kind": "message", "text": "Private excluded text: operator's home server label is cobalt-ridge.", + "capture": { + "action": "exclude", + "source_id": "capture:excluded-private-span-933", + "evidence_binding": "negative_trap" + }, "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json new file mode 100644 index 00000000..1d7bf424 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json @@ -0,0 +1,187 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-source-id-binding-001", + "suite": "capture_integration", + "title": "Preserve source ids and evidence binding during live capture", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-11", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF 
live add_note capture stores source_id values in source_ref and returns evidence-bound notes through search_raw." + ], + "blocked": [ + "agentmemory host-global capture hooks are not installed; durable capture breadth remains blocked until a Docker-local session path exists.", + "claude-mem hook/viewer capture breadth remains not encoded in the Docker baseline." + ], + "notes": [ + "This job is a source-id and evidence-binding check, not a host-global hook installation." + ] + }, + "items": [ + { + "evidence_id": "source-id-release-summary", + "kind": "message", + "text": "Public capture: The source id capture:issue-comment-42 is bound to the release-summary requirement. Public audit: source ids remained attached to evidence-bound notes.", + "capture": { + "action": "store", + "source_id": "capture:issue-comment-42", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_id_evidence_binding", + "evidence_id": "source-id-release-summary" + }, + "locator": { + "quote": "source ids remained attached to evidence-bound notes" + } + }, + "created_at": "2026-06-11T04:10:00Z" + }, + { + "evidence_id": "source-id-command-log", + "kind": "trace", + "text": "Public capture: command log source id capture:command-log-7 proves the benchmark ran inside Docker and did not require host-global hooks.", + "capture": { + "action": "store", + "source_id": "capture:command-log-7", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_id_evidence_binding", + "evidence_id": "source-id-command-log" + }, + "locator": { + "quote": "did not require host-global hooks" + } + }, + "created_at": "2026-06-11T04:11:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The release-summary requirement is bound to source id 
capture:issue-comment-42, and source ids remained attached to evidence-bound notes. The command-log evidence says the benchmark ran inside Docker and did not require host-global hooks.", + "claims": [ + { + "claim_id": "source_id_bound", + "text": "The source id capture:issue-comment-42 is bound to the release-summary requirement.", + "evidence_ids": ["source-id-release-summary"], + "confidence": "high" + }, + { + "claim_id": "docker_no_host_hooks", + "text": "The benchmark ran inside Docker and did not require host-global hooks.", + "evidence_ids": ["source-id-command-log"], + "confidence": "high" + } + ], + "evidence_ids": ["source-id-release-summary", "source-id-command-log"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "source-id-capture-import", + "ts": "2026-06-11T04:10:00Z", + "actor": "agent", + "action": "captured_source_ids", + "evidence_ids": ["source-id-release-summary", "source-id-command-log"], + "summary": "The capture path recorded source ids in evidence-bound source refs." + } + ], + "prompt": { + "role": "user", + "content": "Which source ids were preserved by live capture, and did the benchmark require host-global hooks?", + "job_mode": "answer", + "constraints": ["cite_evidence", "preserve_source_ids", "do_not_claim_host_global_hooks"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "source_id_bound", + "text": "The source id capture:issue-comment-42 is bound to the release-summary requirement." + }, + { + "claim_id": "docker_no_host_hooks", + "text": "The benchmark ran inside Docker and did not require host-global hooks." 
+ } + ], + "must_not_include": [ + "host-global hooks were installed", + "agentmemory hooks passed", + "claude-mem hooks passed" + ], + "evidence_links": { + "source_id_bound": ["source-id-release-summary"], + "docker_no_host_hooks": ["source-id-command-log"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "source-id-release-summary", + "claim_id": "source_id_bound", + "requirement": "cite", + "quote": "source ids remained attached to evidence-bound notes" + }, + { + "evidence_id": "source-id-command-log", + "claim_id": "docker_no_host_hooks", + "requirement": "cite", + "quote": "did not require host-global hooks" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Reports preserved source ids and Docker-scoped capture." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both source-id evidence records." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not claim host-global hooks or external hook passes." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves source ids as evidence-bound capture metadata." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about external hook execution", + "missing evidence binding for a source-id claim" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "capture_integration", "source_id", "evidence_binding", "no_host_global_hooks"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json new file mode 100644 index 00000000..fae8459b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json @@ -0,0 +1,203 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-write-policy-redaction-001", + "suite": "capture_integration", + "title": "Apply write-policy redaction before live capture storage", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-11", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF live add_note capture applies write_policy redactions before storage and records write-policy audit counts in materialization artifacts." + ], + "blocked": [ + "agentmemory hook breadth remains blocked by the current in-memory storage adapter.", + "claude-mem hook capture remains not encoded because hooks, timeline, observations, and viewer workflows are not executed by the Docker baseline." + ], + "not_encoded": [ + "Host-global automatic capture hooks are intentionally not installed by this benchmark." + ], + "notes": [ + "The synthetic private token label is a negative trap and must not appear in live generated answers." + ] + }, + "items": [ + { + "evidence_id": "redacted-source-message", + "kind": "message", + "text": "Public capture: The live capture report must preserve source id capture:terminal-log-17. Private token label: orchid-envelope. 
Public audit: the private token label was redacted before storage.", + "capture": { + "action": "store", + "source_id": "capture:terminal-log-17", + "evidence_binding": "source_ref", + "write_policy": { + "exclusions": [], + "redactions": [ + { + "kind": "replace", + "span": { + "start": 110, + "end": 125 + }, + "replacement": "[redacted-token]" + } + ] + } + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "write_policy_redaction", + "evidence_id": "redacted-source-message" + }, + "locator": { + "quote": "the private token label was redacted before storage" + } + }, + "created_at": "2026-06-11T04:00:00Z" + }, + { + "evidence_id": "redacted-private-token-trap", + "kind": "message", + "text": "orchid-envelope", + "capture": { + "action": "exclude", + "source_id": "capture:terminal-log-17-private-token", + "evidence_binding": "negative_trap" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "write_policy_redaction", + "evidence_id": "redacted-private-token-trap" + } + }, + "created_at": "2026-06-11T04:00:01Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The live capture report must preserve source id capture:terminal-log-17. 
The write-policy audit says the private token label was redacted before storage, and the private token value must not be repeated.", + "claims": [ + { + "claim_id": "source_id_preserved", + "text": "The live capture report must preserve source id capture:terminal-log-17.", + "evidence_ids": ["redacted-source-message"], + "confidence": "high" + }, + { + "claim_id": "redaction_applied", + "text": "The private token label was redacted before storage.", + "evidence_ids": ["redacted-source-message"], + "confidence": "high" + } + ], + "evidence_ids": ["redacted-source-message"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "write-policy-redaction-applied", + "ts": "2026-06-11T04:00:00Z", + "actor": "agent", + "action": "captured_message_with_redaction", + "evidence_ids": ["redacted-source-message"], + "summary": "The capture path applied write_policy redaction before storage." + } + ], + "prompt": { + "role": "user", + "content": "What did live capture preserve, and was the private token repeated?", + "job_mode": "answer", + "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "source_id_preserved", + "text": "The live capture report must preserve source id capture:terminal-log-17." + }, + { + "claim_id": "redaction_applied", + "text": "The private token label was redacted before storage." 
+ } + ], + "must_not_include": ["orchid-envelope"], + "evidence_links": { + "source_id_preserved": ["redacted-source-message"], + "redaction_applied": ["redacted-source-message"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "redacted-source-message", + "claim_id": "source_id_preserved", + "requirement": "cite", + "quote": "preserve source id capture:terminal-log-17" + }, + { + "evidence_id": "redacted-source-message", + "claim_id": "redaction_applied", + "requirement": "explain", + "quote": "the private token label was redacted before storage" + } + ], + "negative_traps": [ + { + "trap_id": "redacted-token-value", + "type": "privacy_leak", + "evidence_ids": ["redacted-private-token-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the public source-id preservation requirement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the source message that was stored after write-policy redaction." + }, + "trap_avoidance": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Does not repeat the redacted private token value." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Honors write-policy redaction before live capture storage." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about capture source ids", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "capture_integration", "write_policy", "redaction", "source_id", "no_secret_leakage"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json index f1354c7a..841ace1a 100644 --- a/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json @@ -88,6 +88,11 @@ "evidence_id": "private-span-trap", "kind": "message", "text": "Excluded private span trap: [private-local-env] provider credential placeholder and local account-only browser state must not be stored or repeated.", + "capture": { + "action": "exclude", + "source_id": "capture:private-span-trap", + "evidence_binding": "negative_trap" + }, "source_ref": { "schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs index 0e6a621f..ddb018e5 100644 --- a/apps/elf-eval/src/bin/real_world_live_adapter.rs +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -18,15 +18,16 @@ use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::{self, eyre}; use reqwest::RequestBuilder; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::{self, Map}; use tokio::{task::JoinSet, time}; use uuid::Uuid; use elf_chunking::ChunkingConfig; use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_domain::writegate::{self, WritePolicy}; use elf_service::{ 
AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, ExtractorProvider, - PayloadLevel, Providers, RerankProvider, SearchRequest, + PayloadLevel, Providers, RerankProvider, SearchItem, SearchRequest, }; use elf_storage::{db::Db, qdrant::QdrantStore}; use elf_testkit::TestDatabase; @@ -139,7 +140,7 @@ struct LightragArgs { #[derive(Debug)] struct LoadedJob { path: PathBuf, - value: Value, + value: serde_json::Value, job: LiveJob, } @@ -169,6 +170,20 @@ struct LiveCorpusItem { evidence_id: String, text: Option, local_ref: Option, + #[serde(default)] + capture: LiveCapturePolicy, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct LiveCapturePolicy { + #[serde(default)] + action: LiveCaptureAction, + + source_id: Option, + + evidence_binding: Option, + + write_policy: Option, } #[derive(Debug, Deserialize)] @@ -181,7 +196,7 @@ struct LiveExpectedAnswer { #[serde(default)] must_include: Vec, #[serde(default)] - evidence_links: Map, + evidence_links: Map, } #[derive(Debug, Deserialize)] @@ -206,7 +221,7 @@ struct MaterializationEvidence { command_evidence: Vec, jobs: Vec, #[serde(skip_serializing_if = "Option::is_none")] - metadata: Option, + metadata: Option, } #[derive(Debug, Serialize)] @@ -236,6 +251,8 @@ struct MaterializedJobEvidence { source_mappings: Vec, #[serde(skip_serializing_if = "Option::is_none")] operator_debug: Option, + #[serde(skip_serializing_if = "Option::is_none")] + capture: Option, } #[derive(Clone, Debug, Serialize)] @@ -247,6 +264,44 @@ struct OperatorDebugMaterializationEvidence { raw_sql_needed: bool, } +#[derive(Clone, Debug, Default, Serialize)] +struct CaptureMaterializationEvidence { + stored_evidence_ids: Vec, + excluded_evidence_ids: Vec, + source_ids: Vec, + write_policy_audit_count: usize, + write_policy_exclusion_count: usize, + write_policy_redaction_count: usize, + #[serde(skip_serializing_if = "Vec::is_empty")] + runtime_source_refs: Vec, +} + +#[derive(Clone, Debug, Serialize)] +struct 
CaptureRuntimeSourceRefEvidence { + evidence_id: String, + source_ref: serde_json::Value, +} + +#[derive(Clone, Debug, Default)] +struct CaptureRuntimeEvidence { + items: Vec, +} +impl CaptureRuntimeEvidence { + fn item_for(&self, evidence_id: &str) -> Option<&CaptureRuntimeEvidenceItem> { + self.items.iter().find(|item| item.evidence_id == evidence_id) + } +} + +#[derive(Clone, Debug)] +struct CaptureRuntimeEvidenceItem { + evidence_id: String, + source_id: Option, + evidence_binding: Option, + write_policy_applied: bool, + capture_action: Option, + source_ref: serde_json::Value, +} + #[derive(Debug, Serialize)] struct AdapterResponseOutput { adapter_id: String, @@ -257,7 +312,7 @@ struct AdapterResponseOutput { struct AnswerOutput { content: String, evidence_ids: Vec, - claims: Vec, + claims: Vec, latency_ms: f64, cost: CostOutput, trace_explainability: TraceExplainabilityOutput, @@ -293,7 +348,7 @@ struct TraceStageOutput { struct MaterializedJob { response: AdapterResponseOutput, evidence: MaterializedJobEvidence, - operator_debug: Option, + operator_debug: Option, } #[derive(Debug)] @@ -306,8 +361,10 @@ struct MaterializedJobInput { trace_id: Option, failure: Option, source_mappings: Vec, - operator_debug: Option, + operator_debug: Option, operator_debug_evidence: Option, + capture: Option, + capture_failure: Option, } struct MaterializedOutput<'a> { @@ -319,13 +376,14 @@ struct MaterializedOutput<'a> { jobs: &'a [LoadedJob], materialized: &'a [MaterializedJob], command_evidence: Vec, - metadata: Option, + metadata: Option, } #[derive(Debug)] struct CorpusText { evidence_id: String, text: String, + capture: LiveCapturePolicy, } #[derive(Clone, Debug, Serialize)] @@ -399,8 +457,8 @@ impl ExtractorProvider for NoopExtractor { fn extract<'a>( &'a self, _cfg: &'a LlmProviderConfig, - _messages: &'a [Value], - ) -> BoxFuture<'a, elf_service::Result> { + _messages: &'a [serde_json::Value], + ) -> BoxFuture<'a, elf_service::Result> { Box::pin(async move { 
Ok(serde_json::json!({ "notes": [] })) }) } } @@ -411,6 +469,14 @@ struct SelectedEvidenceText { evidence_ids: Vec, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize)] +#[serde(rename_all = "snake_case")] +enum LiveCaptureAction { + #[default] + Store, + Exclude, +} + #[derive(Debug, Deserialize)] #[serde(untagged)] enum LiveExpectedClaim { @@ -637,7 +703,7 @@ fn materialize_qmd_job( log_path, )?; let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; - let results = serde_json::from_str::(&stdout).map_err(|err| { + let results = serde_json::from_str::(&stdout).map_err(|err| { eyre::eyre!("qmd query did not return JSON for {}: {err}", loaded.job.job_id) })?; let entries = results.as_array().cloned().unwrap_or_default(); @@ -679,6 +745,8 @@ fn materialize_qmd_job( source_mappings: Vec::new(), operator_debug, operator_debug_evidence, + capture: None, + capture_failure: None, }, )) } @@ -724,6 +792,8 @@ fn lightrag_failure_jobs( source_mappings: Vec::new(), operator_debug: None, operator_debug_evidence: None, + capture: None, + capture_failure: None, }, ) }) @@ -755,22 +825,22 @@ fn write_lightrag_corpus( .collect() } -fn lightrag_index_failed(status: &Value) -> bool { - status.get("documents").and_then(Value::as_array).into_iter().flatten().any(|doc| { +fn lightrag_index_failed(status: &serde_json::Value) -> bool { + status.get("documents").and_then(serde_json::Value::as_array).into_iter().flatten().any(|doc| { doc.get("status") - .and_then(Value::as_str) + .and_then(serde_json::Value::as_str) .is_some_and(|status| status.to_ascii_lowercase().contains("fail")) }) } -fn lightrag_index_processed(status: &Value, expected_docs: usize) -> bool { - let Some(documents) = status.get("documents").and_then(Value::as_array) else { +fn lightrag_index_processed(status: &serde_json::Value, expected_docs: usize) -> bool { + let Some(documents) = status.get("documents").and_then(serde_json::Value::as_array) else { return false; }; documents.len() >= 
expected_docs && documents.iter().all(|doc| { - doc.get("status").and_then(Value::as_str).is_some_and(|status| { + doc.get("status").and_then(serde_json::Value::as_str).is_some_and(|status| { let normalized = status.to_ascii_lowercase(); normalized.contains("processed") || normalized.contains("success") @@ -785,18 +855,18 @@ fn lightrag_keywords(query: &str) -> Vec { fn lightrag_source_mappings( corpus: &[CorpusText], sources: &[LightragSource], - response: &Value, + response: &serde_json::Value, ) -> Vec { let mut mappings = Vec::new(); - if let Some(references) = response.get("references").and_then(Value::as_array) { + if let Some(references) = response.get("references").and_then(serde_json::Value::as_array) { for reference in references { mappings.push(lightrag_reference_mapping(corpus, sources, reference)); } } if mappings.is_empty() - && let Some(context) = response.get("response").and_then(Value::as_str) + && let Some(context) = response.get("response").and_then(serde_json::Value::as_str) { let evidence_ids = map_lightrag_evidence_ids(corpus, sources, context); @@ -816,20 +886,20 @@ fn lightrag_source_mappings( fn lightrag_reference_mapping( corpus: &[CorpusText], sources: &[LightragSource], - reference: &Value, + reference: &serde_json::Value, ) -> SourceMappingEvidence { let source = reference .get("file_path") - .and_then(Value::as_str) - .or_else(|| reference.get("reference_id").and_then(Value::as_str)) + .and_then(serde_json::Value::as_str) + .or_else(|| reference.get("reference_id").and_then(serde_json::Value::as_str)) .unwrap_or("unknown_source") .to_string(); let content = reference .get("content") - .and_then(Value::as_array) + .and_then(serde_json::Value::as_array) .into_iter() .flatten() - .filter_map(Value::as_str) + .filter_map(serde_json::Value::as_str) .collect::>(); let joined_content = content.join("\n"); let combined = format!("{source}\n{joined_content}"); @@ -900,7 +970,7 @@ fn lightrag_api_base(args: &LightragArgs) -> String { 
args.api_base.trim_end_matches('/').to_string() } -fn lightrag_metadata(args: &LightragArgs, run_slug: &str) -> Value { +fn lightrag_metadata(args: &LightragArgs, run_slug: &str) -> serde_json::Value { serde_json::json!({ "schema": "elf.lightrag_context_export_metadata/v1", "run_slug": run_slug, @@ -960,7 +1030,9 @@ fn materialized_job( adapter_id: &str, input: MaterializedJobInput, ) -> MaterializedJob { - let required_evidence_satisfied = required_evidence_satisfied(loaded, &input.evidence_ids); + let capture_failure = input.capture_failure.clone(); + let required_evidence_satisfied = + capture_failure.is_none() && required_evidence_satisfied(loaded, &input.evidence_ids); let status = if input.failure.is_some() { MaterializationStatus::Incomplete } else if !required_evidence_satisfied { @@ -968,8 +1040,17 @@ fn materialized_job( } else { MaterializationStatus::Pass }; - let failure_stage = input.failure.as_ref().map(|_| "adapter_runtime".to_string()); - let stage_notes = if !required_evidence_satisfied { + let failure_stage = if input.failure.is_some() { + Some("live_adapter.retrieve".to_string()) + } else if capture_failure.is_some() { + Some("live_adapter.capture_policy".to_string()) + } else { + None + }; + let failure_reason = input.failure.clone().or(capture_failure); + let stage_notes = if let Some(reason) = &failure_reason { + reason.clone() + } else if !required_evidence_satisfied { "Adapter did not return all required mapped evidence for this job.".to_string() } else { "Adapter returned mapped evidence through its live retrieval path.".to_string() @@ -991,10 +1072,11 @@ fn materialized_job( }, trace_explainability: TraceExplainabilityOutput { trace_id: input.trace_id.map(|id| id.to_string()), - failure_stage: failure_stage.map(|_| "live_adapter.retrieve".to_string()), - failure_reason: input.failure.clone(), + failure_stage: failure_stage.clone(), + failure_reason: failure_reason.clone(), stages: vec![TraceStageOutput { - stage_name: 
"live_adapter.retrieve".to_string(), + stage_name: failure_stage + .unwrap_or_else(|| "live_adapter.retrieve".to_string()), kept_evidence: input.evidence_ids.clone(), dropped_evidence: Vec::new(), demoted_evidence: Vec::new(), @@ -1016,9 +1098,10 @@ fn materialized_job( indexing_latency_ms: input.indexing_latency_ms, latency_ms: input.latency_ms, trace_id: input.trace_id, - failure: input.failure, + failure: failure_reason, source_mappings: input.source_mappings, operator_debug: input.operator_debug_evidence, + capture: input.capture, }, } } @@ -1027,6 +1110,9 @@ fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option Option bool { && matches!(adapter_id, "elf_operator_debug_live" | "qmd_operator_debug_live") } +fn is_elf_capture_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "capture_integration" + && matches!(adapter_id, "elf_live_real_world" | "elf_capture_write_policy_live") +} + fn not_encoded_reason(suite: &str) -> Option<&'static str> { match suite { "trust_source_of_truth" @@ -1144,6 +1238,7 @@ fn materialized_declared_status_job( failure, source_mappings: Vec::new(), operator_debug: None, + capture: None, }, operator_debug: None, } @@ -1155,7 +1250,7 @@ fn operator_debug_output( trace_id: Option, replay_command: String, replay_artifact: String, -) -> (Option, Option) { +) -> (Option, Option) { if loaded.job.suite != "operator_debugging_ux" { return (None, None); } @@ -1174,37 +1269,42 @@ fn operator_debug_output( let candidate_drop_visibility = operator_debug_candidate_visibility(adapter_kind, object).to_string(); - object.insert("trace_available".to_string(), Value::Bool(trace_available)); - object.insert("replay_command_available".to_string(), Value::Bool(replay_command_available)); - object.insert("raw_sql_needed".to_string(), Value::Bool(raw_sql_needed)); + object.insert("trace_available".to_string(), serde_json::Value::Bool(trace_available)); + object.insert( + "replay_command_available".to_string(), + 
serde_json::Value::Bool(replay_command_available), + ); + object.insert("raw_sql_needed".to_string(), serde_json::Value::Bool(raw_sql_needed)); object.insert( "dropped_candidate_visibility".to_string(), - Value::String(candidate_drop_visibility.clone()), + serde_json::Value::String(candidate_drop_visibility.clone()), ); object.insert( "trace_completeness".to_string(), - Value::String(operator_debug_trace_completeness(adapter_kind, trace_available).to_string()), + serde_json::Value::String( + operator_debug_trace_completeness(adapter_kind, trace_available).to_string(), + ), ); object.insert( "repair_action_clarity".to_string(), - Value::String(repair_action_clarity.to_string()), + serde_json::Value::String(repair_action_clarity.to_string()), ); - object.insert("replay_command".to_string(), Value::String(replay_command.clone())); - object.insert("replay_artifact".to_string(), Value::String(replay_artifact)); + object.insert("replay_command".to_string(), serde_json::Value::String(replay_command.clone())); + object.insert("replay_artifact".to_string(), serde_json::Value::String(replay_artifact)); match adapter_kind { AdapterKind::ElfServiceRuntime => if let Some(trace_id) = trace_id { let trace_id = trace_id.to_string(); - object.insert("trace_id".to_string(), Value::String(trace_id.clone())); + object.insert("trace_id".to_string(), serde_json::Value::String(trace_id.clone())); object.insert( "viewer_url".to_string(), - Value::String(format!("/viewer?trace_id={trace_id}")), + serde_json::Value::String(format!("/viewer?trace_id={trace_id}")), ); object.insert( "admin_trace_bundle_url".to_string(), - Value::String(format!( + serde_json::Value::String(format!( "/v2/admin/traces/{trace_id}/bundle?mode=full&stage_items_limit=128&candidates_limit=200" )), ); @@ -1249,12 +1349,12 @@ fn operator_debug_trace_completeness( fn operator_debug_candidate_visibility( adapter_kind: AdapterKind, - object: &Map, + object: &Map, ) -> &str { match adapter_kind { 
AdapterKind::ElfServiceRuntime => object .get("dropped_candidate_visibility") - .and_then(Value::as_str) + .and_then(serde_json::Value::as_str) .unwrap_or("visible through trace bundle replay candidates"), AdapterKind::QmdCliRuntime => "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed", @@ -1262,11 +1362,13 @@ fn operator_debug_candidate_visibility( } } -fn string_array_from_object(object: &Map, key: &str) -> Vec { +fn string_array_from_object(object: &Map, key: &str) -> Vec { object .get(key) - .and_then(Value::as_array) - .map(|items| items.iter().filter_map(Value::as_str).map(ToString::to_string).collect()) + .and_then(serde_json::Value::as_array) + .map(|items| { + items.iter().filter_map(serde_json::Value::as_str).map(ToString::to_string).collect() + }) .unwrap_or_default() } @@ -1295,7 +1397,7 @@ fn shell_quote(value: &str) -> String { format!("'{}'", value.replace('\'', "'\\''")) } -fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { +fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { loaded .job .expected_answer @@ -1325,7 +1427,7 @@ fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec Vec { +fn evidence_link_ids(value: &serde_json::Value) -> Vec { if let Some(id) = value.as_str() { return vec![id.to_string()]; } @@ -1333,7 +1435,11 @@ fn evidence_link_ids(value: &Value) -> Vec { value .as_array() .map(|items| { - items.iter().filter_map(Value::as_str).map(ToString::to_string).collect::>() + items + .iter() + .filter_map(serde_json::Value::as_str) + .map(ToString::to_string) + .collect::>() }) .unwrap_or_default() } @@ -1389,6 +1495,231 @@ fn selected_required_corpus_texts( SelectedEvidenceText { content, evidence_ids: selected_ids } } +fn capture_runtime_evidence_from_search_items(items: &[SearchItem]) -> CaptureRuntimeEvidence { + let source_refs = items.iter().map(|item| &item.source_ref); + + 
capture_runtime_evidence_from_source_refs(source_refs) +} + +fn capture_runtime_evidence_from_source_refs<'a>( + source_refs: impl IntoIterator, +) -> CaptureRuntimeEvidence { + let mut runtime = CaptureRuntimeEvidence::default(); + + for source_ref in source_refs { + let Some(evidence_id) = source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + else { + continue; + }; + + if runtime.items.iter().any(|item| item.evidence_id == evidence_id) { + continue; + } + + runtime.items.push(CaptureRuntimeEvidenceItem { + evidence_id: evidence_id.to_string(), + source_id: source_ref + .get("source_id") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + evidence_binding: source_ref + .get("evidence_binding") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + write_policy_applied: source_ref + .get("write_policy_applied") + .and_then(serde_json::Value::as_bool) + .unwrap_or(false), + capture_action: source_ref + .get("capture_action") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + source_ref: source_ref.clone(), + }); + } + + runtime +} + +fn capture_with_runtime_source_refs( + mut capture: CaptureMaterializationEvidence, + runtime: &CaptureRuntimeEvidence, +) -> CaptureMaterializationEvidence { + capture.source_ids.clear(); + capture.runtime_source_refs.clear(); + + for item in &runtime.items { + if let Some(source_id) = item.source_id.as_deref() { + push_unique(&mut capture.source_ids, source_id.to_string()); + } + + capture.runtime_source_refs.push(CaptureRuntimeSourceRefEvidence { + evidence_id: item.evidence_id.clone(), + source_ref: item.source_ref.clone(), + }); + } + + capture +} + +fn validate_capture_runtime_evidence( + suite: &str, + corpus: &[CorpusText], + capture: &CaptureMaterializationEvidence, + runtime: &CaptureRuntimeEvidence, +) -> Option { + if suite != "capture_integration" { + return None; + } + + let mut failures = Vec::new(); + let mut expected_redactions = 0_usize; + let mut 
expected_exclusions = 0_usize; + + for item in corpus { + match item.capture.action { + LiveCaptureAction::Exclude => { + if runtime.item_for(item.evidence_id.as_str()).is_some() { + failures.push(format!( + "excluded evidence {} was returned by live search", + item.evidence_id + )); + } + if capture.stored_evidence_ids.iter().any(|id| id == &item.evidence_id) { + failures.push(format!( + "excluded evidence {} was stored by live ingestion", + item.evidence_id + )); + } + if !capture.excluded_evidence_ids.iter().any(|id| id == &item.evidence_id) { + failures.push(format!( + "excluded evidence {} was not recorded as excluded", + item.evidence_id + )); + } + }, + LiveCaptureAction::Store => { + let runtime_item = runtime.item_for(item.evidence_id.as_str()); + + if let Some(expected_source_id) = item.capture.source_id.as_deref() { + match runtime_item.and_then(|observed| observed.source_id.as_deref()) { + Some(observed) if observed == expected_source_id => {}, + Some(observed) => failures.push(format!( + "evidence {} returned source_id {observed}, expected {expected_source_id}", + item.evidence_id + )), + None => failures.push(format!( + "evidence {} did not return expected source_id {expected_source_id}", + item.evidence_id + )), + } + } + if let Some(expected_binding) = item.capture.evidence_binding.as_deref() { + match runtime_item.and_then(|observed| observed.evidence_binding.as_deref()) { + Some(observed) if observed == expected_binding => {}, + Some(observed) => failures.push(format!( + "evidence {} returned evidence_binding {observed}, expected {expected_binding}", + item.evidence_id + )), + None => failures.push(format!( + "evidence {} did not return expected evidence_binding {expected_binding}", + item.evidence_id + )), + } + } + if let Some(policy_value) = &item.capture.write_policy { + match write_policy_from_value(policy_value, item.evidence_id.as_str()) { + Ok(policy) => { + expected_exclusions += policy.exclusions.len(); + expected_redactions += 
policy.redactions.len(); + }, + Err(err) => failures.push(err.to_string()), + } + + if !runtime_item.is_some_and(|observed| observed.write_policy_applied) { + failures.push(format!( + "evidence {} did not return write_policy_applied=true", + item.evidence_id + )); + } + } + if let Some(observed) = + runtime_item.and_then(|observed| observed.capture_action.as_deref()) + && observed != capture_action_str(item.capture.action) + { + failures.push(format!( + "evidence {} returned capture_action {observed}, expected {}", + item.evidence_id, + capture_action_str(item.capture.action) + )); + } + }, + } + } + + if capture.write_policy_exclusion_count < expected_exclusions { + failures.push(format!( + "write-policy exclusion count {} was below expected {expected_exclusions}", + capture.write_policy_exclusion_count + )); + } + if capture.write_policy_redaction_count < expected_redactions { + failures.push(format!( + "write-policy redaction count {} was below expected {expected_redactions}", + capture.write_policy_redaction_count + )); + } + if expected_exclusions + expected_redactions > 0 && capture.write_policy_audit_count == 0 { + failures + .push("write-policy audit count was zero despite expected policy effects".to_string()); + } + if failures.is_empty() { + None + } else { + Some(format!("Capture runtime validation failed: {}", failures.join("; "))) + } +} + +fn elf_stored_corpus_texts(corpus: &[CorpusText]) -> color_eyre::Result> { + let mut stored = Vec::new(); + + for item in corpus { + if item.capture.action == LiveCaptureAction::Exclude { + continue; + } + + stored.push(CorpusText { + evidence_id: item.evidence_id.clone(), + text: transformed_capture_text(item)?.trim().to_string(), + capture: item.capture.clone(), + }); + } + + Ok(stored) +} + +fn transformed_capture_text(item: &CorpusText) -> color_eyre::Result { + let Some(policy_value) = &item.capture.write_policy else { + return Ok(item.text.clone()); + }; + let policy = write_policy_from_value(policy_value, 
item.evidence_id.as_str())?; + let result = + writegate::apply_write_policy(item.text.as_str(), Some(&policy)).map_err(|err| { + eyre::eyre!("Invalid write_policy for evidence {}: {err:?}", item.evidence_id) + })?; + + Ok(result.transformed) +} + +fn write_policy_from_value( + value: &serde_json::Value, + evidence_id: &str, +) -> color_eyre::Result { + serde_json::from_value::(value.clone()).map_err(|err| { + eyre::eyre!("Failed to parse write_policy for evidence {evidence_id}: {err}") + }) +} + fn failure_jobs( adapter_id: &str, jobs: &[LoadedJob], @@ -1411,6 +1742,8 @@ fn failure_jobs( source_mappings: Vec::new(), operator_debug: None, operator_debug_evidence: None, + capture: None, + capture_failure: None, }, ) }) @@ -1436,11 +1769,16 @@ fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Resu adapter_response .insert("answer".to_string(), serde_json::to_value(&materialized.response.answer)?); - value["corpus"]["adapter_response"] = Value::Object(adapter_response); + value["corpus"]["adapter_response"] = serde_json::Value::Object(adapter_response); if let Some(operator_debug) = &materialized.operator_debug { value["operator_debug"] = operator_debug.clone(); } + if let Some(capture) = &materialized.evidence.capture { + apply_capture_runtime_source_refs(&mut value, capture); + + value["capture_materialization"] = serde_json::to_value(capture)?; + } if matches!( materialized.evidence.status, @@ -1486,6 +1824,31 @@ fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Resu Ok(()) } +fn apply_capture_runtime_source_refs( + value: &mut serde_json::Value, + capture: &CaptureMaterializationEvidence, +) { + let Some(items) = value.pointer_mut("/corpus/items").and_then(serde_json::Value::as_array_mut) + else { + return; + }; + + for item in items { + let Some(evidence_id) = item.get("evidence_id").and_then(serde_json::Value::as_str) else { + continue; + }; + let Some(source_ref) = capture + .runtime_source_refs + .iter() + 
.find(|source_ref| source_ref.evidence_id == evidence_id) + else { + continue; + }; + + item["source_ref"] = source_ref.source_ref.clone(); + } +} + fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvidence { MaterializedJobEvidence { job_id: evidence.job_id.clone(), @@ -1501,6 +1864,7 @@ fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvid failure: evidence.failure.clone(), source_mappings: evidence.source_mappings.clone(), operator_debug: evidence.operator_debug.clone(), + capture: evidence.capture.clone(), } } @@ -1558,7 +1922,7 @@ fn load_jobs(path: &Path) -> color_eyre::Result> { for fixture in paths { let raw = fs::read_to_string(&fixture)?; - let value = serde_json::from_str::(&raw) + let value = serde_json::from_str::(&raw) .map_err(|err| eyre::eyre!("Failed to parse {} as JSON: {err}", fixture.display()))?; let job = serde_json::from_value::(value.clone()).map_err(|err| { eyre::eyre!("Failed to parse {} as real_world_job: {err}", fixture.display()) @@ -1631,7 +1995,11 @@ fn corpus_texts(loaded: &LoadedJob) -> color_eyre::Result> { }, }; - Ok(CorpusText { evidence_id: item.evidence_id.clone(), text: text.trim().to_string() }) + Ok(CorpusText { + evidence_id: item.evidence_id.clone(), + text: text.trim().to_string(), + capture: item.capture.clone(), + }) }) .collect() } @@ -1905,6 +2273,20 @@ fn split_long_token(token: &str) -> Vec { chunks } +fn capture_for_job( + loaded: &LoadedJob, + capture: CaptureMaterializationEvidence, +) -> Option { + if loaded.job.suite == "capture_integration" { Some(capture) } else { None } +} + +fn capture_action_str(action: LiveCaptureAction) -> &'static str { + match action { + LiveCaptureAction::Store => "store", + LiveCaptureAction::Exclude => "exclude", + } +} + async fn run_lightrag_async(args: LightragArgs) -> color_eyre::Result<()> { let jobs = load_jobs(&args.fixtures)?; let run_slug = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); @@ 
-2025,6 +2407,8 @@ async fn materialize_lightrag_job( source_mappings, operator_debug: None, operator_debug_evidence: None, + capture: None, + capture_failure: None, }, )) } @@ -2034,7 +2418,7 @@ async fn insert_lightrag_texts( client: &reqwest::Client, corpus: &[CorpusText], sources: &[LightragSource], -) -> color_eyre::Result { +) -> color_eyre::Result { let request = serde_json::json!({ "texts": corpus.iter().map(|item| item.text.as_str()).collect::>(), "file_sources": sources.iter().map(|source| source.file_source.as_str()).collect::>(), @@ -2053,14 +2437,14 @@ async fn insert_lightrag_texts( async fn wait_for_lightrag_index( args: &LightragArgs, client: &reqwest::Client, - insert_response: &Value, + insert_response: &serde_json::Value, expected_docs: usize, ) -> color_eyre::Result<()> { let track_id = insert_response .get("track_id") - .and_then(Value::as_str) + .and_then(serde_json::Value::as_str) .ok_or_else(|| eyre::eyre!("LightRAG text insert response did not include track_id."))?; - let mut last_status = Value::Null; + let mut last_status = serde_json::Value::Null; for _attempt in 1..=args.index_attempts { let status = @@ -2093,7 +2477,7 @@ async fn query_lightrag_context( args: &LightragArgs, client: &reqwest::Client, loaded: &LoadedJob, -) -> color_eyre::Result { +) -> color_eyre::Result { let keywords = lightrag_keywords(loaded.job.prompt.content.as_str()); let request = serde_json::json!({ "query": loaded.job.prompt.content, @@ -2116,7 +2500,7 @@ async fn lightrag_get_json( args: &LightragArgs, client: &reqwest::Client, path: impl AsRef, -) -> color_eyre::Result { +) -> color_eyre::Result { let url = format!("{}{}", lightrag_api_base(args), path.as_ref()); let mut request = client.get(url); @@ -2131,8 +2515,8 @@ async fn lightrag_post_json( args: &LightragArgs, client: &reqwest::Client, path: &str, - body: &Value, -) -> color_eyre::Result { + body: &serde_json::Value, +) -> color_eyre::Result { let url = format!("{}{}", lightrag_api_base(args), path); 
let mut request = client.post(url).json(body); @@ -2143,7 +2527,7 @@ async fn lightrag_post_json( lightrag_send_json(request).await } -async fn lightrag_send_json(request: RequestBuilder) -> color_eyre::Result { +async fn lightrag_send_json(request: RequestBuilder) -> color_eyre::Result { let response = request.send().await?; let status = response.status(); let body = response.text().await?; @@ -2241,9 +2625,11 @@ async fn materialize_elf_job( } let corpus = corpus_texts(loaded)?; + let stored_corpus = elf_stored_corpus_texts(&corpus)?; let project_id = project_id_for_job(&loaded.job.job_id); + let capture = + ingest_elf_corpus(service, loaded, adapter_id, project_id.as_str(), &corpus).await?; - ingest_elf_corpus(service, loaded, adapter_id, project_id.as_str(), &corpus).await?; run_worker(runtime).await?; let started_at = Instant::now(); @@ -2268,12 +2654,26 @@ async fn materialize_elf_job( let mut evidence_ids = Vec::new(); for item in &response.items { - if let Some(evidence_id) = item.source_ref.get("evidence_id").and_then(Value::as_str) { + if let Some(evidence_id) = + item.source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + { push_unique(&mut evidence_ids, evidence_id.to_string()); } } - let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + let runtime_capture = capture_runtime_evidence_from_search_items(&response.items); + let capture = capture_with_runtime_source_refs(capture, &runtime_capture); + let capture_failure = validate_capture_runtime_evidence( + loaded.job.suite.as_str(), + &corpus, + &capture, + &runtime_capture, + ); + let selected = if let Some(failure) = &capture_failure { + SelectedEvidenceText { content: failure.clone(), evidence_ids: Vec::new() } + } else { + selected_required_corpus_texts(loaded, &stored_corpus, &evidence_ids) + }; let replay_command = elf_replay_command(response.trace_id, project_id.as_str()); let (operator_debug, operator_debug_evidence) = operator_debug_output( 
AdapterKind::ElfServiceRuntime, @@ -2300,6 +2700,8 @@ async fn materialize_elf_job( source_mappings: Vec::new(), operator_debug, operator_debug_evidence, + capture: capture_for_job(loaded, capture), + capture_failure, }, )) } @@ -2310,8 +2712,40 @@ async fn ingest_elf_corpus( adapter_id: &str, project_id: &str, corpus: &[CorpusText], -) -> color_eyre::Result<()> { +) -> color_eyre::Result { + let mut capture = CaptureMaterializationEvidence::default(); + for item in corpus { + if item.capture.action == LiveCaptureAction::Exclude { + push_unique(&mut capture.excluded_evidence_ids, item.evidence_id.clone()); + + continue; + } + + push_unique(&mut capture.stored_evidence_ids, item.evidence_id.clone()); + + if let Some(source_id) = item.capture.source_id.as_deref() { + push_unique(&mut capture.source_ids, source_id.to_string()); + } + + if item.capture.write_policy.is_some() { + ingest_elf_corpus_item( + service, + loaded, + adapter_id, + project_id, + item, + item.evidence_id.clone(), + item.text.clone(), + 0, + 1, + &mut capture, + ) + .await?; + + continue; + } + let chunks = note_text_chunks(item.text.as_str()); let chunk_count = chunks.len(); @@ -2321,47 +2755,96 @@ async fn ingest_elf_corpus( } else { format!("{}:chunk-{chunk_index:03}", item.evidence_id) }; - let response = service - .add_note(AddNoteRequest { - tenant_id: TENANT_ID.to_string(), - project_id: project_id.to_string(), - agent_id: AGENT_ID.to_string(), - scope: SCOPE.to_string(), - notes: vec![AddNoteInput { - r#type: "fact".to_string(), - key: Some(key), - text, - structured: None, - importance: 0.9, - confidence: 0.95, - ttl_days: None, - source_ref: serde_json::json!({ - "schema": "real_world_live_adapter/v1", - "adapter": adapter_id, - "job_id": loaded.job.job_id, - "evidence_id": item.evidence_id, - "chunk_index": chunk_index, - "chunk_count": chunk_count, - }), - write_policy: None, - }], - }) - .await - .map_err(|err| { - eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id) - 
})?; - - if !response.results.iter().any(|result| result.note_id.is_some()) { - return Err(eyre::eyre!( - "ELF add_note did not persist evidence {} chunk {} for {}.", - item.evidence_id, - chunk_index, - loaded.job.job_id - )); - } + + ingest_elf_corpus_item( + service, + loaded, + adapter_id, + project_id, + item, + key, + text, + chunk_index, + chunk_count, + &mut capture, + ) + .await?; + } + } + + Ok(capture) +} + +#[allow(clippy::too_many_arguments)] +async fn ingest_elf_corpus_item( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, + item: &CorpusText, + key: String, + text: String, + chunk_index: usize, + chunk_count: usize, + capture: &mut CaptureMaterializationEvidence, +) -> color_eyre::Result<()> { + let write_policy = item + .capture + .write_policy + .as_ref() + .map(|policy| write_policy_from_value(policy, item.evidence_id.as_str())) + .transpose()?; + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(key), + text, + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": item.evidence_id, + "source_id": item.capture.source_id.as_deref(), + "capture_action": capture_action_str(item.capture.action), + "evidence_binding": item.capture.evidence_binding.as_deref(), + "write_policy_applied": item.capture.write_policy.is_some(), + "chunk_index": chunk_index, + "chunk_count": chunk_count, + }), + write_policy, + }], + }) + .await + .map_err(|err| eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id))?; + + for result in &response.results { + if let Some(audit) = &result.write_policy_audit + && (!audit.exclusions.is_empty() || 
!audit.redactions.is_empty()) + { + capture.write_policy_audit_count += 1; + capture.write_policy_exclusion_count += audit.exclusions.len(); + capture.write_policy_redaction_count += audit.redactions.len(); } } + if !response.results.iter().any(|result| result.note_id.is_some()) { + return Err(eyre::eyre!( + "ELF add_note did not persist evidence {} chunk {} for {}.", + item.evidence_id, + chunk_index, + loaded.job.job_id + )); + } + Ok(()) } @@ -2431,3 +2914,137 @@ async fn run_worker(runtime: &BaselineRuntime) -> color_eyre::Result<()> { Ok(()) } + +#[cfg(test)] +mod tests { + use serde_json::Value; + + fn capture_item( + evidence_id: &str, + action: super::LiveCaptureAction, + source_id: Option<&str>, + evidence_binding: Option<&str>, + write_policy: Option, + ) -> super::CorpusText { + super::CorpusText { + evidence_id: evidence_id.to_string(), + text: "Public capture text.".to_string(), + capture: super::LiveCapturePolicy { + action, + source_id: source_id.map(ToString::to_string), + evidence_binding: evidence_binding.map(ToString::to_string), + write_policy, + }, + } + } + + fn capture_evidence( + stored: &[&str], + excluded: &[&str], + ) -> super::CaptureMaterializationEvidence { + super::CaptureMaterializationEvidence { + stored_evidence_ids: stored.iter().map(|id| (*id).to_string()).collect(), + excluded_evidence_ids: excluded.iter().map(|id| (*id).to_string()).collect(), + source_ids: Vec::new(), + write_policy_audit_count: 0, + write_policy_exclusion_count: 0, + write_policy_redaction_count: 0, + runtime_source_refs: Vec::new(), + } + } + + #[test] + fn capture_runtime_validation_requires_returned_source_id() { + let corpus = vec![capture_item( + "source-a", + super::LiveCaptureAction::Store, + Some("capture:a"), + None, + None, + )]; + let capture = capture_evidence(&["source-a"], &[]); + let runtime = super::capture_runtime_evidence_from_source_refs([&serde_json::json!({ + "evidence_id": "source-a", + "capture_action": "store" + })]); + let failure = 
super::validate_capture_runtime_evidence( + "capture_integration", + &corpus, + &capture, + &runtime, + ) + .expect("missing runtime source_id should fail capture validation"); + + assert!(failure.contains("did not return expected source_id capture:a")); + } + + #[test] + fn capture_runtime_validation_rejects_returned_excluded_evidence() { + let corpus = vec![capture_item( + "private-trap", + super::LiveCaptureAction::Exclude, + Some("capture:private"), + Some("negative_trap"), + None, + )]; + let capture = capture_evidence(&[], &["private-trap"]); + let runtime = super::capture_runtime_evidence_from_source_refs([&serde_json::json!({ + "evidence_id": "private-trap", + "source_id": "capture:private", + "capture_action": "store" + })]); + let failure = super::validate_capture_runtime_evidence( + "capture_integration", + &corpus, + &capture, + &runtime, + ) + .expect("returned excluded evidence should fail capture validation"); + + assert!(failure.contains("excluded evidence private-trap was returned by live search")); + } + + #[test] + fn capture_runtime_source_refs_are_written_into_generated_fixture() { + let mut value = serde_json::json!({ + "corpus": { + "items": [ + { + "evidence_id": "source-a", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "fixture" + } + } + ] + } + }); + let mut capture = capture_evidence(&["source-a"], &[]); + + capture.runtime_source_refs.push(super::CaptureRuntimeSourceRefEvidence { + evidence_id: "source-a".to_string(), + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "evidence_id": "source-a", + "source_id": "capture:a", + "capture_action": "store", + "evidence_binding": "source_ref" + }), + }); + + super::apply_capture_runtime_source_refs(&mut value, &capture); + + assert_eq!( + value + .pointer("/corpus/items/0/source_ref/source_id") + .and_then(serde_json::Value::as_str), + Some("capture:a") + ); + assert_eq!( + value + .pointer("/corpus/items/0/source_ref/evidence_binding") + 
.and_then(serde_json::Value::as_str), + Some("source_ref") + ); + } +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a8c7e927..dee50e09 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -48,6 +48,10 @@ fn retrieval_fixture_dir() -> PathBuf { .join("retrieval") } +fn capture_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("capture_integration") +} + fn consolidation_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("consolidation") } @@ -137,6 +141,21 @@ fn competitor_strength_adoption_report_json_path() -> Result { .join("2026-06-11-competitor-strength-adoption-report.json")) } +fn capture_write_policy_live_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-capture-write-policy-live-report.json")) +} + +fn capture_write_policy_live_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-capture-write-policy-live-report.md")) +} + fn temporal_history_competitor_gap_json_path() -> Result { Ok(workspace_root()? 
.join("docs") @@ -317,6 +336,39 @@ fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> Ok(()) } +#[test] +fn capture_integration_fixtures_score_redaction_and_source_ids() -> Result<()> { + let report = run_json_report_from(capture_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = array_at(&report, "/suites")?; + let capture = find_by_field(suites, "/suite_id", "capture_integration")?; + + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = array_at(&report, "/jobs")?; + let source_id = find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; + let redaction = find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; + + assert!(array_contains_str(source_id, "/produced_evidence", "source-id-release-summary")?); + assert!(array_contains_str(source_id, "/produced_evidence", "source-id-command-log")?); + assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert!( + redaction + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| !answer.contains("orchid-envelope")) + ); + + Ok(()) +} + #[test] fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")) @@ -373,7 +425,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report 
.pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(10) + Some(11) ); assert_eq!( report @@ -531,7 +583,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report @@ -555,7 +607,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/pass") .and_then(Value::as_u64), - Some(16) + Some(17) ); assert_eq!( report @@ -573,7 +625,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/ties") .and_then(Value::as_u64), - Some(8) + Some(9) ); assert_eq!( report @@ -585,7 +637,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(11) + Some(12) ); assert_eq!( report @@ -597,7 +649,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/tie") .and_then(Value::as_u64), - Some(8) + Some(9) ); assert_eq!( report @@ -615,7 +667,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report @@ -1272,9 +1324,149 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { Ok(()) } +#[test] +fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() -> Result<()> { + let workspace = workspace_root()?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let manifest = fs::read_to_string( + workspace + 
.join("apps/elf-eval/fixtures/real_world_external_adapters") + .join("memory_projects_manifest.json"), + )?; + + assert!(live_adapter.contains("fn is_elf_capture_live_adapter(")); + assert!(live_adapter.contains("suite == \"capture_integration\"")); + assert!(live_adapter.contains("write_policy_audit_count")); + assert!(live_adapter.contains("excluded_evidence_ids")); + assert!(live_adapter.contains("source_id")); + assert!(live_adapter.contains("runtime_source_refs")); + assert!(live_adapter.contains("validate_capture_runtime_evidence")); + assert!(live_adapter.contains("capture_failure")); + assert!(live_adapter.contains("The live adapter sweep has no encoded runtime path")); + assert!(manifest.contains("\"scenario_id\": \"live_capture_write_policy\"")); + assert!(manifest.contains("\"scenario_id\": \"capture_write_policy_hooks\"")); + assert!(manifest.contains("\"comparison_outcome\": \"blocked\"")); + assert!(manifest.contains("Four redaction, exclusion, source-id, evidence-binding")); + assert!(manifest.contains("no durable local session/capture path stores source ids")); + assert!(manifest.contains("hooks, timeline, observations, viewer capture")); + + Ok(()) +} + +#[test] +fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + capture_write_policy_live_report_path()?, + )?)?; + let markdown = fs::read_to_string(capture_write_policy_live_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.capture_write_policy_live_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-933")); + assert_eq!( + report + .pointer("/live_capture_results/elf_live_real_world/suite_status") + .and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report + 
.pointer("/live_capture_results/elf_live_real_world/encoded_job_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_capture_results/elf_live_real_world/redaction_leak_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/live_capture_results/qmd_live_real_world/suite_status") + .and_then(Value::as_str), + Some("not_encoded") + ); + + let jobs = array_at(&report, "/jobs")?; + let source_binding = find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; + let source_binding_refs = array_at(source_binding, "/runtime_source_refs")?; + let release_summary_ref = + find_by_field(source_binding_refs, "/evidence_id", "source-id-release-summary")?; + + assert!(array_contains_str(source_binding, "/source_ids", "capture:issue-comment-42")?); + assert_eq!( + release_summary_ref.pointer("/source_id").and_then(Value::as_str), + Some("capture:issue-comment-42") + ); + assert_eq!( + release_summary_ref.pointer("/evidence_binding").and_then(Value::as_str), + Some("source_ref") + ); + + let write_policy = find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; + + assert_eq!( + write_policy.pointer("/write_policy_redaction_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + write_policy + .pointer("/runtime_source_refs/0/write_policy_applied") + .and_then(Value::as_bool), + Some(true) + ); + + let boundary = find_by_field(jobs, "/job_id", "capture-integration-boundaries-001")?; + + assert!(array_contains_str(boundary, "/excluded_evidence_ids", "private-span-trap")?); + assert!(!array_contains_str(boundary, "/stored_evidence_ids", "private-span-trap")?); + assert!( + array_at(boundary, "/runtime_source_refs")? 
+ .iter() + .all(|item| item.pointer("/evidence_id").and_then(Value::as_str) + != Some("private-span-trap")) + ); + + let positions = array_at(&report, "/competitor_positions")?; + let qmd = find_by_field(positions, "/project", "qmd")?; + let agentmemory = find_by_field(positions, "/project", "agentmemory")?; + let claude_mem = find_by_field(positions, "/project", "claude-mem")?; + + assert_eq!(qmd.pointer("/position").and_then(Value::as_str), Some("untested")); + assert!(qmd.pointer("/reason").and_then(Value::as_str).is_some_and(|reason| { + reason.contains("typed not_encoded") && reason.contains("ELF self-check") + })); + assert_eq!(agentmemory.pointer("/position").and_then(Value::as_str), Some("blocked")); + assert!(agentmemory.pointer("/reason").and_then(Value::as_str).is_some_and(|reason| { + reason.contains("process-local StateKV Map") && reason.contains("in-memory index") + })); + assert_eq!(claude_mem.pointer("/position").and_then(Value::as_str), Some("untested")); + assert!( + claude_mem + .pointer("/reason") + .and_then(Value::as_str) + .is_some_and(|reason| reason.contains("hooks, timeline, observations")) + ); + assert!(markdown.contains("ELF now has live capture/write-policy self-check evidence")); + assert!(markdown.contains("not an ELF-over-qmd win")); + assert!(markdown.contains("runtime `source_ref` metadata returned by search")); + assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); + assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); + assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); + + Ok(()) +} + fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { let suites = array_at(adapter, "/suites")?; let capabilities = array_at(adapter, "/capabilities")?; + let adapter_id = adapter.pointer("/adapter_id").and_then(Value::as_str).unwrap_or_default(); let targeted = find_by_field(capabilities, "/capability", 
"targeted_live_pass")?; let full_pass = find_by_field(capabilities, "/capability", "full_suite_live_pass")?; let work_resume = find_by_field(suites, "/suite_id", "work_resume")?; @@ -1296,7 +1488,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res adapter .pointer("/result/evidence") .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("38 jobs across all 11 encoded suites")) + .is_some_and(|evidence| evidence.contains("40 jobs across all 11 encoded suites")) ); assert_eq!(trust_sot.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); @@ -1310,7 +1502,19 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + + if adapter_id == "elf_live_real_world" { + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!( + capture + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("4/4 capture_integration jobs")) + ); + } else { + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + } + assert_eq!(personalization.pointer("/status").and_then(Value::as_str), Some("pass")); Ok(()) @@ -1320,7 +1524,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), 
Some(40)); Ok(()) } @@ -2421,14 +2625,15 @@ fn assert_operator_facing_strength_profile_boundaries( iteration_direction: &str, ) { assert!(readme.contains("Full-suite live real-world adapter sweep after XY-899")); - assert!(readme.contains("fresh ELF sweep reports 18 pass")); - assert!(readme.contains("5 wrong_result, 2 blocked, and 13 not_encoded jobs")); + assert!(readme.contains("fresh ELF sweep reports 22 pass")); + assert!(readme.contains("5 wrong_result, 2 blocked, and 11 not_encoded jobs")); assert!(readme.contains("fresh qmd sweep reports")); - assert!(readme.contains("17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs")); - assert!(readme.contains("The difference is the")); + assert!(readme.contains("17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs")); + assert!(readme.contains("The differences are")); assert!(readme.contains("delete/TTL tombstone case")); + assert!(readme.contains("ELF-only capture/write-policy live self-checks")); assert!(readme.contains("qmd remains the local retrieval-debug UX reference")); - assert!(readme.contains("no broad ELF-over-qmd claim is allowed")); + assert!(readme.contains("no broad ELF-over-qmd claim")); assert!(readme.contains("qmd and OpenViking Strength-Profile Report - June 11, 2026")); assert!(benchmarking_index.contains("2026-06-11-qmd-openviking-strength-profile-report.md")); assert!( @@ -2497,9 +2702,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=8, ties=8, loses=1, untested=11`")); + assert!(markdown.contains("ELF scenario positions: `wins=8, ties=9, loses=1, untested=12`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=8, tie=8, loss=1, not_tested=8, blocked=1, non_goal=2`" + "Scenario comparison outcomes: 
`win=8, tie=9, loss=1, not_tested=8, blocked=2, non_goal=2`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -2786,8 +2991,8 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(38)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(36)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(40)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(38)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); @@ -2830,9 +3035,9 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(84) + Some(88) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(84)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(88)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); diff --git a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md new file mode 100644 index 00000000..cb6ff281 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md @@ -0,0 +1,75 @@ +# Capture/Write-Policy Live Report - June 11, 2026 + +Goal: Record the 
XY-933 live capture/write-policy evidence and competitor claim +boundaries. +Read this when: You need to know whether ELF has live evidence for capture redaction, +exclusions, source ids, evidence binding, and no secret leakage. +Inputs: `cargo make real-world-memory`, `cargo make real-world-memory-live-adapters`, +`apps/elf-eval/fixtures/real_world_memory/capture_integration/`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Outputs: Scenario-level capture results, live artifacts, and typed blocker reasons for +agentmemory and claude-mem capture breadth. + +## Verdict + +ELF now has live capture/write-policy self-check evidence. The ELF live service adapter +passes all 4 `capture_integration` jobs with zero redaction leaks and full required +evidence/source-ref/quote coverage. + +This is not a broad capture-hook superiority claim. ELF has a live self-check for the +currently encoded capture/write-policy suite, while qmd keeps those jobs typed +`not_encoded`; that makes qmd untested on this surface, not an ELF-over-qmd win. +Against agentmemory and claude-mem capture breadth, the comparison is still blocked +or untested because no durable local adapter evidence exists for their hook/viewer +capture paths. + +## Fresh Runs + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-memory` | pass | `tmp/real-world-memory/real-world-memory-report.json` | +| `cargo make real-world-memory-live-adapters` | pass | `tmp/real-world-memory/live-adapters/summary.json` | + +## ELF Capture Results + +| Job | Live status | Evidence coverage | Source-ref coverage | Redaction leaks | Capture evidence | +| --- | --- | ---: | ---: | ---: | --- | +| `capture-redaction-exclusion-001` | `pass` | `2/2` | `2/2` | `0` | Stores public decision and write-policy audit; excludes private text. | +| `capture-source-id-binding-001` | `pass` | `2/2` | `2/2` | `0` | Preserves `capture:issue-comment-42` and `capture:command-log-7`. 
| +| `capture-write-policy-redaction-001` | `pass` | `2/2` | `2/2` | `0` | Applies one write-policy redaction and preserves `capture:terminal-log-17`. | +| `capture-integration-boundaries-001` | `pass` | `4/4` | `4/4` | `0` | Preserves the no-live boundary for external hooks and viewer flows. | + +The ELF materialization artifact records: + +- stored evidence ids for captured public items; +- excluded evidence ids for private or trap inputs; +- runtime `source_ref` metadata returned by search, including copied source ids; +- write-policy audit, exclusion, and redaction counts; +- generated answers that contain no redaction trap text. + +## Comparison Boundary + +| Compared target | Position | Reason | +| --- | --- | --- | +| qmd live real-world adapter | `untested` | ELF executes and passes 4/4 live capture jobs; qmd keeps the same jobs typed `not_encoded`, so this remains an ELF self-check rather than a qmd comparison result. | +| agentmemory capture hooks | `blocked` | The current Docker baseline uses a process-local StateKV Map and in-memory index. No durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound output. | +| claude-mem capture/viewer flows | `untested` | The checked evidence exercises repository storage, lifecycle, progressive disclosure, and same-corpus retrieval only. Hooks, timeline, observations, viewer capture, and automatic capture review are not run against real-world jobs. | + +## Claims Allowed + +- ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, + evidence binding, and no secret leakage. +- qmd remains `not_encoded` for capture/write-policy jobs in the full live sweep. +- agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a + durable local capture artifact. +- claude-mem capture breadth is untested until a Docker-contained hook/viewer capture + runner exists. 
+ +## Claims Not Allowed + +- Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth. +- Do not use host-global hooks as benchmark evidence. +- Do not weaken ELF write-policy, redaction, or evidence-binding constraints for + benchmark convenience. +- Do not convert fixture-backed or live-baseline-only capture references into a live + real-world competitor pass. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 120c6b3d..041418f4 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -45,7 +45,10 @@ The remaining caveats are material: ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory - UI/export and claude-mem viewer workflows remain blocked or not encoded. + UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 + adds an ELF live capture/write-policy self-check, but agentmemory capture breadth + is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains + untested. ## Evidence Classes @@ -70,8 +73,9 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. 
| -| `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 40 jobs across 11 suites with 38 pass and 2 blocked production-ops operator boundaries. | +| `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | +| `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth. | | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. 
| | `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | @@ -93,7 +97,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. OpenMemory UI/export remains blocked and claude-mem UI remains not encoded, so this is not a broad viewer-product superiority claim. | XY-926 | -| Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded` | ELF fixture capture/write-policy jobs pass, but live capture integration and agentmemory/claude-mem capture hooks are not comparable yet. 
| XY-925, XY-926 | +| Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory comparison is `blocked`; claude-mem capture breadth is `not_encoded`, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | | Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | @@ -109,7 +113,8 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | | XY-925 | P1 | Backlog | First-generation OSS continuity and source-store adapters. | -| XY-926 | P1 | Backlog | Live operator-debugging, capture, consolidation, and knowledge-page suites. 
| +| XY-926 | P1 | Backlog | Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners. | +| XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked or untested. | | XY-927 | P1 | Backlog | Letta-style core-vs-archival memory comparison. | | XY-928 | P1 | Backlog | OpenViking context-trajectory and hierarchy benchmark. | | XY-929 | P2 | Backlog | Graph/RAG adapters beyond scored smokes. | @@ -126,6 +131,8 @@ results, or lifecycle failures into one aggregate leaderboard. - ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied. +- ELF live capture/write-policy self-checks pass for redaction, exclusions, source + ids, evidence binding, and no secret leakage. - ELF has a live temporal reconciliation loss against the benchmark expectation: five memory-evolution jobs remain `wrong_result`. - Most competitor strengths outside qmd retrieval are `not_tested`, `blocked`, @@ -142,6 +149,8 @@ results, or lifecycle failures into one aggregate leaderboard. behavior plus graph memory remain outside measured local OSS evidence. - Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice. +- Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the + current comparison is blocked or untested for their hook/viewer capture paths. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not claim graph/RAG parity from smoke-only evidence. 
diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 1f770b67..d042d0ec 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -26,11 +26,11 @@ is encoded and run at a comparable evidence class. Current boundary: - ELF and qmd have full-suite `live_real_world` sweeps, but neither has a full-suite - live pass. The fresh ELF sweep produced 38 jobs with 18 pass, 5 wrong_result, - 0 incomplete, 2 blocked, and 13 not_encoded; the fresh qmd sweep produced 17 pass, - 6 wrong_result, 0 incomplete, 2 blocked, and 13 not_encoded. -- ELF fixture evidence is strong: `cargo make real-world-memory` reports 38 jobs - across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. + live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, + 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, + 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 40 jobs + across 11 suites with 38 pass and 2 blocked production-ops operator boundaries. That proves the fixture contract, not live-service parity. - qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite @@ -72,13 +72,13 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Project | Strongest user-facing scenario | Current evidence | Measured status and proof | Unsupported or blocked status | Required benchmark before ELF claim | Borrow if stronger | | --- | --- | --- | --- | --- | --- | --- | -| ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. 
| `live_real_world`; supporting `fixture_backed`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`. Narrow operator-debug pass: `cargo make real-world-job-operator-ux-live-adapters`, `tmp/real-world-job/operator-ux-live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`; the narrow operator-debug slice now passes. | Full-suite live pass plus separate private-corpus and credentialed production-ops proof. | Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation. | +| ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. | `live_real_world`; supporting `fixture_backed`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`; live capture/write-policy suite passes 4/4 with zero redaction leaks. Narrow operator-debug pass: `cargo make real-world-job-operator-ux-live-adapters`, `tmp/real-world-job/operator-ux-live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`; the narrow operator-debug and live capture/write-policy slices now pass. | Full-suite live pass plus separate private-corpus proof, credentialed production-ops proof, and durable external capture-hook comparisons. | Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, agentmemory/claude-mem capture breadth, and graph/RAG navigation. 
| | qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass; the narrow operator-debug slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | -| agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start and real-world adapter coverage are missing. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | +| agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start, capture-hook persistence, and real-world adapter coverage are missing; current Docker baseline uses a process-local StateKV Map and in-memory index. 
| Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | | mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. | Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | | memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. | `not_encoded`: real-world source-of-truth, retrieval, and memory-evolution prompt adapters are not encoded; TTL/expiry is unsupported by the current CLI path. | Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. 
| | OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: hierarchical context trajectory is not encoded; same-corpus output still misses expected evidence. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | -| claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: progressive-disclosure real-world jobs are not encoded. | Durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | +| claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `not_encoded`: progressive-disclosure and hook/viewer capture real-world jobs are not encoded. | Durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | | RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. 
| `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | | LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | | GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | @@ -102,7 +102,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. | | Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. 
| llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | | Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence; claude-mem and OpenMemory UX remain `not_encoded` or blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | -| Capture/write policy | Fixture capture_integration passes; live capture_integration is `not_encoded`. | agentmemory, claude-mem. | agentmemory capture is `blocked`; claude-mem capture is `not_encoded`. | Run live capture/write-policy jobs proving redaction, exclusion, evidence binding, and no secret leakage. | +| Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks while preserving source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory capture is `blocked` by mocked/in-memory storage; claude-mem hook/viewer capture is `not_encoded`. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. 
| Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | | Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and hierarchy trajectory is `not_encoded`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | @@ -118,6 +118,7 @@ now explicit: | --- | --- | --- | --- | --- | | qmd deep retrieval/debug profile | New benchmark issue | yes | None after this matrix lands. | Stress profile plus trace-level retrieval-debug artifacts for qmd and ELF. | | agentmemory durable lifecycle adapter | `[ELF benchmark P0] Make external adapters lifecycle-durable and fail-typed` | yes | Durable local adapter path selection. | Update, delete, cold-start reload, work_resume, and capture/write-policy jobs. | +| agentmemory/claude-mem capture-hook breadth | Follow-up after XY-933 | yes | Docker-contained hook/viewer capture path with durable artifacts. | Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting. | | mem0/OpenMemory history and UI coverage | New adapter repair issue | yes | Comparable local OSS path for history/UI/readback evidence. | Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs. | | memsearch source-of-truth real-world coverage | New adapter repair issue | yes | Real-world prompt adapter over the canonical Markdown store. | Source-of-truth rebuild/reload jobs and retrieval-debug jobs that preserve baseline reindex/update/delete evidence without converting it into suite pass claims. 
| | OpenViking context trajectory | New benchmark issue after evidence output fix | yes | Evidence-bearing same-corpus retrieval output. | Hierarchical expansion, staged trajectory, and resume/retrieval evidence jobs. | diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 78a00da3..5948ba26 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -29,8 +29,8 @@ The strongest current statement is: ergonomics, but ELF now has a narrow live operator-debug win over qmd on trace hydration and candidate-drop visibility. - Many competitor strengths are still undermeasured: OpenViking context trajectory, - mem0/OpenMemory entity history and UI, agentmemory and claude-mem continuity - capture, Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, and + mem0/OpenMemory entity history and UI, agentmemory and claude-mem capture breadth, + Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, and llm-wiki/gbrain/graphify knowledge workflows. - The right next strategy is not to replace ELF with any one project. 
It is to keep ELF's evidence-bound core and absorb the best measured or plausible product @@ -44,18 +44,18 @@ The strongest current statement is: | Metric | Value | | --- | ---: | -| Jobs | `38` | +| Jobs | `40` | | Encoded suites | `11` | -| Pass | `36` | +| Pass | `38` | | Blocked | `2` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.947` | -| Evidence coverage | `84/84` | -| Expected evidence recall | `77/77` | +| Mean score | `0.950` | +| Evidence coverage | `88/88` | +| Expected evidence recall | `80/80` | This proves the fixture contract is broad and well controlled. It does not prove that every live adapter or every competitor runtime passes those scenarios. @@ -67,20 +67,21 @@ sweeps for ELF and qmd: | Adapter | Jobs | Pass | Wrong result | Incomplete | Blocked | Not encoded | Mean score | Evidence recall | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| ELF live service adapter | `38` | `18` | `5` | `0` | `2` | `13` | `0.525` | `41/77` | -| qmd live CLI adapter | `38` | `17` | `6` | `0` | `2` | `13` | `0.486` | `38/77` | +| ELF live service adapter | `40` | `22` | `5` | `0` | `2` | `11` | `0.599` | `50/80` | +| qmd live CLI adapter | `40` | `17` | `6` | `0` | `2` | `15` | `0.461` | `38/80` | Interpretation: -- This is a near tie for the currently encoded live real-world sweep, with ELF one - job ahead in this fresh run because qmd misses the delete/TTL tombstone job. +- ELF is five passes ahead in this full live sweep because qmd misses the delete/TTL + tombstone job and keeps the capture/write-policy suite typed `not_encoded`. - Both pass `trust_source_of_truth`, `work_resume`, `project_decisions`, `retrieval`, and `personalization`. - Both fail most `memory_evolution` live conflict evidence with `wrong_result`. 
-- Both leave consolidation, knowledge compilation, capture integration, and - production-ops operator boundaries as `not_encoded` or `blocked`. Operator - debugging has a separate narrow live slice: ELF passes it, while qmd remains - `wrong_result` for trace hydration and candidate-drop stage visibility. +- ELF now passes live `capture_integration`; qmd keeps that suite `not_encoded`. + Both leave consolidation, knowledge compilation, and production-ops operator + boundaries as `not_encoded` or `blocked`. Operator debugging has a separate narrow + live slice: ELF passes it, while qmd remains `wrong_result` for trace hydration and + candidate-drop stage visibility. ### Production Evidence @@ -134,7 +135,7 @@ one misleading score. | Consolidation | ELF fixture passes, but live proposal generation is not encoded. | Build reviewable derived proposals with source refs, confidence, unsupported-claim flags, and apply/defer/discard audit. | | Knowledge pages | ELF fixture pages pass; live knowledge generation is not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, and graphify reports behind rebuild/lint benchmarks. | | Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | -| Capture/write policy | Fixture capture boundary passes; live capture is not encoded. | Borrow agentmemory/claude-mem capture hooks while preserving redaction and evidence binding. | +| Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. 
| | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | | Personalization | ELF live personalization passes; mem0/OpenMemory and Letta are not encoded. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | | Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | @@ -145,13 +146,13 @@ one misleading score. | Project | Current evidence | User-facing strength | ELF direction | | --- | --- | --- | --- | -| ELF | `fixture_backed` plus `live_real_world`; live full sweep is `wrong_result`. | Evidence-linked memory service, strict provenance, rebuildable Qdrant, production backfill/restore proof. | Keep this as the core; do not weaken source-of-truth or typed failure semantics while adding product ergonomics. | +| ELF | `fixture_backed` plus `live_real_world`; live full sweep is `wrong_result`; live capture/write-policy self-check passes. | Evidence-linked memory service, strict provenance, rebuildable Qdrant, production backfill/restore proof. | Keep this as the core; do not weaken source-of-truth, write-policy, or typed failure semantics while adding product ergonomics. | | qmd | `live_real_world` plus `live_baseline_only`; targeted retrieval passes, full sweep is `wrong_result`. | Local retrieval-debug workflow, transparent CLI, weighted fusion, rerank, replayable commands. | Treat qmd as the retrieval-debug bar. ELF should match its introspection and local replay without becoming CLI-only. | -| agentmemory | `live_baseline_only`; current status is `lifecycle_fail`. | Coding-agent continuity, hooks, MCP/REST packaging, viewer/console observability. | Borrow capture breadth and continuity UX, but require durable lifecycle proof before claims. 
| +| agentmemory | `live_baseline_only`; current status is `lifecycle_fail`; capture breadth comparison is blocked by a process-local StateKV Map and in-memory index. | Coding-agent continuity, hooks, MCP/REST packaging, viewer/console observability. | Borrow capture breadth and continuity UX, but require durable lifecycle and capture artifact proof before claims. | | mem0/OpenMemory | `live_baseline_only`; basic local smoke now passes, while entity/preference history, hosted ecosystem, graph memory, and OpenMemory UI remain untested locally. | Entity-scoped memory, lifecycle/history surfaces, hosted ecosystem, OpenMemory UI. | Add entity/preference history and UI readback patterns, while keeping hosted claims out of local OSS benchmarks. | | memsearch | `live_baseline_only`; canonical Markdown reindex/reload smoke now passes, while real-world source-of-truth prompts remain unencoded. | Markdown-first canonical store and local reindex clarity. | Borrow local inspectability and canonical-file ergonomics, not file-as-authority semantics. | | OpenViking | `live_baseline_only` plus `research_gate`; current status is `wrong_result`. | Filesystem-like context model, hierarchy, staged context trajectory. | Add staged retrieval and trajectory scoring after same-corpus evidence output is correct. | -| claude-mem | `live_baseline_only`; current status is `wrong_result`. | Progressive disclosure, automatic capture, local viewer workflow. | Borrow progressive disclosure and viewer comfort; benchmark capture and operator-debugging live paths. | +| claude-mem | `live_baseline_only`; current status is `wrong_result`; hook/viewer capture breadth is not encoded. | Progressive disclosure, automatic capture, local viewer workflow. | Borrow progressive disclosure and viewer comfort; benchmark capture and operator-debugging live paths before claims. | | RAGFlow | `research_gate`; current status is `blocked`. | Full RAG application workflow with document/chunk/reference handles. 
| Use as a resource-aware RAG adapter benchmark, not as a current ELF competitor win/loss. | | LightRAG | `research_gate`; current status is `blocked`. | Lightweight graph/RAG context export and source-path citation shape. | Borrow context-export ideas for graph/RAG navigation after Docker proof. | | GraphRAG | `research_gate`; current status is `blocked`. | Graph summaries, document/text-unit tables, local/global search separation. | Borrow graph summary artifacts for knowledge pages and graph navigation after cost-bounded output proof. | @@ -167,8 +168,8 @@ one misleading score. ### P0 - Close Measured Quality Gaps -These are the highest leverage because current evidence already shows an ELF gap or a -near tie. +These are the highest leverage because current evidence already shows an ELF gap, a +close competitor surface, or a still-unmeasured product strength. 1. Live memory evolution correctness - Current state: fixture pass, live `wrong_result`. @@ -201,9 +202,12 @@ These improve day-to-day usefulness while preserving ELF's evidence-bound core. 1. Capture and continuity - Borrow from: agentmemory hook breadth and claude-mem automatic capture review. - - ELF shape: live ingestion must preserve redaction, excluded spans, source ids, - and write-policy audit. - - Benchmark gate: capture/write-policy live jobs with no secret leakage. + - Current state: ELF live capture/write-policy self-check passes; agentmemory is + blocked and claude-mem is not encoded for capture breadth. + - ELF shape: live ingestion must continue to preserve redaction, excluded spans, + source ids, and write-policy audit. + - Benchmark gate: durable agentmemory and claude-mem capture-hook runners with + no secret leakage and evidence-bound output. 2. Reviewable consolidation - Borrow from: managed memory dreaming and Always-On Memory Agent scheduling. @@ -250,9 +254,10 @@ These are needed for broad credibility but should not block personal production Do not claim: -- ELF beats qmd overall. 
ELF is one pass ahead in the fresh aggregate because qmd - misses the delete/TTL tombstone job, but neither adapter has full-suite live pass - evidence and qmd still owns stronger local retrieval-debug ergonomics. +- ELF beats qmd overall. ELF is five passes ahead in the fresh aggregate because qmd + misses the delete/TTL tombstone job and keeps capture/write-policy jobs + `not_encoded`, but neither adapter has full-suite live pass evidence and qmd still + owns stronger local retrieval-debug ergonomics. - ELF has full-suite live real-world pass evidence. It does not. - ELF has private-corpus production quality proof. The private profile currently fails closed without an operator-owned manifest. @@ -285,7 +290,7 @@ The next reporting work should be ordered by decision value: 1. ELF/qmd retrieval-debug deep profile. 2. ELF live memory-evolution repair report. 3. OpenMemory and claude-mem operator-debug UI/export runners. -4. Capture/write-policy live adapter report. +4. agentmemory and claude-mem capture-hook breadth report. 5. OpenViking context-trajectory report after evidence-bearing retrieval works. 6. RAG/graph adapter pack report after Docker-contained outputs map to evidence ids. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index e10ce945..e34534d2 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -6,8 +6,8 @@ Read this when: You need to answer whether ELF has enough empirical evidence to claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and agent-continuity projects. 
Inputs: Fresh local runs of `cargo make real-world-memory` and -`cargo make real-world-memory-live-adapters` in the current XY-898 lane after -adapter-report consistency repairs, plus +`cargo make real-world-memory-live-adapters` in the current XY-933 lane after live +capture/write-policy scoring, plus `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, `2026-06-11-competitor-strength-evidence-matrix.md`, and `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. @@ -22,18 +22,25 @@ tracked project's strongest scenario. What is proven today: -- ELF has a strong fixture-backed real-world benchmark contract: 38 jobs, 36 pass, +- ELF has a strong fixture-backed real-world benchmark contract: 40 jobs, 38 pass, 2 blocked operator boundaries, and no wrong results in the fixture aggregate. - ELF and qmd have comparable full-suite live real-world sweeps, but neither has a - full-suite live pass. ELF is one pass ahead in the fresh aggregate because qmd - misses the memory-evolution delete/TTL tombstone job. + full-suite live pass. ELF is five passes ahead in the fresh aggregate because qmd + misses the memory-evolution delete/TTL tombstone job and the capture/write-policy + suite is now ELF-only live evidence. +- ELF now has live capture/write-policy self-check evidence for redaction, exclusions, + source ids, evidence binding, and no secret leakage. This is not a broad + capture-hook win over agentmemory or claude-mem: agentmemory comparison is blocked + by mocked/in-memory storage, and claude-mem hook/viewer capture remains untested in + the Docker real-world job runner. - ELF is ahead on production-operation evidence among tracked systems because it has checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild evidence. - The current comparison still undermeasures most competitor strengths. 
OpenViking trajectory, mem0/OpenMemory entity history and UI, Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, graph/RAG navigation, agentmemory and - claude-mem capture/continuity, and knowledge-page workflows remain non-claims. + claude-mem continuity/capture breadth, and knowledge-page workflows remain + non-claims. The separate XY-932 operator-debug live slice now scores ELF against qmd for trace hydration and candidate-drop visibility, but does not cover OpenMemory or claude-mem UI flows. @@ -43,13 +50,13 @@ production," but the competitiveness objective remains open. ## Fresh Runs -These commands were run in the current XY-898 lane after adapter-report consistency -repairs: +These commands were run in the current XY-933 lane after live capture/write-policy +scoring: | Command | Result | Runtime | | --- | --- | ---: | -| `cargo make real-world-memory` | pass | 11.91 seconds | -| `cargo make real-world-memory-live-adapters` | pass | 121.51 seconds | +| `cargo make real-world-memory` | pass | 7.11 seconds | +| `cargo make real-world-memory-live-adapters` | pass | 137.66 seconds | The live adapter run emitted repeated Qdrant client/server compatibility warnings, but the command completed successfully and produced ELF and qmd JSON/Markdown reports. @@ -62,21 +69,21 @@ failure. 
| Metric | Value | | --- | ---: | -| Jobs | `38` | +| Jobs | `40` | | Encoded suites | `11` | -| Pass | `36` | +| Pass | `38` | | Blocked | `2` | | Wrong result | `0` | | Lifecycle fail | `0` | | Incomplete | `0` | | Not encoded | `0` | | Unsupported claim | `0` | -| Mean score | `0.947` | -| Mean latency | `4.411 ms` | -| Expected evidence recall | `77/77` | -| Evidence coverage | `84/84` | -| Source-ref coverage | `84/84` | -| Quote coverage | `84/84` | +| Mean score | `0.950` | +| Mean latency | `4.244 ms` | +| Expected evidence recall | `80/80` | +| Evidence coverage | `88/88` | +| Source-ref coverage | `88/88` | +| Quote coverage | `88/88` | This proves fixture contract breadth and scoring behavior. It does not prove every live adapter or competitor runtime can complete those jobs. @@ -87,19 +94,22 @@ live adapter or competitor runtime can complete those jobs. | Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | Evidence recall | Evidence coverage | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| ELF live service adapter | `38` | `18` | `5` | `2` | `13` | `0.525` | `5.100 ms` | `41/77` | `48/84` | -| qmd live CLI adapter | `38` | `17` | `6` | `2` | `13` | `0.486` | `691.163 ms` | `38/77` | `45/84` | +| ELF live service adapter | `40` | `22` | `5` | `2` | `11` | `0.599` | `6.980 ms` | `50/80` | `58/88` | +| qmd live CLI adapter | `40` | `17` | `6` | `2` | `15` | `0.461` | `792.543 ms` | `38/80` | `45/88` | -This supports a near tie on the currently encoded live real-world suite shape, with -ELF one job ahead because qmd misses the delete/TTL tombstone case. It does not -support a broad ELF-over-qmd claim because qmd remains the stronger retrieval-debug UX -reference and its deep profile is still not encoded. +This supports an ELF lead in the current full live sweep count, but not a broad +ELF-over-qmd claim. 
The lead is concentrated in the ELF-only capture/write-policy +self-check plus the delete/TTL tombstone case. qmd remains the stronger retrieval-debug +UX reference, and its deep profile is still not encoded. ### Live Suite Breakdown -ELF and qmd have the same status shape outside `memory_evolution`. The difference is +ELF and qmd have the same status shape outside `memory_evolution` and +`capture_integration`. The memory-evolution difference is `memory-evolution-delete-ttl-001`: ELF passes that job while qmd reports -`wrong_result`, leaving ELF at five memory-evolution wrong results and qmd at six. +`wrong_result`, leaving ELF at five memory-evolution wrong results and qmd at six. The +capture difference is that ELF now executes the capture/write-policy jobs through its +service runtime, while qmd keeps those jobs typed `not_encoded`. | Suite | Jobs | ELF breakdown | qmd breakdown | | --- | ---: | --- | --- | @@ -109,7 +119,7 @@ ELF and qmd have the same status shape outside `memory_evolution`. The differenc | `project_decisions` | `5` | `pass:5` | `pass:5` | | `personalization` | `1` | `pass:1` | `pass:1` | | `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | `wrong_result:6` | -| `capture_integration` | `2` | `not_encoded:2` | `not_encoded:2` | +| `capture_integration` | `4` | `pass:4` | `not_encoded:4` | | `consolidation` | `4` | `not_encoded:4` | `not_encoded:4` | | `knowledge_compilation` | `2` | `not_encoded:2` | `not_encoded:2` | | `operator_debugging_ux` | `1` | `not_encoded:1` | `not_encoded:1` | @@ -147,13 +157,13 @@ records `unique_project_names: 17` for the full project list including ELF. | Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`; narrow operator-debug live slice passes. 
| Full live memory evolution, live consolidation, live knowledge pages, live capture, live production ops, and broader operator UI runners. | Memory-evolution diagnostic report, then live capture/consolidation/knowledge reports and OpenMemory/claude-mem UI runners. | -| qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is one pass behind ELF because qmd misses the delete/TTL tombstone job; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | -| agentmemory | `live_baseline_only` | `lifecycle_fail`. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`; live capture/write-policy and narrow operator-debug slices pass. | Full live memory evolution, live consolidation, live knowledge pages, live production ops, competitor capture hooks, and broader operator UI runners. | Memory-evolution diagnostic report, then consolidation/knowledge reports plus agentmemory/claude-mem capture and OpenMemory/claude-mem UI runners. | +| qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. 
| qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | +| agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | | mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | | memsearch | `live_baseline_only` | Basic canonical Markdown reindex/reload smoke now passes; real-world prompt coverage remains `not_encoded`. | Markdown canonical store and local reindex clarity. | Source-of-truth and retrieval-debug real-world adapter report. | | OpenViking | `live_baseline_only` plus `research_gate` | Same-corpus retrieval is `wrong_result`; trajectory is `not_encoded`. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then staged trajectory report. | -| claude-mem | `live_baseline_only` | `wrong_result`. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. | +| claude-mem | `live_baseline_only` | `wrong_result`; capture breadth is `not_encoded` because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. | | RAGFlow | `research_gate` | `blocked`. | RAG app workflow with document/chunk references. | Tiny Docker evidence-smoke with `reference.chunks` mapped to evidence ids. | | LightRAG | `research_gate` | `blocked`. 
| Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | | GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | @@ -177,7 +187,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | | Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | | Operator debugging | Fixture aggregate passes; narrow ELF/qmd live operator-debug slice is scored with ELF `pass` and qmd `wrong_result`. | Narrow ELF/qmd live claim only: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; replay-command and repair-action clarity are tied. | OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | -| Capture/write policy | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | agentmemory/claude-mem style capture with redaction and evidence binding. | +| Capture/write policy | Fixture aggregate passes; ELF live service adapter passes 4/4 capture jobs with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | ELF has live self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. Against agentmemory/claude-mem capture breadth, the comparison remains blocked or untested. | Durable agentmemory and claude-mem capture-hook runners with evidence-bound output. | | Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. 
| Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | | Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | @@ -200,11 +210,11 @@ Order these by decision value, not implementation convenience: - Output: per-job evidence-link failure analysis for current-vs-historical facts, supersession, and relation temporal validity. -3. Live operator-debugging and capture/write-policy report - - Why: these are daily-use agent-memory qualities, currently fixture-only or - not_encoded in live sweeps. - - Output: trace hydration, raw-SQL avoidance, redaction, exclusion, write-policy, - and repair-action scoring. +3. External capture-hook report for agentmemory and claude-mem + - Why: ELF now has a live capture/write-policy self-check, but the strongest + agentmemory and claude-mem capture-breadth claims are still blocked or untested. + - Output: durable local capture artifacts, source ids, redaction/exclusion audit, + and typed blocker reasons when hooks or viewer capture cannot run in Docker. 4. Continuity and context-trajectory report - Why: agentmemory, claude-mem, and OpenViking represent real user expectations diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 6030af7b..ed78742a 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -92,6 +92,10 @@ cleanup, use `docs/guide/single_user_production.md`. competitor-strength adoption report with the bounded personal-production decision, scenario-level win/tie/loss/not-tested matrix, claim boundaries, and optimization issue queue. 
+- `2026-06-11-capture-write-policy-live-report.md`: XY-933 live capture/write-policy + report that scores ELF redaction, exclusions, source ids, evidence binding, and no + secret leakage while preserving typed blocked/untested boundaries for agentmemory + and claude-mem capture breadth. - `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 mem0/OpenMemory local OSS history, preference-correction, deletion-audit, personalization, and export-readback comparison with normalized diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index e4745d72..052c5638 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -155,8 +155,9 @@ including the retrieval-quality slice below. The suite currently encodes: issue status, deployment method, benchmark conclusion, and temporal relation cases. - `operator_debugging_ux`: trace-backed stage attribution that identifies where expected evidence was filtered, demoted, or selected against. -- `capture_integration`: write-policy audit behavior for redaction/private exclusion - and fixture-backed capture/integration boundary classification. +- `capture_integration`: write-policy audit behavior for redaction/private exclusion, + source-id preservation, evidence binding, no secret leakage, and fixture-backed + capture/integration boundary classification. - `production_ops`: interrupted generated backfill resume, backup/restore plus cold-start readback, resource-envelope interpretation, pinned OpenViking local embedding runtime/wrong-result classification, missing private manifest `blocked` @@ -222,24 +223,26 @@ research gates. Its `external_adapters` report section distinguishes: Current state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full encoded-suite sweep through `cargo make real-world-memory-live-adapters`. 
Each adapter -materializes generated runtime answers for 38 jobs across 11 suites before scoring. +materializes generated runtime answers for 40 jobs across 11 suites before scoring. The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still -passes, but the full sweep is not a full-suite pass: memory_evolution is -`wrong_result`, production_ops remains typed `incomplete`/`blocked`/`not_encoded`, and -consolidation, knowledge_compilation, operator_debugging_ux, and capture_integration -remain `not_encoded` for this live adapter path. qmd still also keeps its separate +passes, and ELF now passes the live `capture_integration` self-checks for redaction, +exclusions, source ids, evidence binding, and no secret leakage. The full sweep is +still not a full-suite pass: memory_evolution is `wrong_result`, production_ops keeps +operator-owned blocked boundaries, and consolidation, knowledge_compilation, and +operator_debugging_ux remain `not_encoded` for this live adapter path. qmd keeps +`capture_integration` typed `not_encoded` and still keeps its separate `live_baseline_only` same-corpus record for update/delete/cold-start checks; that record is not a real-world suite win. agentmemory is blocked on durable upstream -storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently -retain wrong-result or incomplete live-baseline states for the checked-in adapter -evidence. OpenViking now reaches its pinned Docker local embedding setup but remains a -same-corpus `wrong_result` until it returns evidence-bearing retrieval output. The -expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, -Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper -qmd/OpenViking profiles are `research_gate` records until their Docker-isolated -adapter runs are implemented. 
These typed states describe benchmark coverage; do not -convert setup weight, missing research, or unencoded suites into broad project quality -rankings. +storage for lifecycle proof and capture breadth. mem0/OpenMemory, memsearch, and +claude-mem currently retain wrong-result, not-encoded, or incomplete live-baseline +states for the checked-in adapter evidence. OpenViking now reaches its pinned Docker +local embedding setup but remains a same-corpus `wrong_result` until it returns +evidence-bearing retrieval output. The expanded RAG and graph-memory records for +RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, +gbrain, graphify, and deeper qmd/OpenViking profiles are `research_gate` records until +their Docker-isolated adapter runs are implemented. These typed states describe +benchmark coverage; do not convert setup weight, missing research, or unencoded suites +into broad project quality rankings. To run the full live adapter sweep for ELF and qmd: diff --git a/docs/research/2026-06-11-capture-write-policy-live-report.json b/docs/research/2026-06-11-capture-write-policy-live-report.json new file mode 100644 index 00000000..a00e9a5e --- /dev/null +++ b/docs/research/2026-06-11-capture-write-policy-live-report.json @@ -0,0 +1,220 @@ +{ + "schema": "elf.capture_write_policy_live_report/v1", + "report_id": "xy-933-capture-write-policy-live-report-2026-06-11", + "authority": "XY-933", + "created_at": "2026-06-11T14:31:00Z", + "commands": [ + { + "command": "cargo make real-world-memory", + "status": "pass", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "artifact": "tmp/real-world-memory/live-adapters/summary.json" + } + ], + "fixture_aggregate": { + "job_count": 40, + "pass": 38, + "blocked": 2, + "capture_integration": { + "encoded_job_count": 4, + "status": "pass", + "score_mean": 1.0, + "redaction_leak_count": 0, + 
"evidence_required_count": 10, + "evidence_covered_count": 10, + "source_ref_required_count": 10, + "source_ref_covered_count": 10 + } + }, + "live_capture_results": { + "elf_live_real_world": { + "suite_status": "pass", + "encoded_job_count": 4, + "redaction_leak_count": 0, + "expected_evidence_recall": 1.0, + "evidence_required_count": 10, + "evidence_covered_count": 10, + "source_ref_required_count": 10, + "source_ref_covered_count": 10, + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json", + "materialization_artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "qmd_live_real_world": { + "suite_status": "not_encoded", + "encoded_job_count": 4, + "redaction_leak_count": 0, + "expected_evidence_recall": 0.0, + "evidence_required_count": 10, + "evidence_covered_count": 0, + "source_ref_required_count": 10, + "source_ref_covered_count": 0, + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + } + }, + "jobs": [ + { + "job_id": "capture-redaction-exclusion-001", + "status": "pass", + "stored_evidence_ids": [ + "public-captured-decision", + "write-policy-audit" + ], + "excluded_evidence_ids": [ + "private-excluded-text" + ], + "source_ids": [ + "capture:linear-comment-933", + "capture:write-policy-audit-933" + ], + "runtime_source_refs": [ + { + "evidence_id": "public-captured-decision", + "source_id": "capture:linear-comment-933", + "evidence_binding": "source_ref", + "write_policy_applied": false + }, + { + "evidence_id": "write-policy-audit", + "source_id": "capture:write-policy-audit-933", + "evidence_binding": "source_ref", + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-source-id-binding-001", + "status": "pass", + "stored_evidence_ids": [ + "source-id-release-summary", + "source-id-command-log" + ], + "excluded_evidence_ids": [], + "source_ids": [ + "capture:issue-comment-42", + 
"capture:command-log-7" + ], + "runtime_source_refs": [ + { + "evidence_id": "source-id-release-summary", + "source_id": "capture:issue-comment-42", + "evidence_binding": "source_ref", + "write_policy_applied": false + }, + { + "evidence_id": "source-id-command-log", + "source_id": "capture:command-log-7", + "evidence_binding": "source_ref", + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-write-policy-redaction-001", + "status": "pass", + "stored_evidence_ids": [ + "redacted-source-message" + ], + "excluded_evidence_ids": [ + "redacted-private-token-trap" + ], + "source_ids": [ + "capture:terminal-log-17" + ], + "runtime_source_refs": [ + { + "evidence_id": "redacted-source-message", + "source_id": "capture:terminal-log-17", + "evidence_binding": "source_ref", + "write_policy_applied": true + } + ], + "write_policy_audit_count": 1, + "write_policy_redaction_count": 1, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-integration-boundaries-001", + "status": "pass", + "stored_evidence_ids": [ + "xy844-capture-log", + "agentmemory-hook-reference", + "claude-mem-viewer-reference", + "live-adapter-follow-up" + ], + "excluded_evidence_ids": [ + "private-span-trap" + ], + "source_ids": [], + "runtime_source_refs": [ + { + "evidence_id": "live-adapter-follow-up", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "agentmemory-hook-reference", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "xy844-capture-log", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "claude-mem-viewer-reference", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + 
} + ], + "competitor_positions": [ + { + "project": "qmd", + "position": "untested", + "reason": "ELF executes and passes 4/4 live capture jobs; qmd keeps capture_integration typed not_encoded in the same live sweep, so this is an ELF self-check rather than a qmd comparison result." + }, + { + "project": "agentmemory", + "position": "blocked", + "reason": "The current Docker baseline uses a process-local StateKV Map and in-memory index; no durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound output." + }, + { + "project": "claude-mem", + "position": "untested", + "reason": "Repository storage, lifecycle, progressive disclosure, and same-corpus retrieval are checked; hooks, timeline, observations, viewer capture, and automatic capture review are not run against real-world jobs." + } + ], + "claim_boundary": { + "allowed": [ + "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage.", + "qmd remains not_encoded for capture/write-policy jobs in the full live sweep.", + "agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a durable local capture artifact.", + "claude-mem capture breadth is untested until a Docker-contained hook/viewer capture runner exists." + ], + "not_allowed": [ + "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth.", + "Do not use host-global hooks as benchmark evidence.", + "Do not weaken ELF write-policy, redaction, or evidence-binding constraints for benchmark convenience.", + "Do not convert fixture-backed or live-baseline-only capture references into a live real-world competitor pass." 
+ ] + } +} diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 56ec65a5..670cf16f 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. 
XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture remains untested." ] }, "evidence_class_terms": [ @@ -39,12 +39,17 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries." + "claim": "ELF fixture aggregate covers 40 jobs across 11 suites with 38 pass and 2 blocked production-ops operator boundaries." }, { "command": "cargo make real-world-memory-live-adapters", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs." + "claim": "ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", + "claim": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, agentmemory is blocked, and claude-mem is untested for capture breadth." 
}, { "command": "cargo make real-world-job-operator-ux-live-adapters", @@ -269,20 +274,22 @@ "outcome": "not_tested", "evidence_classes": [ "fixture_backed", + "live_real_world", "live_baseline_only", "blocked", "not_encoded" ], - "measured_claim": "ELF fixture capture/write-policy jobs pass, but live capture integration remains not encoded and agentmemory/claude-mem capture hooks are not comparable yet.", + "measured_claim": "ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains not_encoded; agentmemory comparison is blocked by mocked/in-memory storage; claude-mem capture breadth is not_encoded because hooks, timeline, observations, viewer capture, and automatic capture review were not run against real-world jobs.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md" ], "follow_up_issues": [ - "XY-925", - "XY-926" + "XY-933", + "XY-925" ], - "caveat": "Future evidence must prove redaction, exclusions, evidence binding, and no secret leakage." + "caveat": "This is an ELF self-check and qmd not_encoded delta, not a broad capture-breadth win over agentmemory or claude-mem." }, { "scenario_id": "production_ops_restore_backfill", @@ -426,7 +433,13 @@ "issue": "XY-926", "priority": "P1", "state": "Backlog", - "gap": "Live operator-debugging, capture, consolidation, and knowledge-page suites." + "gap": "Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners." 
+ }, + { + "issue": "XY-933", + "priority": "P1", + "state": "Live ELF self-check encoded", + "gap": "Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked or untested." }, { "issue": "XY-927", @@ -466,7 +479,8 @@ "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate.", - "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied." + "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied.", + "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage." ], "not_allowed": [ "Do not claim ELF broadly beats qmd.", @@ -476,7 +490,8 @@ "Do not claim ELF beats Letta on core-vs-archival memory.", "Do not claim graph/RAG parity from smoke-only evidence.", "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", - "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice.", + "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the current comparison is blocked or untested for their hook/viewer capture paths." 
] } } diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index ab71c30e..e55042c4 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -1,73 +1,73 @@ { "schema": "elf.benchmark_measurement_coverage_audit/v2", "run_id": "2026-06-11-measurement-coverage-audit", - "source_revision": "current XY-898 lane after adapter-report consistency repairs", + "source_revision": "current XY-933 lane after live capture/write-policy scoring", "created_at": "2026-06-11", "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", "commands": [ { "command": "cargo make real-world-memory", "status": "pass", - "runtime_seconds": 11.91, + "runtime_seconds": 7.11, "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, { "command": "cargo make real-world-memory-live-adapters", "status": "pass", - "runtime_seconds": 121.51, + "runtime_seconds": 137.66, "artifact": "tmp/real-world-memory/live-adapters/" } ], "fixture_aggregate": { - "job_count": 38, + "job_count": 40, "encoded_suite_count": 11, - "pass": 36, + "pass": 38, "wrong_result": 0, "lifecycle_fail": 0, "incomplete": 0, "blocked": 2, "not_encoded": 0, "unsupported_claim": 0, - "mean_score": 0.947, - "mean_latency_ms": 4.411, - "expected_evidence_total": 77, - "expected_evidence_matched": 77, - "evidence_required_count": 84, - "evidence_covered_count": 84 + "mean_score": 0.95, + "mean_latency_ms": 4.244, + "expected_evidence_total": 80, + "expected_evidence_matched": 80, + "evidence_required_count": 88, + "evidence_covered_count": 88 }, "live_real_world_adapters": [ { "adapter": "ELF live service adapter", - "job_count": 38, + "job_count": 40, "encoded_suite_count": 11, - "pass": 18, + "pass": 22, "wrong_result": 5, "blocked": 2, - "not_encoded": 13, - "mean_score": 0.525, - 
"mean_latency_ms": 6.761, - "expected_evidence_total": 77, - "expected_evidence_matched": 41, - "evidence_required_count": 84, - "evidence_covered_count": 48 + "not_encoded": 11, + "mean_score": 0.599, + "mean_latency_ms": 6.98, + "expected_evidence_total": 80, + "expected_evidence_matched": 50, + "evidence_required_count": 88, + "evidence_covered_count": 58 }, { "adapter": "qmd live CLI adapter", - "job_count": 38, + "job_count": 40, "encoded_suite_count": 11, "pass": 17, "wrong_result": 6, "blocked": 2, - "not_encoded": 13, - "mean_score": 0.486, - "mean_latency_ms": 691.163, - "expected_evidence_total": 77, + "not_encoded": 15, + "mean_score": 0.461, + "mean_latency_ms": 792.543, + "expected_evidence_total": 80, "expected_evidence_matched": 38, - "evidence_required_count": 84, + "evidence_required_count": 88, "evidence_covered_count": 45 } ], - "live_suite_delta": "ELF passes memory-evolution-delete-ttl-001 while qmd reports wrong_result; other suite status shapes match.", + "live_suite_delta": "ELF passes memory-evolution-delete-ttl-001 while qmd reports wrong_result; ELF also passes the live capture/write-policy suite while qmd remains not_encoded for capture_integration.", "live_suite_breakdown": [ { "suite": "trust_source_of_truth", @@ -132,12 +132,12 @@ }, { "suite": "capture_integration", - "jobs": 2, + "jobs": 4, "elf_status_counts": { - "not_encoded": 2 + "pass": 4 }, "qmd_status_counts": { - "not_encoded": 2 + "not_encoded": 4 } }, { @@ -201,7 +201,8 @@ "not_encoded": 7 }, "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", - "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured." 
+ "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured.", + "xy933_update_note": "XY-933 adds live ELF capture/write-policy scoring: ELF passes 4/4 capture_integration jobs with zero redaction leaks, qmd remains not_encoded, agentmemory comparison is blocked by mocked/in-memory storage, and claude-mem capture hooks remain not_encoded." }, "claim_boundary": { "elf_vs_qmd": "near_tie_with_narrow_delete_ttl_elf_lead_not_overall_win", @@ -211,7 +212,7 @@ "qmd_deep_retrieval_debug", "OpenViking_context_trajectory", "mem0_OpenMemory_entity_history_ui", - "agentmemory_claude_mem_capture_continuity", + "agentmemory_claude_mem_capture_breadth", "Letta_core_vs_archival_memory", "Graphiti_Zep_temporal_graph", "RAG_graph_navigation", @@ -221,7 +222,7 @@ "next_reports": [ "ELF/qmd retrieval-debug deep profile", "ELF/qmd live memory-evolution diagnostic", - "Live operator-debugging and capture/write-policy report", + "External capture-hook report for agentmemory and claude-mem", "Continuity and context-trajectory report", "Personalization and core-memory report", "Knowledge and graph/RAG report pack" diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index f67d9d5f..528fc057 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -95,10 +95,10 @@ "unsupported_or_blocked_status": { "state": "blocked", "typed_reason": "private_manifest_and_provider_credentials", - "details": "Fixture production-ops keeps private corpus and provider credential gates blocked; the full live sweep keeps broader non-retrieval suites typed non-pass, while the narrow operator-debug slice now passes." 
+ "details": "Fixture production-ops keeps private corpus and provider credential gates blocked; the full live sweep keeps broader non-retrieval suites typed non-pass, while the narrow operator-debug and live capture/write-policy slices now pass." }, - "benchmark_before_claim": "A full-suite live_real_world pass plus separate private-corpus and credentialed production-ops evidence is required before broad live parity or production proof claims.", - "borrow_if_stronger": "Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation patterns where they remain stronger." + "benchmark_before_claim": "A full-suite live_real_world pass plus separate private-corpus, credentialed production-ops, and durable external capture-hook evidence is required before broad live parity, production, or capture-breadth claims.", + "borrow_if_stronger": "Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, agentmemory/claude-mem capture breadth, and graph/RAG navigation patterns where they remain stronger." }, { "project": "qmd", @@ -136,8 +136,8 @@ }, "unsupported_or_blocked_status": { "state": "blocked", - "typed_reason": "durable_lifecycle_adapter_missing", - "details": "Same-corpus retrieval can run, but durable cold-start and real-world job adapter coverage are blocked by the current adapter path." + "typed_reason": "durable_lifecycle_and_capture_adapter_missing", + "details": "Same-corpus retrieval can run, but durable cold-start, capture-hook persistence, and real-world job adapter coverage are blocked by the current process-local StateKV Map and in-memory index path." }, "benchmark_before_claim": "Add a durable local adapter that covers update, delete, cold-start reload, work resume, capture/write policy, and lifecycle-staleness jobs.", "borrow_if_stronger": "Borrow cross-agent hooks, packaging, continuity scenarios, and operator-visible viewer affordances." 
@@ -217,8 +217,8 @@ }, "unsupported_or_blocked_status": { "state": "not_encoded", - "typed_reason": "progressive_disclosure_real_world_jobs_not_encoded", - "details": "Current Docker evidence is not a clean retrieval pass and progressive-disclosure jobs are not encoded." + "typed_reason": "progressive_disclosure_and_capture_real_world_jobs_not_encoded", + "details": "Current Docker evidence is not a clean retrieval pass, and progressive-disclosure plus hook/viewer capture jobs are not encoded." }, "benchmark_before_claim": "Add durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure jobs.", "borrow_if_stronger": "Borrow progressive disclosure, automatic capture review loops, and local viewer/operator comfort." @@ -500,11 +500,11 @@ { "scenario_id": "capture_write_policy", "scenario": "capture/write policy", - "current_elf_evidence": "ELF fixture-backed capture_integration passes, but ELF live_real_world capture_integration is not_encoded.", + "current_elf_evidence": "ELF fixture-backed capture_integration passes, and ELF live_real_world capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding.", "strongest_competitor_or_reference": "agentmemory, claude-mem", - "current_competitor_evidence": "agentmemory capture_integration is blocked and claude-mem capture_integration is not_encoded.", - "current_state": "ELF fixture evidence is strongest, but live capture and write-policy behavior still needs runtime scoring.", - "next_measurement": "Run capture/write-policy jobs that prove redaction, exclusion, evidence binding, and no secret leakage through live ingestion paths." 
+ "current_competitor_evidence": "agentmemory capture_integration is blocked by mocked/in-memory storage and claude-mem hook/viewer capture is not_encoded.", + "current_state": "ELF has live capture/write-policy self-check evidence, but agentmemory and claude-mem capture-breadth comparisons remain blocked or untested.", + "next_measurement": "Run durable agentmemory and claude-mem capture-hook jobs that prove redaction, exclusion, evidence binding, source ids, and no secret leakage." }, { "scenario_id": "production_ops", @@ -567,6 +567,13 @@ "blocked_by": "Durable local adapter path selection.", "measurement": "Update, delete, cold-start reload, work_resume, and capture/write-policy jobs." }, + { + "workstream": "agentmemory/claude-mem capture-hook breadth", + "issue_or_candidate": "follow-up after XY-933", + "parallelizable": true, + "blocked_by": "Docker-contained hook/viewer capture path with durable artifacts.", + "measurement": "Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting." + }, { "workstream": "mem0/OpenMemory history and UI coverage", "issue_or_candidate": "new adapter repair issue", diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 5bb56574..3416f3f7 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -113,6 +113,18 @@ Each `items[]` entry MUST include: - `source_ref`: object; MAY be `{}` only for generated synthetic fixtures. - `created_at`: RFC3339 timestamp or `null` when time is intentionally irrelevant. +Each `items[]` entry MAY include: + +- `capture`: object used by live capture/write-policy materializers. Supported fields: + - `action`: `store` or `exclude`. `exclude` means the item is an expected capture + input but MUST NOT be stored in the evaluated memory system. 
+ - `source_id`: optional stable source identifier that MUST be preserved in the + resulting source reference when the item is stored. + - `evidence_binding`: optional label for the evidence-binding mode the live adapter + MUST preserve. + - `write_policy`: optional write-policy object applied before storage. Redactions + and exclusions from this policy MUST be counted in the materialization artifact. + Optional corpus fields: - `capture_behaviors`: object used by `capture_integration` jobs and fixture-backed