Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,9 @@ args = [
# | real-world-memory-knowledge | composite | |
# | real-world-memory-knowledge-json | command | |
# | real-world-memory-knowledge-report | command | |
# | real-world-first-generation-oss | composite | |
# | real-world-first-generation-oss-json | command | |
# | real-world-first-generation-oss-report | command | |
# | ragflow-docker-smoke | command | |
# | lightrag-docker-context-smoke | command | |
# | graphrag-docker-smoke | command | |
Expand Down Expand Up @@ -933,6 +936,55 @@ args = [
"tmp/real-world-memory/knowledge-report.md",
]

# Composite entry point for the first-generation OSS slice: it has no command
# of its own and only pulls in the report task through its dependency list.
[tasks.real-world-first-generation-oss]
workspace = false
dependencies = ["real-world-first-generation-oss-report"]

# Invokes the `real_world_job_benchmark` binary of the `elf-eval` package with
# its `run` subcommand over the first-generation OSS fixture directory and
# writes the result to tmp/real-world-memory/first-generation-oss/report.json.
# Each flag is kept on the same line as its value; element order is unchanged.
[tasks.real-world-first-generation-oss-json]
workspace = false
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    "--",
    "run",
    "--fixtures", "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss",
    "--out", "tmp/real-world-memory/first-generation-oss/report.json",
    "--run-id", "first-generation-oss-continuity-source-store",
    "--adapter-id", "fixture_first_generation_oss",
    "--adapter-name", "First-generation OSS fixture coverage",
]

# Invokes the `real_world_job_benchmark` binary of the `elf-eval` package with
# its `publish` subcommand, reading the JSON report and writing
# tmp/real-world-memory/first-generation-oss/report.md.
# The JSON task is listed as a dependency because its `--out` path is the
# `--report` input consumed here.
[tasks.real-world-first-generation-oss-report]
workspace = false
dependencies = ["real-world-first-generation-oss-json"]
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    "--",
    "publish",
    "--report", "tmp/real-world-memory/first-generation-oss/report.json",
    "--out", "tmp/real-world-memory/first-generation-oss/report.md",
]


# External memory pattern radar
# | task | type | cwd |
Expand Down
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,16 @@ provider-backed ELF evidence was required.
ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated
evidence, replay-command availability, and repair-action clarity. qmd ties replay
command and repair-action clarity but is `wrong_result` for trace hydration and
candidate-drop stage visibility. OpenMemory UI/export and claude-mem viewer flows
remain blocked or not encoded, so this is not a broad viewer-product claim.
candidate-drop stage visibility. OpenMemory UI/export remains blocked, and
claude-mem viewer flows remain blocked until Docker-contained hook/viewer evidence
exists, so this is not a broad viewer-product claim.
- First-generation OSS continuity/source-store follow-up after XY-925: `cargo make
real-world-first-generation-oss` emits a fixture-backed external-adapter slice for
agentmemory, memsearch, and claude-mem with 6 jobs, 4 pass, 2 blocked, and full
evidence/source-ref/quote coverage. It selects agentmemory's durable local path,
adds memsearch canonical Markdown source-store and retrieval-debug prompt coverage,
and records claude-mem progressive-disclosure/retrieval-repair coverage while
keeping hook and viewer/operator workflows blocked.
- Expanded adapter-pack coverage after XY-834: the real-world external adapter
manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG,
Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and deeper
Expand Down Expand Up @@ -201,14 +209,16 @@ provider-backed ELF evidence was required.
source refs, write-policy redaction audit counts, evidence binding, and no secret
leakage. qmd remains `not_encoded` for this suite. agentmemory capture comparison is
blocked by mocked/in-memory storage, and claude-mem hook/viewer capture remains
untested, so no broad capture-breadth superiority claim is allowed.
blocked until Docker-contained hook/viewer capture evidence exists, so no broad
capture-breadth superiority claim is allowed.
- The benchmark runner and report publisher are checked in and Docker-isolated:
`cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`,
`cargo make baseline-production-private-addendum`,
`cargo make baseline-backfill-10k-docker`,
`cargo make baseline-backfill-100k-docker`,
`cargo make baseline-soak-docker`, `cargo make baseline-live-report`,
`cargo make real-world-memory-live-adapters`, and
`cargo make real-world-memory-live-adapters`,
`cargo make real-world-first-generation-oss`, and
`cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles
are opt-in and do not run in normal checks.

Expand All @@ -225,6 +235,7 @@ Detailed evidence and interpretation:
- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md)
- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md)
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Single-User Production Runbook](docs/guide/single_user_production.md)
- Benchmark contract:
Expand Down Expand Up @@ -303,6 +314,7 @@ Detailed comparison, mechanism-level analysis, and source map:
- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md)
- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md)
- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md)
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)
- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "first-gen-agentmemory-durable-capture-blocked-001",
"suite": "capture_integration",
"title": "Select the durable agentmemory capture path before scoring hooks",
"encoding": {
"status": "blocked",
"reason": "agentmemory's current Docker baseline still uses a process-local SDK/KV mock, so work-resume and write-policy hook capture cannot be scored until a persistent local session, KV, and index path survives a fresh process.",
"follow_up": {
"title": "Wire agentmemory durable local session capture for work-resume jobs",
"reason": "The fair path is a Docker-contained adapter that persists the agentmemory observation log, KV store, and searchable index between capture and replay processes."
}
},
"corpus": {
"corpus_id": "first-generation-oss-agentmemory-2026-06-11",
"profile": "external_adapter",
"items": [
{
"evidence_id": "agentmemory-selected-durable-path",
"kind": "adapter_plan",
"text": "Selected agentmemory path: run capture hooks into a Docker-local session directory, persist the SDK KV store and searchable index, restart a fresh process, then score work_resume and write-policy prompts against that recovered store.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "agentmemory_durable_capture_path_blocked",
"evidence_id": "agentmemory-selected-durable-path"
},
"locator": {
"quote": "persist the SDK KV store and searchable index"
}
},
"created_at": "2026-06-11T10:00:00Z"
},
{
"evidence_id": "agentmemory-mock-boundary",
"kind": "adapter_blocker",
"text": "Current blocker: the live-baseline adapter registers agentmemory functions against a process-local StateKV Map and in-memory index, so it cannot prove cold-start recovery or hook capture durability.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "agentmemory_durable_capture_path_blocked",
"evidence_id": "agentmemory-mock-boundary"
},
"locator": {
"quote": "process-local StateKV Map and in-memory index"
}
},
"created_at": "2026-06-11T10:01:00Z"
},
{
"evidence_id": "agentmemory-pass-decoy",
"kind": "adapter_state",
"text": "Decoy: agentmemory same-corpus retrieval passing through the mock proves durable coding-agent continuity and write-policy capture.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "agentmemory_durable_capture_path_blocked",
"evidence_id": "agentmemory-pass-decoy"
}
},
"created_at": "2026-06-11T09:59:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_first_generation_oss",
"answer": {
"content": "agentmemory remains blocked for durable work-resume and write-policy hook capture. The selected local path is a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; the current StateKV Map and in-memory index cannot prove that.",
"claims": [
{
"claim_id": "selected_durable_path",
"text": "The selected local path persists the SDK KV store and searchable index across a fresh process.",
"evidence_ids": ["agentmemory-selected-durable-path"],
"confidence": "high"
},
{
"claim_id": "current_mock_blocker",
"text": "The current StateKV Map and in-memory index cannot prove durable continuity.",
"evidence_ids": ["agentmemory-mock-boundary"],
"confidence": "high"
}
],
"evidence_ids": ["agentmemory-selected-durable-path", "agentmemory-mock-boundary"],
"latency_ms": 1.0,
"cost": {
"currency": "USD",
"amount": 0.0,
"input_tokens": 0,
"output_tokens": 0
}
}
},
"capture_behaviors": {
"blocked": [
"agentmemory durable hook capture waits for a persistent Docker-local session, KV, and index path."
],
"notes": [
"Same-corpus mock retrieval is not promoted into work-resume or capture integration pass evidence."
]
}
},
"timeline": [
{
"event_id": "agentmemory-durable-path-selected",
"ts": "2026-06-11T10:00:00Z",
"actor": "benchmark",
"action": "selected_durable_adapter_path",
"evidence_ids": ["agentmemory-selected-durable-path"],
"summary": "The next fair agentmemory path must persist capture state across a fresh process."
},
{
"event_id": "agentmemory-mock-blocker-preserved",
"ts": "2026-06-11T10:01:00Z",
"actor": "benchmark",
"action": "kept_blocked_state",
"evidence_ids": ["agentmemory-mock-boundary"],
"summary": "The current in-memory adapter remains blocked for durable continuity."
}
],
"prompt": {
"role": "user",
"content": "What local agentmemory path should be used for work-resume and write-policy capture, and can the current mock be scored?",
"job_mode": "operate",
"constraints": ["cite_evidence", "state_blockers", "do_not_promote_mock_smoke"]
},
"expected_answer": {
"must_include": [
{
"claim_id": "selected_durable_path",
"text": "The selected local path persists the SDK KV store and searchable index across a fresh process."
},
{
"claim_id": "current_mock_blocker",
"text": "The current StateKV Map and in-memory index cannot prove durable continuity."
}
],
"must_not_include": [
"same-corpus retrieval passing through the mock proves durable coding-agent continuity"
],
"evidence_links": {
"selected_durable_path": ["agentmemory-selected-durable-path"],
"current_mock_blocker": ["agentmemory-mock-boundary"]
},
"answer_type": "blocked_plan",
"accepted_alternates": [],
"requires_caveat": true,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "agentmemory-selected-durable-path",
"claim_id": "selected_durable_path",
"requirement": "cite",
"quote": "persist the SDK KV store and searchable index"
},
{
"evidence_id": "agentmemory-mock-boundary",
"claim_id": "current_mock_blocker",
"requirement": "cite",
"quote": "process-local StateKV Map and in-memory index"
}
],
"negative_traps": [
{
"trap_id": "mock-smoke-durable-pass",
"type": "unsupported_prior",
"evidence_ids": ["agentmemory-pass-decoy"],
"failure_if_used": true
}
],
"scoring_rubric": {
"dimensions": {
"uncertainty_handling": {
"weight": 0.35,
"max_points": 1.0,
"criteria": "Keeps the durable path blocked until persistent state is proven."
},
"workflow_helpfulness": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "Names the concrete local path needed for the next adapter."
},
"evidence_grounding": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "Cites the selected path and the current mock boundary."
},
"trap_avoidance": {
"weight": 0.15,
"max_points": 1.0,
"criteria": "Does not promote the mock same-corpus smoke into durable continuity proof."
}
},
"pass_threshold": 0.85,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": [],
"fallback_action": "state_blocker"
},
"tags": ["external_adapter", "agentmemory", "capture_integration", "blocked", "no_live_claim"]
}
Loading