hack-ink · yvette-carlisle · Jun 10, 2026 · Jun 10, 2026
diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json
@@ -580,7 +580,7 @@
       },
       "run": {
         "status": "wrong_result",
-        "evidence": "The current same-corpus retrieval result is typed wrong_result or incomplete in the checked-in benchmark evidence.",
+        "evidence": "The Docker runner exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, and cold-start reload; same-corpus retrieval remains typed wrong_result or incomplete when evidence is missed.",
         "artifact": "tmp/live-baseline/live-baseline-report.json"
       },
       "result": {
@@ -599,11 +599,21 @@
           "status": "wrong_result",
           "evidence": "The checked-in smoke evidence did not prove a correct same-corpus result for mem0."
         },
+        {
+          "capability": "local_lifecycle_update_delete_reload",
+          "status": "real",
+          "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; any miss is reported as lifecycle_fail instead of pass."
+        },
         {
           "capability": "openmemory_ui_readback",
           "status": "not_encoded",
           "evidence": "OpenMemory UI readback is not encoded in the Docker baseline or real-world job runner."
         },
+        {
+          "capability": "hosted_managed_memory_claims",
+          "status": "not_encoded",
+          "evidence": "Hosted mem0 Platform behavior is outside the local OSS Docker adapter and is not counted as a local pass."
+        },
         {
           "capability": "real_world_job_adapter",
           "status": "not_encoded",
@@ -613,8 +623,8 @@
       "suites": [
         {
           "suite_id": "memory_evolution",
-          "status": "incomplete",
-          "evidence": "mem0 lifecycle/history is a target dimension, but current Docker evidence has not produced a complete real-world job result."
+          "status": "wrong_result",
+          "evidence": "Local lifecycle checks are encoded in the Docker baseline, but real_world_job memory-evolution prompts are not executed and missed local evidence must remain typed non-pass."
         },
         {
           "suite_id": "personalization",
@@ -654,7 +664,7 @@
       },
       "run": {
         "status": "wrong_result",
-        "evidence": "The current same-corpus retrieval evidence is not a clean pass for memsearch.",
+        "evidence": "The Docker runner indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and records wrong_result or lifecycle_fail when expected evidence is missed.",
         "artifact": "tmp/live-baseline/live-baseline-report.json"
       },
       "result": {
@@ -673,6 +683,11 @@
           "status": "wrong_result",
           "evidence": "The checked-in smoke evidence did not prove correct same-corpus retrieval."
         },
+        {
+          "capability": "reindex_update_delete_reload",
+          "status": "real",
+          "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes for update/delete/cold-start checks."
+        },
         {
           "capability": "real_world_job_adapter",
           "status": "not_encoded",
@@ -687,13 +702,13 @@
         },
         {
           "suite_id": "retrieval",
-          "status": "incomplete",
-          "evidence": "The live-baseline retrieval path is not a clean pass and no job-level run is encoded."
+          "status": "wrong_result",
+          "evidence": "The Docker same-corpus check reaches memsearch search, but current evidence is not a clean retrieval pass and no job-level run is encoded."
         },
         {
           "suite_id": "memory_evolution",
-          "status": "incomplete",
-          "evidence": "Update/delete reindex semantics need a complete Docker evidence path before suite claims."
+          "status": "wrong_result",
+          "evidence": "Update/delete reindex semantics are exercised in Docker; misses remain typed wrong_result or lifecycle_fail and do not become suite passes."
         }
       ],
       "evidence": [
@@ -823,7 +838,7 @@
       },
       "run": {
         "status": "wrong_result",
-        "evidence": "The current same-corpus SQLite repository search is not a clean pass for claude-mem and lifecycle checks are not encoded.",
+        "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.",
         "artifact": "tmp/live-baseline/live-baseline-report.json"
       },
       "result": {
@@ -839,20 +854,30 @@
         },
         {
           "capability": "durable_storage",
-          "status": "mocked",
-          "evidence": "The current adapter uses in-memory SQLite and does not reopen a durable store."
+          "status": "real",
+          "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search."
+        },
+        {
+          "capability": "repository_lifecycle",
+          "status": "real",
+          "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks."
+        },
+        {
+          "capability": "repository_progressive_disclosure",
+          "status": "real",
+          "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path."
         },
         {
           "capability": "progressive_disclosure_real_world_job",
           "status": "not_encoded",
-          "evidence": "search -> timeline -> observation workflows are not encoded against real_world_job prompts."
+          "evidence": "Hook, timeline, viewer, and observation workflows are not encoded against real_world_job prompts."
         }
       ],
       "suites": [
         {
           "suite_id": "work_resume",
-          "status": "incomplete",
-          "evidence": "Hook-driven capture and progressive disclosure need a durable local repository run before work-resume suite claims."
+          "status": "wrong_result",
+          "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check."
         },
         {
           "suite_id": "operator_debugging_ux",
@@ -869,11 +894,11 @@
         {
           "kind": "runner",
           "ref": "scripts/live-baseline-benchmark.sh",
-          "status": "mocked"
+          "status": "real"
         }
       ],
       "notes": [
-        "claude-mem remains a UX reference; current Docker evidence is not a real-world progressive-disclosure pass."
+        "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior."
       ]
     },
     {

diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -269,7 +269,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) {
 		report
 			.pointer("/external_adapters/summary/capability_status_counts/mocked")
 			.and_then(Value::as_u64),
-		Some(2)
+		Some(1)
 	);
 	assert_eq!(
 		report
@@ -292,7 +292,10 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
 	let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?;
 	let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?;
 	let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?;
+	let mem0 = find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?;
+	let memsearch = find_by_field(adapters, "/adapter_id", "memsearch_live_baseline")?;
 	let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?;
+	let claude_mem = find_by_field(adapters, "/adapter_id", "claude_mem_live_baseline")?;
 	let ragflow = find_by_field(adapters, "/adapter_id", "ragflow_research_gate")?;
 	let lightrag = find_by_field(adapters, "/adapter_id", "lightrag_research_gate")?;
 	let graphrag = find_by_field(adapters, "/adapter_id", "graphrag_research_gate")?;
@@ -324,6 +327,9 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
 		agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str),
 		Some("mocked")
 	);
+
+	assert_first_generation_adapter_records(mem0, memsearch, claude_mem);
+
 	assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result"));
 	assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate"));
 	assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked"));
@@ -377,6 +383,29 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> {
 	Ok(())
 }
 
+fn assert_first_generation_adapter_records(mem0: &Value, memsearch: &Value, claude_mem: &Value) {
+	assert_eq!(
+		mem0.pointer("/capabilities/2/capability").and_then(Value::as_str),
+		Some("local_lifecycle_update_delete_reload")
+	);
+	assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("real"));
+	assert_eq!(mem0.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded"));
+	assert_eq!(
+		memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str),
+		Some("reindex_update_delete_reload")
+	);
+	assert_eq!(memsearch.pointer("/capabilities/2/status").and_then(Value::as_str), Some("real"));
+	assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real"));
+	assert_eq!(
+		claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str),
+		Some("repository_progressive_disclosure")
+	);
+	assert_eq!(
+		claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str),
+		Some("not_encoded")
+	);
+}
+
 fn assert_graphiti_zep_adapter(adapter: &Value) {
 	assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate"));
 	assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked"));

diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md
@@ -123,16 +123,22 @@ Current external same-corpus adapters:
   cold-start recovery is recorded as `blocked` until a persistent agentmemory KV/index
   path or hosted runtime is wired into the harness.
 - qmd: adds the corpus as a collection, embeds it locally, and runs structured hybrid
-  `query --json` for every query case. It also rewrites and deletes corpus files,
-  then reruns `qmd update`, `qmd embed -f`, and fresh `qmd query` processes.
+  `query --json` for every query case. It also works from a per-adapter corpus copy,
+  rewrites and deletes files in that copy, then reruns `qmd update`, `qmd embed -f`,
+  and fresh `qmd query` processes.
 - memsearch: indexes the corpus with the local ONNX embedder and runs CLI search.
-  It also rewrites and deletes corpus files, then reruns `memsearch index` and
-  fresh `memsearch search` processes.
+  It also works from a per-adapter corpus copy, rewrites and deletes files in that
+  copy, then reruns `memsearch index` and fresh `memsearch search` processes.
 - mem0: writes the corpus with `infer=false` and searches local FastEmbed + Qdrant
   path storage. It also runs public `Memory.update`, `Memory.delete`, and a new
-  `Memory.from_config` over the same local paths. No LLM inference is required.
-- claude-mem: writes every corpus document into the SQLite memory repository and runs
-  repository search for every query case.
+  `Memory.from_config` over the same local paths from a per-adapter corpus copy. No
+  LLM inference is required. OpenMemory UI and hosted Platform behavior are not
+  counted as local OSS passes.
+- claude-mem: writes every corpus document into a Docker-local durable SQLite memory
+  repository, runs repository search for every query case, updates one item, deletes
+  one item, reopens the same SQLite file with fresh repository instances, and checks
+  search-to-detail/source hydration. Hook, viewer, and full timeline progressive
+  disclosure remain separate from this local repository check.
 
 Current deeper checks:
 
@@ -148,9 +154,13 @@ Current deeper checks:
 - agentmemory: same-corpus retrieval and delete suppression are exercised; update
   replacement is probed through superseding `mem::remember`; cold-start recovery is
   `blocked` because the current adapter runs against an in-memory SDK/KV mock.
-- claude-mem and OpenViking: same-corpus retrieval only when their local runtime path
-  can complete. Update, delete, and recovery checks are `not_encoded` for these two
-  adapters.
+- claude-mem: same-corpus retrieval, update replacement, delete suppression,
+  cold-start search recovery, and repository-level progressive detail/source
+  hydration through a durable local SQLite repository. Hook, viewer, and full timeline
+  progressive disclosure remain `not_encoded` until a real adapter executes those
+  surfaces.
+- OpenViking: same-corpus retrieval only when its local runtime path can complete.
+  Update, delete, and recovery checks are `not_encoded` for this adapter.
 - Concurrent write, soak stability, and resource-envelope checks are currently encoded
   for ELF. They are not yet encoded for the external adapters. Multi-hour production
   soak is still operator-controlled through `ELF_BASELINE_SOAK_SECONDS`; the checked-in