diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 6dbe0c0b..152b1f15 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -580,7 +580,7 @@ }, "run": { "status": "wrong_result", - "evidence": "The current same-corpus retrieval result is typed wrong_result or incomplete in the checked-in benchmark evidence.", + "evidence": "The Docker runner exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, and cold-start reload; same-corpus retrieval remains typed wrong_result or incomplete when evidence is missed.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { @@ -599,11 +599,21 @@ "status": "wrong_result", "evidence": "The checked-in smoke evidence did not prove a correct same-corpus result for mem0." }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "real", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; any miss is reported as lifecycle_fail instead of pass." + }, { "capability": "openmemory_ui_readback", "status": "not_encoded", "evidence": "OpenMemory UI readback is not encoded in the Docker baseline or real-world job runner." }, + { + "capability": "hosted_managed_memory_claims", + "status": "not_encoded", + "evidence": "Hosted mem0 Platform behavior is outside the local OSS Docker adapter and is not counted as a local pass." 
+ }, { "capability": "real_world_job_adapter", "status": "not_encoded", @@ -613,8 +623,8 @@ "suites": [ { "suite_id": "memory_evolution", - "status": "incomplete", - "evidence": "mem0 lifecycle/history is a target dimension, but current Docker evidence has not produced a complete real-world job result." + "status": "wrong_result", + "evidence": "Local lifecycle checks are encoded in the Docker baseline, but real_world_job memory-evolution prompts are not executed and missed local evidence must remain typed non-pass." }, { "suite_id": "personalization", @@ -654,7 +664,7 @@ }, "run": { "status": "wrong_result", - "evidence": "The current same-corpus retrieval evidence is not a clean pass for memsearch.", + "evidence": "The Docker runner indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and records wrong_result or lifecycle_fail when expected evidence is missed.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { @@ -673,6 +683,11 @@ "status": "wrong_result", "evidence": "The checked-in smoke evidence did not prove correct same-corpus retrieval." }, + { + "capability": "reindex_update_delete_reload", + "status": "real", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes for update/delete/cold-start checks." + }, { "capability": "real_world_job_adapter", "status": "not_encoded", @@ -687,13 +702,13 @@ }, { "suite_id": "retrieval", - "status": "incomplete", - "evidence": "The live-baseline retrieval path is not a clean pass and no job-level run is encoded." + "status": "wrong_result", + "evidence": "The Docker same-corpus check reaches memsearch search, but current evidence is not a clean retrieval pass and no job-level run is encoded." }, { "suite_id": "memory_evolution", - "status": "incomplete", - "evidence": "Update/delete reindex semantics need a complete Docker evidence path before suite claims." 
+ "status": "wrong_result", + "evidence": "Update/delete reindex semantics are exercised in Docker; misses remain typed wrong_result or lifecycle_fail and do not become suite passes." } ], "evidence": [ @@ -823,7 +838,7 @@ }, "run": { "status": "wrong_result", - "evidence": "The current same-corpus SQLite repository search is not a clean pass for claude-mem and lifecycle checks are not encoded.", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", "artifact": "tmp/live-baseline/live-baseline-report.json" }, "result": { @@ -839,20 +854,30 @@ }, { "capability": "durable_storage", - "status": "mocked", - "evidence": "The current adapter uses in-memory SQLite and does not reopen a durable store." + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." }, { "capability": "progressive_disclosure_real_world_job", "status": "not_encoded", - "evidence": "search -> timeline -> observation workflows are not encoded against real_world_job prompts." + "evidence": "Hook, timeline, viewer, and observation workflows are not encoded against real_world_job prompts." } ], "suites": [ { "suite_id": "work_resume", - "status": "incomplete", - "evidence": "Hook-driven capture and progressive disclosure need a durable local repository run before work-resume suite claims." 
+ "status": "wrong_result", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." }, { "suite_id": "operator_debugging_ux", @@ -869,11 +894,11 @@ { "kind": "runner", "ref": "scripts/live-baseline-benchmark.sh", - "status": "mocked" + "status": "real" } ], "notes": [ - "claude-mem remains a UX reference; current Docker evidence is not a real-world progressive-disclosure pass." + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." ] }, { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 966a4b68..b8f14a81 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -269,7 +269,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/capability_status_counts/mocked") .and_then(Value::as_u64), - Some(2) + Some(1) ); assert_eq!( report @@ -292,7 +292,10 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?; let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; + let mem0 = find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?; + let memsearch = find_by_field(adapters, "/adapter_id", "memsearch_live_baseline")?; let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?; + let claude_mem = find_by_field(adapters, "/adapter_id", "claude_mem_live_baseline")?; let ragflow = find_by_field(adapters, "/adapter_id", "ragflow_research_gate")?; let lightrag = find_by_field(adapters, "/adapter_id", "lightrag_research_gate")?; let graphrag = 
find_by_field(adapters, "/adapter_id", "graphrag_research_gate")?; @@ -324,6 +327,9 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), Some("mocked") ); + + assert_first_generation_adapter_records(mem0, memsearch, claude_mem); + assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); @@ -377,6 +383,29 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Ok(()) } +fn assert_first_generation_adapter_records(mem0: &Value, memsearch: &Value, claude_mem: &Value) { + assert_eq!( + mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("local_lifecycle_update_delete_reload") + ); + assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("real")); + assert_eq!(mem0.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("reindex_update_delete_reload") + ); + assert_eq!(memsearch.pointer("/capabilities/2/status").and_then(Value::as_str), Some("real")); + assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real")); + assert_eq!( + claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("repository_progressive_disclosure") + ); + assert_eq!( + claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), + Some("not_encoded") + ); +} + fn assert_graphiti_zep_adapter(adapter: &Value) { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); diff --git 
a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index d1d08e6d..30377951 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -123,16 +123,22 @@ Current external same-corpus adapters: cold-start recovery is recorded as `blocked` until a persistent agentmemory KV/index path or hosted runtime is wired into the harness. - qmd: adds the corpus as a collection, embeds it locally, and runs structured hybrid - `query --json` for every query case. It also rewrites and deletes corpus files, - then reruns `qmd update`, `qmd embed -f`, and fresh `qmd query` processes. + `query --json` for every query case. It also works from a per-adapter corpus copy, + rewrites and deletes files in that copy, then reruns `qmd update`, `qmd embed -f`, + and fresh `qmd query` processes. - memsearch: indexes the corpus with the local ONNX embedder and runs CLI search. - It also rewrites and deletes corpus files, then reruns `memsearch index` and - fresh `memsearch search` processes. + It also works from a per-adapter corpus copy, rewrites and deletes files in that + copy, then reruns `memsearch index` and fresh `memsearch search` processes. - mem0: writes the corpus with `infer=false` and searches local FastEmbed + Qdrant path storage. It also runs public `Memory.update`, `Memory.delete`, and a new - `Memory.from_config` over the same local paths. No LLM inference is required. -- claude-mem: writes every corpus document into the SQLite memory repository and runs - repository search for every query case. + `Memory.from_config` over the same local paths from a per-adapter corpus copy. No + LLM inference is required. OpenMemory UI and hosted Platform behavior are not + counted as local OSS passes. 
+- claude-mem: writes every corpus document into a Docker-local durable SQLite memory + repository, runs repository search for every query case, updates one item, deletes + one item, reopens the same SQLite file with fresh repository instances, and checks + search-to-detail/source hydration. Hook, viewer, and full timeline progressive + disclosure remain separate from this local repository check. Current deeper checks: @@ -148,9 +154,13 @@ Current deeper checks: - agentmemory: same-corpus retrieval and delete suppression are exercised; update replacement is probed through superseding `mem::remember`; cold-start recovery is `blocked` because the current adapter runs against an in-memory SDK/KV mock. -- claude-mem and OpenViking: same-corpus retrieval only when their local runtime path - can complete. Update, delete, and recovery checks are `not_encoded` for these two - adapters. +- claude-mem: same-corpus retrieval, update replacement, delete suppression, + cold-start search recovery, and repository-level progressive detail/source + hydration through a durable local SQLite repository. Hook, viewer, and full timeline + progressive disclosure remain `not_encoded` until a real adapter executes those + surfaces. +- OpenViking: same-corpus retrieval only when its local runtime path can complete. + Update, delete, and recovery checks are `not_encoded` for this adapter. - Concurrent write, soak stability, and resource-envelope checks are currently encoded for ELF. They are not yet encoded for the external adapters. 
Multi-hour production soak is still operator-controlled through `ELF_BASELINE_SOAK_SECONDS`; the checked-in diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index d6f96758..fe607648 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -722,6 +722,16 @@ clone_project() { return 1 } +prepare_project_corpus() { + local project="$1" + local target="${WORK_DIR}/corpus-${project}" + + rm -rf "${target}" + mkdir -p "${target}" + cp -R "${CORPUS_DIR}/." "${target}/" + echo "${target}" +} + finish_report() { jq -s \ --arg schema "elf.live_baseline.report/v1" \ @@ -1393,6 +1403,7 @@ project_qmd() { local status_path="${REPORT_DIR}/${project}-status.txt" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-qmd.mjs" local home="${HOME_DIR}/${project}" + local corpus_path local head mkdir -p "${home}" cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' @@ -1441,6 +1452,7 @@ JSON json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "install/build failed" "${project}.log" "npm install/build" return fi + corpus_path="$(prepare_project_corpus "${project}")" cat >"${driver_path}" <<'JS' import { execFileSync } from "node:child_process"; @@ -1688,7 +1700,7 @@ writeFileSync( JS if run_cmd "${project}: embedded retrieval" 900 "${log_path}" \ - "export HOME='${home}'; export XDG_CACHE_HOME='/root/.cache'; export QMD_FORCE_CPU=1; cd '${REPOS_DIR}/${project}' && npx tsx src/cli/qmd.ts collection add '${CORPUS_DIR}' --name elfbench && npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts status > '${status_path}' && node '${driver_path}' '${query_result_path}' '${REPORT_DIR}/queries.json' '${CORPUS_DIR}'"; then + "export HOME='${home}'; export XDG_CACHE_HOME='/root/.cache'; export QMD_FORCE_CPU=1; cd '${REPOS_DIR}/${project}' && npx tsx src/cli/qmd.ts collection add '${corpus_path}' --name elfbench && npx tsx src/cli/qmd.ts update && npx tsx 
src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts status > '${status_path}' && node '${driver_path}' '${query_result_path}' '${REPORT_DIR}/queries.json' '${corpus_path}'"; then if jq -e '.checks and .check_summary' "${query_result_path}" >/dev/null 2>&1; then jq '{check_summary, checks}' "${query_result_path}" >"${REPORT_DIR}/${project}-checks.json" fi @@ -1725,6 +1737,7 @@ project_memsearch() { local home="${HOME_DIR}/${project}" local result_path="${REPORT_DIR}/${project}-search.json" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-memsearch.py" + local corpus_path local head mkdir -p "${home}" cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' @@ -1773,6 +1786,7 @@ JSON json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "pip install failed" "${project}.log" "pip install -e .[local,onnx]" return fi + corpus_path="$(prepare_project_corpus "${project}")" cat >"${driver_path}" <<'PY' import json @@ -1994,7 +2008,7 @@ out_path.write_text( PY if run_cmd "${project}: cli retrieval attempt" 240 "${log_path}" \ - "export HOME='${home}'; export ELF_MEMSEARCH_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export ELF_BASELINE_CORPUS_PATH='${CORPUS_DIR}'; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && memsearch --help && memsearch config set embedding.provider onnx && memsearch index '${CORPUS_DIR}' && python '${driver_path}'"; then + "export HOME='${home}'; export ELF_MEMSEARCH_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export ELF_BASELINE_CORPUS_PATH='${corpus_path}'; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && memsearch --help && memsearch config set embedding.provider onnx && memsearch index '${corpus_path}' && python '${driver_path}'"; then if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" 
fi @@ -2027,6 +2041,7 @@ project_mem0() { local result_path="${REPORT_DIR}/${project}-search.json" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-mem0.py" local home="${HOME_DIR}/${project}" + local corpus_path local head mkdir -p "${home}" cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' @@ -2078,6 +2093,7 @@ PY"; then json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "pip install or import failed" "${project}.log" "pip install -e . fastembed ollama; import Memory" return fi + corpus_path="$(prepare_project_corpus "${project}")" cat >"${driver_path}" <<'PY' import gc @@ -2396,7 +2412,7 @@ out_path.write_text( PY if run_cmd "${project}: local fastembed add/search" 900 "${log_path}" \ - "export HOME='${home}'; export ELF_MEM0_HOME='${home}'; export ELF_MEM0_RESULT_PATH='${result_path}'; export ELF_BASELINE_CORPUS_PATH='${CORPUS_DIR}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export MEM0_TELEMETRY=false; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then + "export HOME='${home}'; export ELF_MEM0_HOME='${home}'; export ELF_MEM0_RESULT_PATH='${result_path}'; export ELF_BASELINE_CORPUS_PATH='${corpus_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export MEM0_TELEMETRY=false; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi @@ -2731,39 +2747,47 @@ project_claude_mem() { local log_path="${REPORT_DIR}/${project}.log" local result_path="${REPORT_DIR}/${project}-search.json" local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-claude-mem.ts" + local home="${HOME_DIR}/${project}" + local corpus_path + local db_path="${HOME_DIR}/${project}/claude-mem.sqlite" local head + mkdir -p "${home}" cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' 
{ "schema": "elf.live_baseline.adapter_metadata/v1", "project": "claude-mem", "storage": { - "status": "mocked", - "detail": "The adapter uses claude-mem repository classes with an in-memory SQLite database for same-corpus search." + "status": "real", + "detail": "The adapter uses claude-mem repository classes with a durable SQLite file inside Docker for same-corpus and lifecycle checks." }, "behaviors": { "same_corpus_retrieval": { - "status": "mocked", - "surface": "MemoryItemsRepository.create/search over in-memory SQLite" + "status": "real", + "surface": "MemoryItemsRepository.create/search over a Docker-local SQLite database" }, "update": { - "status": "not_encoded", - "surface": "no update replacement check is encoded" + "status": "real", + "surface": "MemoryItemsRepository.update against the stored memory item id" }, "delete_or_expire": { - "status": "not_encoded", - "surface": "no delete or expiry check is encoded" + "status": "real", + "surface": "delete from the repository-owned SQLite memory_items table and verify FTS suppression" }, "expire": { "status": "unsupported", "surface": "no TTL/expiry behavior is encoded in the local adapter" }, "cold_start_reload": { - "status": "not_encoded", - "surface": "the current adapter uses :memory: SQLite and does not reopen a durable store" + "status": "real", + "surface": "new Database and repository instances over the same Docker-local SQLite file" + }, + "progressive_disclosure": { + "status": "real", + "surface": "search returns bounded memory items and detail/source hydration uses getById plus listSources" }, "scale_stress_profile": { "status": "incomplete", - "surface": "same-corpus smoke only until durable storage and lifecycle checks are encoded" + "surface": "durable smoke lifecycle path is encoded; scale/stress timing and resource thresholds are not yet calibrated" } } } @@ -2778,6 +2802,7 @@ JSON json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "npm install/build failed" 
"${project}.log" "npm install/build" return fi + corpus_path="$(prepare_project_corpus "${project}")" cat >"${driver_path}" <<'TS' import { readFileSync, readdirSync, writeFileSync } from "node:fs"; @@ -2789,8 +2814,9 @@ import { ProjectsRepository } from "./src/storage/sqlite/projects.ts"; const outPath = Bun.argv[2]; const corpusPath = Bun.argv[3]; const queriesPath = Bun.argv[4]; -if (!outPath || !corpusPath || !queriesPath) { - throw new Error("output path, corpus path, and query path are required"); +const dbPath = Bun.argv[5]; +if (!outPath || !corpusPath || !queriesPath || !dbPath) { + throw new Error("output path, corpus path, query path, and database path are required"); } type QueryCase = { @@ -2837,7 +2863,52 @@ function resultMatches(results: unknown[], query: QueryCase): boolean { }); } -const db = new Database(":memory:"); +function resultEntriesForSource(results: unknown[], source: string): unknown[] { + return results.filter((entry) => { + const files = (entry as { filesRead?: string[] }).filesRead ?? 
[]; + return files.includes(source); + }); +} + +function makeCheck( + name: string, + status: + | "pass" + | "wrong_result" + | "lifecycle_fail" + | "incomplete" + | "blocked" + | "not_encoded", + reason: string, + evidence: unknown, +) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks: Array<{ status: string }>) { + const wrongResult = checks.filter((check) => check.status === "wrong_result") + .length; + const lifecycleFail = checks.filter( + (check) => check.status === "lifecycle_fail", + ).length; + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: wrongResult + lifecycleFail, + wrong_result: wrongResult, + lifecycle_fail: lifecycleFail, + incomplete: checks.filter((check) => check.status === "incomplete").length, + blocked: checks.filter((check) => check.status === "blocked").length, + not_encoded: checks.filter((check) => check.status === "not_encoded") + .length, + }; +} + +function markerQuery(query: QueryCase): string { + return query.expected_terms.join(" "); +} + +const db = new Database(dbPath); db.run("PRAGMA foreign_keys = ON"); try { @@ -2865,8 +2936,10 @@ try { const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries as QueryCase[]; const topK = Number(process.env.ELF_BASELINE_TOP_K ?? 
"10"); - const created = docs.map((doc) => - memories.create({ + const created = []; + const createdBySource = new Map<string, ReturnType<typeof memories.create>>(); + for (const doc of docs) { + const item = memories.create({ projectId: project.id, kind: "manual", type: "fact", @@ -2877,8 +2950,16 @@ try { concepts: doc.concepts, filesRead: [doc.file], metadata: { source: doc.file }, - }), - ); + }); + const source = memories.addSource({ + memoryItemId: item.id, + sourceType: "import", + sourceUri: `file://${doc.file}`, + metadata: { source: doc.file }, + }); + created.push({ item, source }); + createdBySource.set(doc.file, item); + } const queryResults = queries.map((query) => { const results = memories.search(project.id, query.query, topK); @@ -2893,54 +2974,190 @@ try { }); const pass = queryResults.filter((result) => result.matched).length; const checks = [ - { - name: "same_corpus_retrieval", - status: pass === queryResults.length ? "pass" : "wrong_result", - reason: - pass === queryResults.length - ? "claude-mem repository search returned expected evidence for every query." 
+ : "claude-mem repository search missed one or more expected results.", + { total: queryResults.length, pass, fail: queryResults.length - pass, }, - }, - { - name: "update_replaces_note_text", - status: "not_encoded", - reason: "claude-mem update replacement is not encoded in this in-memory adapter.", - evidence: {}, - }, - { - name: "delete_suppresses_retrieval", - status: "not_encoded", - reason: "claude-mem delete or expiry behavior is not encoded in this in-memory adapter.", - evidence: {}, - }, - { - name: "cold_start_recovery_search", - status: "not_encoded", - reason: "claude-mem cold-start reload is not encoded because the adapter uses :memory: SQLite.", - evidence: {}, - }, + ), ]; - const wrongResult = checks.filter((check) => check.status === "wrong_result") - .length; - const lifecycleFail = checks.filter( - (check) => check.status === "lifecycle_fail", - ).length; - const checkSummary = { - total: checks.length, - pass: checks.filter((check) => check.status === "pass").length, - fail: wrongResult + lifecycleFail, - wrong_result: wrongResult, - lifecycle_fail: lifecycleFail, - incomplete: checks.filter((check) => check.status === "incomplete").length, - blocked: checks.filter((check) => check.status === "blocked").length, - not_encoded: checks.filter((check) => check.status === "not_encoded") - .length, + + const auth = createdBySource.get("auth-memory.md"); + if (!auth) { + checks.push( + makeCheck( + "update_replaces_note_text", + "incomplete", + "The auth memory item was not created, so update replacement could not be exercised.", + { source: "auth-memory.md" }, + ), + ); + } else { + const updateText = + "Rotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. 
It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation."; + const update = memories.update(auth.id, { + title: "Auth Memory Updated", + text: updateText, + narrative: updateText, + facts: [updateText], + concepts: conceptsFor("auth-memory.md"), + filesRead: ["auth-memory.md"], + metadata: { source: "auth-memory.md", lifecycle: "updated" }, + }); + const updateQuery: QueryCase = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResults = memories.search(project.id, markerQuery(updateQuery), topK); + const updateMatched = resultMatches(updateResults, updateQuery); + const oldMarkerAbsent = resultEntriesForSource(updateResults, "auth-memory.md") + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "lifecycle_fail", + updateMatched && oldMarkerAbsent + ? "claude-mem update returned the new marker and did not return the old marker for the updated memory item." 
+ : "claude-mem update did not cleanly replace the searchable auth memory item text.", + { + memory_item_id: auth.id, + update, + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + results: updateResults, + }, + ), + ); + } + + const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + createdBySource.has(query.expected_doc), + ); + if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery memory item was available, so delete suppression could not be exercised.", + { available_sources: Array.from(createdBySource.keys()).sort() }, + ), + ); + } else { + const deleteId = createdBySource.get(deleteQuery.expected_doc)!.id; + const deleteResult = db.prepare("DELETE FROM memory_items WHERE id = ?").run(deleteId); + const deleteResults = memories.search(project.id, markerQuery(deleteQuery), topK); + const deletedStillMatched = resultMatches(deleteResults, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "lifecycle_fail" : "pass", + deletedStillMatched + ? "claude-mem SQLite delete returned success but the deleted memory item was still searchable." + : "claude-mem SQLite delete suppressed the deleted memory item from subsequent FTS search.", + { + memory_item_id: deleteId, + source: deleteQuery.expected_doc, + query: deleteQuery, + changes: deleteResult.changes, + deleted_still_matched: deletedStillMatched, + results: deleteResults, + }, + ), + ); + } + + const progressQuery = + queries.find( + (query) => + query.expected_doc === "database-memory.md" || + (query.expected_doc !== "auth-memory.md" && + query.expected_doc !== deleteQuery?.expected_doc), + ) ?? 
queries[0]; + const progressResults = memories.search(project.id, markerQuery(progressQuery), topK); + const progressItem = progressResults.find((entry) => + ((entry as { filesRead?: string[] }).filesRead ?? []).includes( + progressQuery.expected_doc, + ), + ); + const detail = progressItem ? memories.getById(progressItem.id) : null; + const sources = detail ? memories.listSources(detail.id) : []; + const detailHasEvidence = + !!detail && + !!detail.text && + detail.facts.length > 0 && + detail.concepts.length > 0 && + detail.filesRead.includes(progressQuery.expected_doc); + const sourceHydrated = sources.some((source) => + source.sourceUri?.includes(progressQuery.expected_doc), + ); + checks.push( + makeCheck( + "progressive_disclosure_detail_hydration", + progressResults.length > 0 && detailHasEvidence && sourceHydrated + ? "pass" + : "lifecycle_fail", + progressResults.length > 0 && detailHasEvidence && sourceHydrated + ? "claude-mem search returned a bounded item that could be hydrated into detail and source evidence." + : "claude-mem search/detail/source hydration did not expose the expected progressive-disclosure evidence.", + { + query: progressQuery, + search_result_count: progressResults.length, + detail_has_evidence: detailHasEvidence, + source_hydrated: sourceHydrated, + detail, + sources, + }, + ), + ); + + db.close(); + + const reopenedDb = new Database(dbPath); + reopenedDb.run("PRAGMA foreign_keys = ON"); + const reopenedProjects = new ProjectsRepository(reopenedDb); + const reopenedMemories = new MemoryItemsRepository(reopenedDb); + const reopenedProject = + reopenedProjects.getByRootPath("/bench/corpus") ?? reopenedProjects.getById(project.id); + const recoveryQuery: QueryCase = { + id: "lifecycle-cold-start-recovery", + query: + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. 
Do not reintroduce per-row SQL calls in invoice rendering.", + expected_doc: "database-memory.md", + expected_terms: ["InvoiceLineBatcher", "N+1"], }; + const recoveryResults = reopenedProject + ? reopenedMemories.search(reopenedProject.id, markerQuery(recoveryQuery), topK) + : []; + const recoveryMatched = resultMatches(recoveryResults, recoveryQuery); + checks.push( + makeCheck( + "cold_start_recovery_search", + recoveryMatched ? "pass" : "lifecycle_fail", + recoveryMatched + ? "A new claude-mem repository instance reopened the durable SQLite file and retrieved persisted evidence." + : "A new claude-mem repository instance did not retrieve expected persisted evidence from the durable SQLite file.", + { + db_path: dbPath, + expected_doc: recoveryQuery.expected_doc, + matched: recoveryMatched, + results: recoveryResults, + }, + ), + ); + reopenedDb.close(); + + const checkSummary = summarizeChecks(checks); writeFileSync( outPath, @@ -2965,13 +3182,18 @@ try { 2, ), ); -} finally { - db.close(); +} catch (err) { + try { + db.close(); + } catch { + // Ignore close errors while surfacing the original benchmark failure. 
+ } + throw err; } TS - if run_cmd "${project}: same-corpus sqlite search" 300 "${log_path}" \ - "cd '${REPOS_DIR}/${project}' && bun '${driver_path}' '${result_path}' '${CORPUS_DIR}' '${REPORT_DIR}/queries.json'"; then + if run_cmd "${project}: same-corpus durable sqlite search" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && bun '${driver_path}' '${result_path}' '${corpus_path}' '${REPORT_DIR}/queries.json' '${db_path}'"; then if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi @@ -2988,14 +3210,14 @@ TS else retrieval_status="retrieval_wrong_result" fi - json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" return fi - json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "claude-mem same-corpus search did not produce a valid benchmark result" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "claude-mem same-corpus search did not produce a valid benchmark result" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" return fi - json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "claude-mem built, but same-corpus SQLite search did not pass in Docker" "${project}.log" "npm install/build; MemoryItemsRepository.create/search" + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "claude-mem built, but 
same-corpus SQLite search did not pass in Docker" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" } run_project "ELF" project_elf