Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ args = [
# | baseline-backfill-10k-docker | command | |
# | baseline-backfill-100k-docker | command | |
# | baseline-soak-docker | command | |
# | openmemory-ui-export-readback | command | |

[tasks.baseline-live-docker]
workspace = false
Expand Down Expand Up @@ -342,6 +343,14 @@ args = [
"--remove-orphans",
]

# Dedicated task for the OpenMemory UI/export readback probe.
#
# Runs an inline script through `bash -lc` that:
#   1. fails fast on errors, unset variables, and pipeline failures (`set -euo pipefail`);
#   2. resolves the current git commit and appends `+dirty` when `git status --porcelain`
#      reports any change, then exports the result as ELF_BASELINE_ELF_HEAD;
#   3. exports ELF_BASELINE_PROJECTS=mem0;
#   4. runs the `baseline-runner` service from docker-compose.baseline.yml with
#      `--build --rm`.
[tasks.openmemory-ui-export-readback]
workspace = false
command = "bash"
args = [
"-lc",
"set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=mem0; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner",
]

[tasks.baseline-production-synthetic]
workspace = false
command = "bash"
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,12 @@ provider-backed ELF evidence was required.
typed blocked or incomplete without explicit service, resource, or provider setup.
These reports preserve the smoke-only boundary and do not create an ELF win claim
against graph/RAG strengths.
- mem0/OpenMemory history follow-up after XY-924: the local OSS mem0 adapter now
passes encoded preference correction history, entity-scoped personalization, local
`get_all` export-style readback, and deletion audit history in
`live-baseline-20260611113003`. The comparison records ELF as a loss on preference
- mem0/OpenMemory history follow-up after XY-924 and XY-931: the local OSS mem0
adapter now passes encoded preference correction history, entity-scoped
personalization, local `get_all` export-style readback, and deletion audit history.
The separate OpenMemory export-helper setup probe in `live-baseline-20260611122416`
records `blocked` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, so SDK `get_all`
is still not UI/export evidence. The comparison records ELF as a loss on preference
correction history, ties on scoped personalization and delete audit, `not_tested`
for local SDK export-style parity, `blocked` for OpenMemory UI/export, and
`non_goal` for hosted Platform export and optional graph memory in the local OSS
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"schema": "elf.real_world_external_adapter_manifest/v1",
"manifest_id": "real-world-memory-project-adapters-2026-06-11-mem0-history",
"manifest_id": "real-world-memory-project-adapters-2026-06-11-openmemory-ui-export",
"docker_isolation": {
"default": true,
"compose_file": "docker-compose.baseline.yml",
Expand Down Expand Up @@ -608,13 +608,13 @@
},
"run": {
"status": "pass",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded checks.",
"command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker",
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.",
"command": "cargo make openmemory-ui-export-readback",
"artifact": "tmp/live-baseline/live-baseline-report.json"
},
"result": {
"status": "pass",
"evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. It still does not launch the OpenMemory UI, hosted Platform export flow, optional graph memory, or a real_world_job prompt adapter.",
"evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.",
"artifact": "tmp/live-baseline/live-baseline-report.json"
},
"capabilities": [
Expand All @@ -626,7 +626,7 @@
{
"capability": "same_corpus_retrieval",
"status": "pass",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks."
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks."
},
{
"capability": "local_lifecycle_update_delete_reload",
Expand Down Expand Up @@ -656,7 +656,7 @@
{
"capability": "openmemory_ui_readback",
"status": "blocked",
"evidence": "The Docker live-baseline runner does not launch the OpenMemory web UI, dashboard authentication, or browser export flow. Local SDK get_all readback is measured separately and must not be reused as UI evidence."
"evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence."
},
{
"capability": "hosted_managed_memory_claims",
Expand Down Expand Up @@ -688,7 +688,7 @@
{
"suite_id": "operator_debugging_ux",
"status": "blocked",
"evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked because the Docker runner does not launch the web UI or hosted export flow."
"evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database."
}
],
"scenarios": [
Expand All @@ -708,7 +708,7 @@
"status": "pass",
"elf_position": "loses",
"comparison_outcome": "loss",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.",
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.",
"command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters",
"artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md"
},
Expand All @@ -718,7 +718,7 @@
"status": "pass",
"elf_position": "ties",
"comparison_outcome": "tie",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.",
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.",
"command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters",
"artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md"
},
Expand All @@ -728,7 +728,7 @@
"status": "pass",
"elf_position": "ties",
"comparison_outcome": "tie",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.",
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.",
"command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters",
"artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md"
},
Expand All @@ -738,7 +738,7 @@
"status": "pass",
"elf_position": "untested",
"comparison_outcome": "not_tested",
"evidence": "Fresh scoped baseline run live-baseline-20260611113003 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.",
"evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.",
"command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker",
"artifact": "tmp/live-baseline/mem0-checks.json"
},
Expand All @@ -748,8 +748,9 @@
"status": "blocked",
"elf_position": "untested",
"comparison_outcome": "blocked",
"evidence": "The local Docker runner does not launch OpenMemory UI/dashboard export, and hosted Platform export remains outside local OSS evidence. Basic lifecycle and local get_all readback are not reused as UI/export proof.",
"artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json"
"evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.",
"command": "cargo make openmemory-ui-export-readback",
"artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json"
},
{
"scenario_id": "hosted_platform_export",
Expand Down Expand Up @@ -778,7 +779,8 @@
}
],
"notes": [
"Separate local OSS mem0 evidence from hosted Platform and OpenMemory UI claims."
"Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.",
"A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus."
]
},
{
Expand Down
90 changes: 87 additions & 3 deletions apps/elf-eval/tests/real_world_job_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ fn competitor_strength_adoption_report_json_path() -> Result<PathBuf> {
.join("2026-06-11-competitor-strength-adoption-report.json"))
}

/// Returns the location of the temporal-history competitor gap report JSON,
/// `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`,
/// resolved against the workspace root.
///
/// # Errors
///
/// Propagates any error returned by `workspace_root()`.
fn temporal_history_competitor_gap_json_path() -> Result<PathBuf> {
    let root = workspace_root()?;
    let report_path = root
        .join("docs")
        .join("research")
        .join("2026-06-11-temporal-history-competitor-gap-report.json");

    Ok(report_path)
}

fn competitor_strength_matrix_path() -> Result<PathBuf> {
Ok(workspace_root()?
.join("docs")
Expand Down Expand Up @@ -399,7 +406,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) {
);
assert_eq!(
report.pointer("/external_adapters/manifest_id").and_then(Value::as_str),
Some("real-world-memory-project-adapters-2026-06-11-mem0-history")
Some("real-world-memory-project-adapters-2026-06-11-openmemory-ui-export")
);
assert_eq!(
report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool),
Expand Down Expand Up @@ -812,6 +819,20 @@ fn assert_first_generation_adapter_records(
Some("openmemory_ui_export_readback")
);
assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked"));
assert_eq!(
mem0.pointer("/scenarios/5/command").and_then(Value::as_str),
Some("cargo make openmemory-ui-export-readback")
);
assert_eq!(
mem0.pointer("/scenarios/5/artifact").and_then(Value::as_str),
Some("tmp/live-baseline/mem0-openmemory-ui-export.json")
);
assert!(
mem0.pointer("/capabilities/7/evidence")
.and_then(Value::as_str)
.is_some_and(|evidence| evidence.contains("export-helper setup probe")
&& evidence.contains("requires Docker access"))
);
assert_eq!(
mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str),
Some("non_goal")
Expand Down Expand Up @@ -1067,6 +1088,48 @@ fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> {
Ok(())
}

/// Checks that the OpenMemory UI/export probe is wired up end to end:
/// the Makefile task exists and scopes the run to mem0, the compose file
/// forwards the export container/user variables, the baseline script contains
/// the probe and its blocker markers, and the XY-931 report JSON records the
/// probe as blocked without treating SDK `get_all` as UI/export evidence.
#[test]
fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> {
    let root = workspace_root()?;
    let makefile_text = fs::read_to_string(root.join("Makefile.toml"))?;
    let compose_text = fs::read_to_string(root.join("docker-compose.baseline.yml"))?;
    let script_text = fs::read_to_string(root.join("scripts/live-baseline-benchmark.sh"))?;
    let report_text = fs::read_to_string(
        root.join("docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json"),
    )?;
    let report_json: Value = serde_json::from_str(&report_text)?;

    // Makefile: the dedicated task exists and limits the baseline to mem0.
    assert!(makefile_text.contains("[tasks.openmemory-ui-export-readback]"));
    assert!(makefile_text.contains("export ELF_BASELINE_PROJECTS=mem0"));

    // Compose: both export-helper environment variables are declared.
    assert!(compose_text.contains("ELF_MEM0_OPENMEMORY_EXPORT_USER_ID"));
    assert!(compose_text.contains("ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER"));

    // Script: the probe, its artifact name, and its blocker markers are present.
    let sdk_summary = "SDK same-corpus retrieval and every encoded SDK behavior check passed";
    assert!(script_text.contains("probe_mem0_openmemory_ui_export"));
    assert!(script_text.contains("mem0-openmemory-ui-export.json"));
    assert!(script_text.contains("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER"));
    assert!(script_text.contains("sdk_get_all_is_ui_export_evidence: false"));
    assert!(script_text.contains(sdk_summary));

    // Report: the classification and the claim boundaries match the blocked probe.
    let status = report_json.pointer("/classification/status").and_then(Value::as_str);
    assert_eq!(status, Some("blocked"));

    let reason_code = report_json.pointer("/classification/reason_code").and_then(Value::as_str);
    assert_eq!(reason_code, Some("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER"));

    let sdk_counts_as_ui_evidence = report_json
        .pointer("/same_corpus_boundary/sdk_get_all_is_ui_export_evidence")
        .and_then(Value::as_bool);
    assert_eq!(sdk_counts_as_ui_evidence, Some(false));

    let comparable_after_run = report_json
        .pointer("/claim_boundary/elf_can_compare_against_openmemory_ui_export_after_this_run")
        .and_then(Value::as_bool);
    assert_eq!(comparable_after_run, Some(false));

    Ok(())
}

fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> {
let suites = array_at(adapter, "/suites")?;
let capabilities = array_at(adapter, "/capabilities")?;
Expand Down Expand Up @@ -1432,6 +1495,9 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> {
let external_manifest = fs::read_to_string(external_adapter_manifest_path())?;
let retrieval_debug_profile =
serde_json::from_str::<Value>(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?;
let temporal_history = serde_json::from_str::<Value>(&fs::read_to_string(
temporal_history_competitor_gap_json_path()?,
)?)?;

assert!(
measurement_audit.contains(
Expand Down Expand Up @@ -1506,6 +1572,20 @@ fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> {

assert_competitor_strength_matrix_json(&competitor_matrix_json)?;

let openmemory_command = find_by_field(
array_at(&temporal_history, "/commands")?,
"/command",
"cargo make openmemory-ui-export-readback",
)?;

assert!(
openmemory_command
.pointer("/artifact")
.and_then(Value::as_str)
.is_some_and(|artifact| artifact.contains("tmp/live-baseline/mem0-checks.json")
&& artifact.contains("tmp/live-baseline/mem0-openmemory-ui-export.json"))
);

Ok(())
}

Expand Down Expand Up @@ -1680,12 +1760,16 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> {
assert_eq!(mem0.pointer("/measured_status").and_then(Value::as_str), Some("pass"));
assert_eq!(
mem0.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str),
Some("not_encoded")
Some("blocked")
);
assert_eq!(
mem0.pointer("/unsupported_or_blocked_status/typed_reason").and_then(Value::as_str),
Some("openmemory_export_helper_setup_blocked")
);
assert!(
mem0.pointer("/benchmark_before_claim")
.and_then(Value::as_str)
.is_some_and(|claim| claim.contains("preference/entity history"))
.is_some_and(|claim| claim.contains("OpenMemory product app import/export"))
);
assert_eq!(
openviking.pointer("/current_evidence_class").and_then(Value::as_str),
Expand Down
2 changes: 2 additions & 0 deletions docker-compose.baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ services:
ELF_BASELINE_BACKFILL_RESUME_PROBE: ${ELF_BASELINE_BACKFILL_RESUME_PROBE:-}
ELF_BASELINE_MAX_ELF_RSS_KB: ${ELF_BASELINE_MAX_ELF_RSS_KB:-1500000}
ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600}
ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER: ${ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER:-}
ELF_MEM0_OPENMEMORY_EXPORT_USER_ID: ${ELF_MEM0_OPENMEMORY_EXPORT_USER_ID:-}
ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-}
ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-}
ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke}
Expand Down
Loading