Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,9 @@ args = [
# | real-world-memory-summary | composite | |
# | real-world-memory-summary-json | command | |
# | real-world-memory-summary-report | command | |
# | real-world-memory-proactive-brief | composite | |
# | real-world-memory-proactive-brief-json | command | |
# | real-world-memory-proactive-brief-report | command | |
# | real-world-memory-live-consolidation | command | |
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
Expand Down Expand Up @@ -883,6 +886,55 @@ args = [
"tmp/real-world-memory/memory-summary/report.md",
]

# Composite entry point for the proactive-brief suite. It declares no command of
# its own and only pulls in the `-report` task, which depends on the `-json` task.
[tasks.real-world-memory-proactive-brief]
workspace = false
dependencies = ["real-world-memory-proactive-brief-report"]

# Runs the `run` subcommand of the `real_world_job_benchmark` binary (package
# `elf-eval`) against the proactive-brief fixture directory and writes the
# result to tmp/real-world-memory/proactive-brief/report.json.
[tasks.real-world-memory-proactive-brief-json]
workspace = false
command = "cargo"
args = [
    # Cargo side: choose the package and binary to run.
    "run",
    "-p",
    "elf-eval",
    "--bin",
    "real_world_job_benchmark",
    # Arguments after `--` go to the binary, not to cargo.
    "--",
    "run",
    # Input fixtures and output location.
    "--fixtures",
    "apps/elf-eval/fixtures/real_world_memory/proactive_brief",
    "--out",
    "tmp/real-world-memory/proactive-brief/report.json",
    # Run and adapter identifiers passed through to the benchmark.
    "--run-id",
    "real-world-memory-proactive-brief",
    "--adapter-id",
    "fixture_proactive_brief",
    "--adapter-name",
    "ELF proactive brief fixture",
]

# Runs the `publish` subcommand of the `real_world_job_benchmark` binary (package
# `elf-eval`) on the proactive-brief report.json and writes report.md next to it.
# Depends on the `-json` task, whose `--out` path is the `--report` input here.
[tasks.real-world-memory-proactive-brief-report]
workspace = false
dependencies = ["real-world-memory-proactive-brief-json"]
command = "cargo"
args = [
    # Cargo side: choose the package and binary to run.
    "run",
    "-p",
    "elf-eval",
    "--bin",
    "real_world_job_benchmark",
    # Arguments after `--` go to the binary, not to cargo.
    "--",
    "publish",
    # JSON report to read and Markdown file to write.
    "--report",
    "tmp/real-world-memory/proactive-brief/report.json",
    "--out",
    "tmp/real-world-memory/proactive-brief/report.md",
]

[tasks.real-world-memory-live-consolidation]
workspace = false
command = "bash"
Expand Down
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,17 +152,20 @@ provider-backed ELF evidence was required.
its pinned Docker local embedding path and is reported as `wrong_result` when
same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval
coverage remain typed non-pass states.
- Real-world agent memory aggregate after XY-952: 50 fixture-backed
jobs across 14 suites, 45 pass, 0 incomplete, 5 blocked, 0 wrong-result,
- Real-world agent memory aggregate after XY-953: 55 fixture-backed
jobs across 15 suites, 49 pass, 0 incomplete, 6 blocked, 0 wrong-result,
0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are
production-ops operator boundaries plus blocked OpenViking staged trajectory,
hierarchy selection, and recursive/context expansion measurement gates, not
hidden benchmark wins. The `core_archival_memory` suite passes 6 fixture jobs for
core block attachment, scope, provenance, stale-core detection, archival fallback,
and project-decision recovery; it does not create an ELF-over-Letta claim. The new
hierarchy selection, recursive/context expansion measurement gates, and the
private-corpus refresh blocker tied to XY-930, not hidden benchmark wins. The
`core_archival_memory` suite passes 6 fixture jobs for core block attachment, scope,
provenance, stale-core detection, archival fallback, and project-decision recovery;
it does not create an ELF-over-Letta claim. The
`memory_summary` fixture passes 1 source-trace job for reviewable top-of-mind,
background, stale, superseded, tombstoned, and derived project-profile entries; it
does not create a managed-memory parity claim.
does not create a managed-memory parity claim. The new `proactive_brief` fixture
scores 5 jobs, with 4 pass and 1 blocked private-corpus case; it does not create
a Pulse or hosted managed-memory parity claim.
- Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit
Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites
through `cargo make real-world-memory-live-adapters`. Both keep the original
Expand Down Expand Up @@ -268,6 +271,7 @@ Detailed evidence and interpretation:
- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md)
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Single-User Production Runbook](docs/guide/single_user_production.md)
- Benchmark contract:
Expand Down Expand Up @@ -349,6 +353,7 @@ Detailed comparison, mechanism-level analysis, and source map:
- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md)
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)
- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
"run": {
"status": "blocked",
"evidence": "The current fixture set reports 50 jobs across 14 suites: 45 pass, 0 incomplete, 5 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; context_trajectory remains blocked behind OpenViking staged-artifact materialization.",
"evidence": "The current fixture set reports 55 jobs across 15 suites: 49 pass, 0 incomplete, 6 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.",
"command": "cargo make real-world-memory",
"artifact": "tmp/real-world-memory/real-world-memory-report.json"
},
Expand Down Expand Up @@ -86,6 +86,16 @@
"status": "pass",
"evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation."
},
{
"suite_id": "memory_summary",
"status": "pass",
"evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards."
},
{
"suite_id": "proactive_brief",
"status": "blocked",
"evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930."
},
{
"suite_id": "knowledge_compilation",
"status": "pass",
Expand Down
Loading