Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 70 additions & 18 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -392,23 +392,26 @@ args = [


# Real-world job benchmark smoke
# | task | type | cwd |
# | -------------------------------- | --------- | --- |
# | real-world-job-smoke | composite | |
# | real-world-job-smoke-json | command | |
# | real-world-job-smoke-report | command | |
# | real-world-memory | composite | |
# | real-world-memory-json | command | |
# | real-world-memory-report | command | |
# | real-world-memory-evolution | composite | |
# | real-world-memory-evolution-json | command | |
# | real-world-memory-evolution-report | command | |
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
# | real-world-job-operator-ux-report | command | |
# | real-world-memory-retrieval | composite | |
# | real-world-memory-retrieval-json | command | |
# | real-world-memory-retrieval-report | command | |
# | task | type | cwd |
# | -------------------------------------- | --------- | --- |
# | real-world-job-smoke | composite | |
# | real-world-job-smoke-json | command | |
# | real-world-job-smoke-report | command | |
# | real-world-memory | composite | |
# | real-world-memory-json | command | |
# | real-world-memory-report | command | |
# | real-world-memory-evolution | composite | |
# | real-world-memory-evolution-json | command | |
# | real-world-memory-evolution-report | command | |
# | real-world-memory-consolidation | composite | |
# | real-world-memory-consolidation-json | command | |
# | real-world-memory-consolidation-report | command | |
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
# | real-world-job-operator-ux-report | command | |
# | real-world-memory-retrieval | composite | |
# | real-world-memory-retrieval-json | command | |
# | real-world-memory-retrieval-report | command | |

[tasks.real-world-job-smoke]
workspace = false
Expand Down Expand Up @@ -475,7 +478,7 @@ args = [
"--out",
"tmp/real-world-memory/real-world-memory-report.json",
"--run-id",
"real-world-memory-trust-resume-personalization",
"real-world-memory",
"--adapter-id",
"elf_real_world_memory_fixture",
"--adapter-name",
Expand Down Expand Up @@ -649,6 +652,55 @@ args = [
"tmp/real-world-memory/retrieval-report.md",
]

# Composite entry point for the consolidation benchmark: it has no command of
# its own and only pulls in the report task through `dependencies`.
[tasks.real-world-memory-consolidation]
workspace = false
dependencies = ["real-world-memory-consolidation-report"]

# Invokes the `real_world_job_benchmark` binary from the `elf-eval` package with
# the `run` subcommand against the consolidation fixtures directory, passing
# `tmp/real-world-memory/consolidation/report.json` as the `--out` path.
# Each flag is kept on the same line as its value so the pairs are easy to scan.
[tasks.real-world-memory-consolidation-json]
workspace = false
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    "--",
    "run",
    "--fixtures", "apps/elf-eval/fixtures/real_world_memory/consolidation",
    "--out", "tmp/real-world-memory/consolidation/report.json",
    "--run-id", "real-world-memory-consolidation",
    "--adapter-id", "fixture_consolidation",
    "--adapter-name", "ELF consolidation fixture",
]

# Runs the `publish` subcommand of `real_world_job_benchmark` with the
# consolidation `report.json` as `--report` and `report.md` as `--out`.
# Depends on the `-json` task, which is given that same `report.json` path as
# its `--out`.
[tasks.real-world-memory-consolidation-report]
workspace = false
dependencies = ["real-world-memory-consolidation-json"]
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    "--",
    "publish",
    "--report", "tmp/real-world-memory/consolidation/report.json",
    "--out", "tmp/real-world-memory/consolidation/report.md",
]


# Meta
# | task | type | cwd |
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ Detailed evidence and interpretation:
- [Single-User Production Runbook](docs/guide/single_user_production.md)
- Future benchmark contract:
[Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md).
This contract defines job-level suites for agent work, but no system win is claimed
under it until a runner encodes and reports those suites.
This contract defines job-level suites for agent work. Checked-in fixture runners now
cover a smoke work-resume slice and proposal-only consolidation cases through
`cargo make real-world-job-smoke` and `cargo make real-world-memory-consolidation`,
but those reports are fixture-level evidence and not live external-adapter wins.

Quick comparison snapshot (objective/high-level).
This table compares capability coverage, not overall project quality.
Expand Down
Loading