From 0addbb7106814c322828692437d8b382dbd05eed Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 11:15:58 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add real-world external adapter coverage contract","authority":"XY-864"} --- README.md | 7 +- .../memory_projects_manifest.json | 569 +++++++++++++++++ .../src/bin/real_world_job_benchmark.rs | 581 ++++++++++++++++++ .../tests/real_world_job_benchmark.rs | 110 ++++ .../benchmarking/live_baseline_benchmark.md | 7 + .../real_world_agent_memory_benchmark.md | 45 ++ .../research/comparison_external_projects.md | 8 + .../external_memory_improvement_plan.md | 5 + .../real_world_agent_memory_benchmark_v1.md | 86 +++ 9 files changed, 1417 insertions(+), 1 deletion(-) create mode 100644 apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json diff --git a/README.md b/README.md index c636f041..828d1821 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,12 @@ Detailed evidence and interpretation: This contract defines job-level suites for agent work. Checked-in fixture runners now cover a smoke work-resume slice and proposal-only consolidation cases through `cargo make real-world-job-smoke` and `cargo make real-world-memory-consolidation`, - but those reports are fixture-level evidence and not live external-adapter wins. + and `cargo make real-world-memory` now reports the first external adapter coverage + manifest for ELF, qmd, agentmemory, mem0/OpenMemory, claude-mem, memsearch, and + OpenViking. Those real-world reports still distinguish fixture-backed and + live-baseline-only evidence from true live real-world adapter runs; no external + project has a live real-world suite win until an adapter actually executes + `real_world_job` prompts and scoring. Quick comparison snapshot (objective/high-level). This table compares capability coverage, not overall project quality. diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json new file mode 100644 index 00000000..c66ebd56 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -0,0 +1,569 @@ +{ + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-10", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." + ] + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "wrong_result", + "evidence": "The current fixture set reports 27 jobs, 25 pass, 1 wrong_result, and 1 not_encoded.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures are encoded; one deliberate operator-debug wrong-result case is reported under operator_debugging_ux." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "The relation temporal-validity case is deliberately not_encoded until temporal graph validity is implemented." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The aggregate fixture set includes one deliberate wrong-result trace attribution case." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The redaction and capture-boundary fixture is encoded and passing." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results." + ], + "follow_up": { + "title": "[ELF benchmark vNext] Replace fixture-only ELF answers with live real-world adapter execution where appropriate", + "reason": "The current report proves fixture scoring, not an end-to-end live real-world memory service run." + } + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The current evidence is same-corpus live-baseline evidence only; no real_world_job qmd adapter is encoded yet.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No qmd adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "qmd is a retrieval-debug reference, but no real_world_job retrieval adapter run is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "Do not claim a qmd real-world suite pass until a real_world_job adapter executes qmd and records job-level evidence." + ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No agentmemory adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "evidence": [ + { + "kind": "guide", + "ref": "docs/guide/research/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The current same-corpus retrieval result is typed wrong_result or incomplete in the checked-in benchmark evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job mem0/OpenMemory adapter is encoded; local same-corpus evidence must not be upgraded to suite coverage.", + "artifact": "docs/guide/research/comparison_external_projects.md" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The checked-in smoke evidence did not prove a correct same-corpus result for mem0." + }, + { + "capability": "openmemory_ui_readback", + "status": "not_encoded", + "evidence": "OpenMemory UI readback is not encoded in the Docker baseline or real-world job runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "incomplete", + "evidence": "mem0 lifecycle/history is a target dimension, but current Docker evidence has not produced a complete real-world job result." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Entity-scoped personalization is not encoded as a real_world_job adapter run." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "OpenMemory inspection is not encoded in this runner." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 evidence from hosted Platform and OpenMemory UI claims." + ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The current same-corpus retrieval evidence is not a clean pass for memsearch.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job memsearch adapter is encoded; Markdown-first behavior remains a design reference.", + "artifact": "docs/guide/research/comparison_external_projects.md" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The checked-in smoke evidence did not prove correct same-corpus retrieval." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No memsearch adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "incomplete", + "evidence": "The Markdown-first source model is relevant, but no real_world_job source-of-truth run is encoded." + }, + { + "suite_id": "retrieval", + "status": "incomplete", + "evidence": "The live-baseline retrieval path is not a clean pass and no job-level run is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "incomplete", + "evidence": "Update/delete reindex semantics need a complete Docker evidence path before suite claims." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." + ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "incomplete", + "setup": { + "status": "incomplete", + "evidence": "OpenViking local-embed setup can fail in Docker while building or importing local embedding dependencies.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "incomplete", + "evidence": "The adapter cannot reliably reach same-corpus add_resource/find behavior until local embedding setup is pinned for Docker.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "incomplete", + "evidence": "No real_world_job OpenViking adapter is encoded; current blocker is dependency setup, not a quality claim.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "incomplete", + "evidence": "Docker local embedding dependency setup is not reliable in the current adapter." + }, + { + "capability": "context_trajectory", + "status": "not_encoded", + "evidence": "OpenViking staged/hierarchical retrieval is a reference dimension but is not encoded as a real_world_job run." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "incomplete", + "evidence": "The local embedding install blocker prevents a fair retrieval job run." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Stage trajectory readback is not encoded in this runner." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "incomplete" + } + ], + "notes": [ + "Record OpenViking as incomplete until Docker-compatible local embeddings are pinned; do not treat setup weight as a negative quality result." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Pin OpenViking Docker local embedding dependency path", + "reason": "The current adapter must reach add_resource/find before real-world job suites can be scored." + } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The current same-corpus SQLite repository search is not a clean pass for claude-mem and lifecycle checks are not encoded.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "docs/guide/research/comparison_external_projects.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "mocked", + "evidence": "The current adapter uses in-memory SQLite and does not reopen a durable store." + }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "not_encoded", + "evidence": "search -> timeline -> observation workflows are not encoded against real_world_job prompts." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "incomplete", + "evidence": "Hook-driven capture and progressive disclosure need a durable local repository run before work-resume suite claims." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Local viewer/operator workflow is not encoded in the benchmark runner." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "claude-mem hooks are not executed by this runner." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "claude-mem remains a UX reference; current Docker evidence is not a real-world progressive-disclosure pass." + ] + } + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index f5a5fee6..9ce9b4e3 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -18,9 +18,13 @@ use elf_cli::VERSION; const JOB_SCHEMA: &str = "elf.real_world_job/v1"; const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; +const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_manifest/v1"; +const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1"; const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; +const DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH: &str = + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json"; const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; const DEFAULT_ADAPTER_ID: &str = "fixture_smoke"; const DEFAULT_ADAPTER_NAME: &str = "ELF fixture smoke"; @@ -85,6 +89,12 @@ struct RunArgs { /// Human-readable adapter name recorded in the generated report. #[arg(long, default_value = DEFAULT_ADAPTER_NAME)] adapter_name: String, + /// Real-world external adapter manifest to include in report coverage. + #[arg(long, value_name = "FILE", default_value = DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH)] + external_adapter_manifest: PathBuf, + /// Skip loading the real-world external adapter coverage manifest. + #[arg(long)] + skip_external_adapter_manifest: bool, } #[derive(Debug, Parser)] @@ -562,6 +572,8 @@ struct RealWorldReport { runner_version: String, corpus_profile: String, adapter: AdapterReport, + #[serde(default)] + external_adapters: ExternalAdapterSection, capture_integration: CaptureIntegrationReport, summary: ReportSummary, suites: Vec, @@ -585,6 +597,133 @@ struct AdapterReport { notes: String, } +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum AdapterCoverageStatus { + Real, + Mocked, + Unsupported, + Blocked, + Incomplete, + WrongResult, + LifecycleFail, + Pass, + NotEncoded, +} + +#[derive(Debug, Deserialize)] +struct ExternalAdapterManifest { + schema: String, + manifest_id: String, + docker_isolation: ExternalDockerIsolation, + #[serde(default)] + adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalAdapterSection { + schema: String, + manifest_id: String, + docker_isolation: ExternalDockerIsolation, + summary: ExternalAdapterSummary, + #[serde(default)] + adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalDockerIsolation { + default: bool, + compose_file: String, + runner: String, + artifact_dir: String, + host_global_installs_required: bool, + #[serde(default)] + notes: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ExternalAdapterReport { + adapter_id: String, + project: String, + adapter_kind: String, + evidence_class: String, + docker_default: bool, + host_global_installs_required: bool, + overall_status: AdapterCoverageStatus, + setup: AdapterExecutionEvidence, + run: AdapterExecutionEvidence, + result: AdapterExecutionEvidence, + #[serde(default)] + capabilities: Vec, + #[serde(default)] + suites: Vec, + #[serde(default)] + evidence: Vec, + #[serde(default)] + notes: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + follow_up: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterExecutionEvidence { + status: AdapterCoverageStatus, + evidence: String, + #[serde(skip_serializing_if = "Option::is_none")] + command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + artifact: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterCapabilityCoverage { + capability: String, + status: AdapterCoverageStatus, + evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterSuiteCoverage { + suite_id: String, + status: AdapterCoverageStatus, + evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterEvidencePointer { + kind: String, + #[serde(rename = "ref")] + reference: String, + status: AdapterCoverageStatus, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalAdapterSummary { + adapter_count: usize, + external_project_count: usize, + docker_default_count: usize, + host_global_install_required_count: usize, + fixture_backed_count: usize, + live_baseline_only_count: usize, + live_real_world_count: usize, + overall_status_counts: AdapterStatusCounts, + capability_status_counts: AdapterStatusCounts, + suite_status_counts: AdapterStatusCounts, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct AdapterStatusCounts { + real: usize, + mocked: usize, + unsupported: usize, + blocked: usize, + incomplete: usize, + wrong_result: usize, + lifecycle_fail: usize, + pass: usize, + not_encoded: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct CaptureIntegrationReport { #[serde(default)] @@ -1826,6 +1965,10 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result AdapterReport { } } +fn external_adapter_section( + manifest_path: &Path, + skip_manifest: bool, +) -> Result { + if skip_manifest { + return Ok(empty_external_adapter_section("skipped")); + } + + let manifest_path = resolve_external_adapter_manifest_path(manifest_path); + + if !manifest_path.exists() { + return Ok(empty_external_adapter_section("missing")); + } + + let raw = fs::read_to_string(&manifest_path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse external adapter manifest {}: {err}", manifest_path.display()) + })?; + + validate_external_adapter_manifest(&manifest, &manifest_path)?; + + let summary = external_adapter_summary(&manifest.adapters); + + Ok(ExternalAdapterSection { + schema: EXTERNAL_ADAPTER_REPORT_SCHEMA.to_string(), + manifest_id: manifest.manifest_id, + docker_isolation: manifest.docker_isolation, + summary, + adapters: manifest.adapters, + }) +} + +fn empty_external_adapter_section(reason: &str) -> ExternalAdapterSection { + ExternalAdapterSection { + schema: EXTERNAL_ADAPTER_REPORT_SCHEMA.to_string(), + manifest_id: reason.to_string(), + docker_isolation: ExternalDockerIsolation::default(), + summary: ExternalAdapterSummary::default(), + adapters: Vec::new(), + } +} + +fn resolve_external_adapter_manifest_path(path: &Path) -> PathBuf { + if path.exists() || path.is_absolute() { + return path.to_path_buf(); + } + + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let Some(workspace_root) = manifest_dir.parent().and_then(Path::parent) else { + return path.to_path_buf(); + }; + let workspace_candidate = workspace_root.join(path); + + if workspace_candidate.exists() { workspace_candidate } else { path.to_path_buf() } +} + +fn validate_external_adapter_manifest( + manifest: &ExternalAdapterManifest, + path: &Path, +) -> Result<()> { + if manifest.schema != EXTERNAL_ADAPTER_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {EXTERNAL_ADAPTER_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + + validate_external_docker_isolation(path, &manifest.docker_isolation)?; + + validate_external_adapters(path, &manifest.adapters) +} + +fn validate_external_docker_isolation(path: &Path, docker: &ExternalDockerIsolation) -> Result<()> { + if docker.compose_file.trim().is_empty() + || docker.runner.trim().is_empty() + || docker.artifact_dir.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete docker_isolation metadata.", path.display())); + } + if !docker.default { + return Err(eyre::eyre!( + "{} external adapter manifest must default to Docker isolation.", + path.display() + )); + } + if docker.host_global_installs_required { + return Err(eyre::eyre!( + "{} external adapter manifest must not require host-global installs by default.", + path.display() + )); + } + + Ok(()) +} + +fn validate_external_adapters(path: &Path, adapters: &[ExternalAdapterReport]) -> Result<()> { + if adapters.is_empty() { + return Err(eyre::eyre!("{} declares no external adapters.", path.display())); + } + + let mut seen = BTreeSet::new(); + + for adapter in adapters { + validate_external_adapter(path, adapter)?; + + if !seen.insert(adapter.adapter_id.as_str()) { + return Err(eyre::eyre!( + "{} declares duplicate adapter_id {}.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + if adapter.adapter_id.trim().is_empty() + || adapter.project.trim().is_empty() + || adapter.adapter_kind.trim().is_empty() + || adapter.evidence_class.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete external adapter.", path.display())); + } + if !matches!( + adapter.evidence_class.as_str(), + "fixture_backed" | "live_baseline_only" | "live_real_world" + ) { + return Err(eyre::eyre!( + "{} adapter {} has unsupported evidence_class {}.", + path.display(), + adapter.adapter_id, + adapter.evidence_class + )); + } + if adapter.docker_default && adapter.host_global_installs_required { + return Err(eyre::eyre!( + "{} adapter {} is Docker-default but requires host-global installs.", + path.display(), + adapter.adapter_id + )); + } + + validate_adapter_execution(path, adapter)?; + validate_adapter_capabilities(path, adapter)?; + validate_adapter_suites(path, adapter)?; + validate_adapter_evidence(path, adapter)?; + + if let Some(follow_up) = &adapter.follow_up + && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} adapter {} has an incomplete follow_up.", + path.display(), + adapter.adapter_id + )); + } + + Ok(()) +} + +fn validate_adapter_execution(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for evidence in [&adapter.setup, &adapter.run, &adapter.result] { + if evidence.evidence.trim().is_empty() + || evidence.command.as_deref().is_some_and(str::is_empty) + || evidence.artifact.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete setup/run/result evidence.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_capabilities(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for capability in &adapter.capabilities { + if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete capability coverage.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_suites(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for suite in &adapter.suites { + if !SUITES.contains(&suite.suite_id.as_str()) { + return Err(eyre::eyre!( + "{} adapter {} references unknown suite {}.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + if suite.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has suite {} without evidence.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for evidence in &adapter.evidence { + if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete evidence pointers.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn external_adapter_summary(adapters: &[ExternalAdapterReport]) -> ExternalAdapterSummary { + let mut summary = ExternalAdapterSummary { + adapter_count: adapters.len(), + external_project_count: adapters.iter().filter(|adapter| adapter.project != "ELF").count(), + ..ExternalAdapterSummary::default() + }; + + for adapter in adapters { + accumulate_adapter_summary(&mut summary, adapter); + } + + summary +} + +fn accumulate_adapter_summary( + summary: &mut ExternalAdapterSummary, + adapter: &ExternalAdapterReport, +) { + summary.docker_default_count += usize::from(adapter.docker_default); + summary.host_global_install_required_count += + usize::from(adapter.host_global_installs_required); + summary.fixture_backed_count += usize::from(adapter.evidence_class == "fixture_backed"); + summary.live_baseline_only_count += usize::from(adapter.evidence_class == "live_baseline_only"); + summary.live_real_world_count += usize::from(adapter.evidence_class == "live_real_world"); + + increment_adapter_status_count(&mut summary.overall_status_counts, adapter.overall_status); + + for capability in &adapter.capabilities { + increment_adapter_status_count(&mut summary.capability_status_counts, capability.status); + } + for suite in &adapter.suites { + increment_adapter_status_count(&mut summary.suite_status_counts, suite.status); + } +} + +fn increment_adapter_status_count(counts: &mut AdapterStatusCounts, status: AdapterCoverageStatus) { + match status { + AdapterCoverageStatus::Real => counts.real += 1, + AdapterCoverageStatus::Mocked => counts.mocked += 1, + AdapterCoverageStatus::Unsupported => counts.unsupported += 1, + AdapterCoverageStatus::Blocked => counts.blocked += 1, + AdapterCoverageStatus::Incomplete => counts.incomplete += 1, + AdapterCoverageStatus::WrongResult => counts.wrong_result += 1, + AdapterCoverageStatus::LifecycleFail => counts.lifecycle_fail += 1, + AdapterCoverageStatus::Pass => counts.pass += 1, + AdapterCoverageStatus::NotEncoded => counts.not_encoded += 1, + } +} + fn capture_integration_report(jobs: &[RealWorldJob]) -> CaptureIntegrationReport { let mut report = CaptureIntegrationReport::default(); @@ -3397,6 +3824,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { let mut out = String::new(); render_markdown_header(&mut out, report, report_path.as_str()); + render_markdown_external_adapters(&mut out, report); render_markdown_capture_integration(&mut out, report); render_markdown_suites(&mut out, report); render_markdown_jobs(&mut out, report); @@ -3446,6 +3874,91 @@ fn render_markdown_capture_integration(out: &mut String, report: &RealWorldRepor out.push('\n'); } +fn render_markdown_external_adapters(out: &mut String, report: &RealWorldReport) { + out.push_str("## External Adapter Coverage\n\n"); + + if report.external_adapters.adapters.is_empty() { + out.push_str("No external adapter coverage manifest was loaded for this report.\n\n"); + + return; + } + + let summary = &report.external_adapters.summary; + + out.push_str("This section is manifest-backed. It records external adapter coverage and blockers, but it does not convert live-baseline retrieval results into real-world suite wins.\n\n"); + out.push_str(&format!( + "- Manifest: `{}`\n", + md_inline(report.external_adapters.manifest_id.as_str()) + )); + out.push_str(&format!( + "- Docker default: `{}` via `{}`; artifact dir `{}`\n", + report.external_adapters.docker_isolation.default, + md_inline(report.external_adapters.docker_isolation.compose_file.as_str()), + md_inline(report.external_adapters.docker_isolation.artifact_dir.as_str()) + )); + out.push_str(&format!( + "- Adapter records: `{}` total, `{}` external project(s), `{}` Docker-default, `{}` requiring host-global installs\n", + summary.adapter_count, + summary.external_project_count, + summary.docker_default_count, + summary.host_global_install_required_count + )); + out.push_str(&format!( + "- Evidence classes: `{}` fixture-backed, `{}` live-baseline-only, `{}` live real-world\n", + summary.fixture_backed_count, + summary.live_baseline_only_count, + summary.live_real_world_count + )); + out.push_str(&format!( + "- Overall statuses: `{}`\n", + adapter_status_counts_display(&summary.overall_status_counts) + )); + out.push_str(&format!( + "- Capability coverage statuses: `{}`\n", + adapter_status_counts_display(&summary.capability_status_counts) + )); + out.push_str(&format!( + "- Real-world suite statuses: `{}`\n\n", + adapter_status_counts_display(&summary.suite_status_counts) + )); + out.push_str("| Project | Adapter | Evidence Class | Overall | Setup | Run | Result | Docker | Suites | Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"); + + for adapter in &report.external_adapters.adapters { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} |\n", + md_cell(adapter.project.as_str()), + md_inline(adapter.adapter_id.as_str()), + md_inline(adapter.evidence_class.as_str()), + adapter_status_str(adapter.overall_status), + adapter_status_str(adapter.setup.status), + adapter_status_str(adapter.run.status), + adapter_status_str(adapter.result.status), + adapter.docker_default, + adapter_suite_cell(adapter.suites.as_slice()), + adapter_evidence_cell(adapter) + )); + } + + out.push_str("\n### Adapter Capability Details\n\n"); + out.push_str("| Adapter | Capability | Status | Evidence |\n"); + out.push_str("| --- | --- | --- | --- |\n"); + + for adapter in &report.external_adapters.adapters { + for capability in &adapter.capabilities { + out.push_str(&format!( + "| `{}` | {} | `{}` | {} |\n", + md_inline(adapter.adapter_id.as_str()), + md_cell(capability.capability.as_str()), + adapter_status_str(capability.status), + md_cell(capability.evidence.as_str()) + )); + } + } + + out.push('\n'); +} + fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_path: &str) { out.push_str("# Real-World Job Benchmark Report\n\n"); out.push_str( @@ -4024,6 +4537,74 @@ fn status_str(status: TypedStatus) -> &'static str { } } +fn adapter_status_str(status: AdapterCoverageStatus) -> &'static str { + match status { + AdapterCoverageStatus::Real => "real", + AdapterCoverageStatus::Mocked => "mocked", + AdapterCoverageStatus::Unsupported => "unsupported", + AdapterCoverageStatus::Blocked => "blocked", + AdapterCoverageStatus::Incomplete => "incomplete", + AdapterCoverageStatus::WrongResult => "wrong_result", + AdapterCoverageStatus::LifecycleFail => "lifecycle_fail", + AdapterCoverageStatus::Pass => "pass", + AdapterCoverageStatus::NotEncoded => "not_encoded", + } +} + +fn adapter_status_counts_display(counts: &AdapterStatusCounts) -> String { + [ + ("real", counts.real), + ("mocked", counts.mocked), + ("unsupported", counts.unsupported), + ("blocked", counts.blocked), + ("incomplete", counts.incomplete), + ("wrong_result", counts.wrong_result), + ("lifecycle_fail", counts.lifecycle_fail), + ("pass", counts.pass), + ("not_encoded", counts.not_encoded), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(status, count)| format!("{status}={count}")) + .collect::>() + .join(", ") +} + +fn adapter_suite_cell(suites: &[AdapterSuiteCoverage]) -> String { + if suites.is_empty() { + return "`none`".to_string(); + } + + suites + .iter() + .map(|suite| { + format!( + "`{}`: `{}`", + md_inline(suite.suite_id.as_str()), + adapter_status_str(suite.status) + ) + }) + .collect::>() + .join("
") +} + +fn adapter_evidence_cell(adapter: &ExternalAdapterReport) -> String { + let setup = adapter + .setup + .command + .as_deref() + .or(adapter.setup.artifact.as_deref()) + .unwrap_or(adapter.setup.evidence.as_str()); + let result = adapter + .result + .artifact + .as_deref() + .or(adapter.result.command.as_deref()) + .unwrap_or(adapter.result.evidence.as_str()); + + format!("setup: `{}`
result: `{}`", md_inline(setup), md_inline(result)) +} + fn trace_failure_stage(trace: Option<&TraceExplainability>) -> Option<&str> { trace.and_then(|trace| trace.failure_stage.as_deref()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index cc665cb4..bb158eb5 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -108,6 +108,14 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), + Some(0) + ); let jobs = array_at(&report, "/jobs")?; let job = find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; @@ -150,6 +158,105 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { Ok(()) } +#[test] +fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> { + let report = run_json_report_from(real_world_memory_fixture_dir())?; + + assert_eq!( + report.pointer("/external_adapters/schema").and_then(Value::as_str), + Some("elf.real_world_external_adapter_report/v1") + ); + assert_eq!( + report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), + Some("real-world-memory-project-adapters-2026-06-10") + ); + assert_eq!( + report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report + .pointer("/external_adapters/docker_isolation/host_global_installs_required") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/live_baseline_only_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/pass") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/lifecycle_fail") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/incomplete") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/capability_status_counts/mocked") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/blocked") + .and_then(Value::as_u64), + Some(3) + ); + + let adapters = array_at(&report, "/external_adapters/adapters")?; + let elf = find_by_field(adapters, "/adapter_id", "elf_real_world_memory_fixture")?; + let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; + let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; + let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?; + + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!( + agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), + Some("mocked") + ); + assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("incomplete")); + + Ok(()) +} + #[test] fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; @@ -362,6 +469,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("# Real-World Job Benchmark Report")); assert!(markdown.contains("work_resume")); assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("External Adapter Coverage")); + assert!(markdown.contains("live-baseline-only")); + assert!(markdown.contains("does not convert live-baseline retrieval results")); assert!(markdown.contains("fixture-backed")); assert!(markdown.contains("agentmemory-style hook capture")); assert!(markdown.contains("xy844-current-worktree")); diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index 5d5f0387..d419af0c 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -290,6 +290,13 @@ the interpretation manually under `docs/guide/benchmarking/`. The live-baseline runner and real-world job runner publish separate report schemas. Live-baseline reports remain evidence for Docker retrieval and lifecycle checks only. They are not real-world suite wins. +The real-world runner loads +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` +by default and records live-baseline-only external adapter evidence under +`external_adapters`; those records preserve the typed setup/run evidence but still +leave real-world suites as `not_encoded`, `blocked`, `incomplete`, `wrong_result`, or +`lifecycle_fail` until an adapter actually executes `real_world_job` prompts and +scoring. To run the checked-in real-world job smoke fixture and render its Markdown report: diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 305ec553..ab8fa512 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -167,6 +167,51 @@ for stale blockers, unsupported prior claims, stale deleted facts, stale histori facts, cross-project preference leakage, private/redacted text leakage, obsolete retrieval context, and distractor context. +The report also loads the checked-in external adapter coverage manifest by default: + +```text +apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +``` + +That manifest records the first memory-project set: ELF, qmd, agentmemory, +mem0/OpenMemory, claude-mem, memsearch, and OpenViking. Its `external_adapters` +report section distinguishes: + +- `fixture_backed`: checked-in real-world fixture scoring, such as the ELF fixture + response path. +- `live_baseline_only`: Docker live-baseline retrieval/lifecycle evidence that is not + a real-world suite win. +- `live_real_world`: future external adapters that actually execute `real_world_job` + prompts and scoring. + +Current state: no external project has a `live_real_world` adapter in this runner yet. +qmd has Docker live-baseline pass evidence for the encoded same-corpus checks, but its +real-world suites remain `not_encoded`. agentmemory is blocked on durable upstream +storage for lifecycle proof. mem0/OpenMemory, memsearch, and claude-mem currently +retain wrong-result or incomplete live-baseline states for the checked-in adapter +evidence. OpenViking is incomplete until its local embedding setup is reliable inside +Docker. These typed states describe benchmark coverage; do not treat them as broad +project quality rankings. + +To run the fixture report without the manifest during local debugging: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures apps/elf-eval/fixtures/real_world_memory \ + --skip-external-adapter-manifest +``` + +To test an adapter-pack manifest before committing it: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures apps/elf-eval/fixtures/real_world_memory \ + --external-adapter-manifest path/to/manifest.json \ + --out tmp/real-world-memory/adapter-contract-report.json +``` + Narrow memory evolution increment: ```sh diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md index 54be2ba7..9d8ae4f1 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/guide/research/comparison_external_projects.md @@ -56,6 +56,14 @@ or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and claude-mem retained `incomplete`, wrong-result, or not-encoded states. All broader suite fit below is research guidance, not a benchmark result. +The real-world job runner now carries a separate external adapter coverage manifest: +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +That manifest is a contract and evidence ledger, not a leaderboard. It records which +projects only have `live_baseline_only` Docker retrieval/lifecycle evidence, which +capabilities are `mocked`, `blocked`, `unsupported`, `incomplete`, `wrong_result`, or +`lifecycle_fail`, and which real-world suites remain `not_encoded`. No external project +in the first manifest has `live_real_world` suite evidence yet. + Benchmark suite labels: | Suite | Real-world job shape | diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/guide/research/external_memory_improvement_plan.md index f288685e..bd37e8fc 100644 --- a/docs/guide/research/external_memory_improvement_plan.md +++ b/docs/guide/research/external_memory_improvement_plan.md @@ -231,12 +231,17 @@ Implementation shape: - For every external adapter, mark which behaviors are real, mocked, unsupported, or blocked. - Add lifecycle checks: update, delete/expire, cold-start reload, and same-corpus retrieval. - Keep failures typed with the terms in this document. +- Use `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` + as the real-world adapter coverage contract so fixture-only, live-baseline-only, and + future live-real-world evidence stay separate. Acceptance: - agentmemory adapter either passes durable lifecycle checks or is explicitly marked blocked with evidence. - OpenViking incomplete state records a pinned dependency failure and retry path. - qmd smoke pass remains covered and gains scale/stress profiles. +- Real-world reports include adapter coverage counters before any external adapter is + allowed to claim a real-world suite pass. Linear mapping: diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index d1aefae9..8591590c 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -125,6 +125,88 @@ Optional corpus fields: Private corpus fixtures MUST use sanitized inline text or local refs excluded from git. Reports MAY publish evidence ids and score summaries without publishing private text. +### External Adapter Manifest + +Real-world reports MAY include an external adapter manifest. When present, the manifest +MUST use this schema id: + +```text +elf.real_world_external_adapter_manifest/v1 +``` + +The manifest is the stable adapter-pack contract for comparing external memory projects +against `real_world_job` suites. It records what an adapter actually executed, which +coverage is only fixture-backed or live-baseline-only, and which suites remain blocked, +unsupported, incomplete, or not encoded. It MUST NOT be used to convert retrieval-only +live-baseline evidence into a real-world suite win. + +Required manifest fields: + +- `manifest_id`: stable ASCII id for the checked-in or generated manifest. +- `docker_isolation`: object describing the default execution boundary. +- `adapters`: array of adapter records. + +`docker_isolation` MUST include: + +- `default`: boolean; MUST be `true` for repository-supported external adapter runs + unless a separate issue records why Docker is impossible. +- `compose_file`: Docker Compose file used by the supported runner. +- `runner`: script or command entrypoint used inside the Compose boundary. +- `artifact_dir`: relative artifact directory for logs and reports. +- `host_global_installs_required`: boolean; MUST be `false` for default external + runs. +- `notes`: optional bounded explanatory strings. + +Each `adapters[]` record MUST include: + +- `adapter_id`: stable id unique within the manifest. +- `project`: display name such as `qmd`, `agentmemory`, or `mem0/OpenMemory`. +- `adapter_kind`: local execution shape, for example `docker_cli_same_corpus`, + `docker_sdk_same_corpus`, or `offline_fixture_response`. +- `evidence_class`: one of `fixture_backed`, `live_baseline_only`, or + `live_real_world`. +- `docker_default`: boolean. +- `host_global_installs_required`: boolean. +- `overall_status`: one adapter status from the table below. +- `setup`, `run`, and `result`: evidence objects with `status`, `evidence`, and + optional `command` and `artifact`. +- `capabilities`: array of capability coverage records with `capability`, `status`, + and `evidence`. +- `suites`: array of real-world suite coverage records with `suite_id`, `status`, and + `evidence`. +- `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. +- `notes`: optional bounded explanatory strings. +- `follow_up`: optional `title` and `reason`. + +Adapter coverage status terms: + +| Term | Meaning | +| --- | --- | +| `real` | The adapter capability is exercised through the project's real local API, CLI, storage, or service surface. | +| `mocked` | The adapter uses a mock, in-memory substitute, fixture replay, or other non-durable stand-in for the named capability. | +| `unsupported` | The project or safe Docker profile does not expose the capability. This is not a quality penalty. | +| `blocked` | The check cannot run safely without credentials, manual setup, durable runtime integration, private input, or host integration outside the run scope. | +| `incomplete` | Setup, build, dependency, adapter wiring, parse, or runtime execution did not reach the behavioral check. | +| `wrong_result` | The adapter reached execution but produced the wrong answer, memory, evidence, or action. | +| `lifecycle_fail` | Retrieval may work, but encoded update, delete, expiry, cold-start, persistence, history, or supersession behavior failed. | +| `pass` | The declared adapter check completed and met its encoded expectations. | +| `not_encoded` | The capability, suite, or adapter path is not implemented in the runner, so no pass/fail claim is allowed. | + +Reports that load a manifest MUST emit an `external_adapters` section with schema id +`elf.real_world_external_adapter_report/v1`, the manifest id, Docker isolation +metadata, per-adapter records, and summary counters for: + +- adapter count, external project count, Docker-default count, host-global-install + count; +- `fixture_backed`, `live_baseline_only`, and `live_real_world` evidence classes; +- overall adapter statuses; +- capability coverage statuses; +- real-world suite coverage statuses. + +Adapter-pack issues SHOULD add new projects by appending adapter records to this +manifest shape. They MUST NOT change these status meanings to make a project look +better or worse. + ### `timeline` `timeline` MUST model the user job as prior agent work, not just a bag of documents. @@ -454,6 +536,10 @@ Reports MUST include: - capture/integration coverage classes when any fixture declares `capture_behaviors`, preserving the `real`, `fixture_backed`, `mocked`, `blocked`, and `not_encoded` distinction. +- external adapter coverage when an external adapter manifest is loaded, preserving + `fixture_backed`, `live_baseline_only`, `live_real_world`, `real`, `mocked`, + `unsupported`, `blocked`, `incomplete`, `wrong_result`, `lifecycle_fail`, `pass`, + and `not_encoded` distinctions. Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity