From 29147148e14027b285ecf704db7894f97a785919 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Thu, 11 Jun 2026 20:49:41 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add live operator-debug benchmark scoring","authority":"XY-932"} --- Makefile.toml | 9 + README.md | 8 + .../memory_projects_manifest.json | 268 +++++++++++++++ .../selected_but_not_narrated.json | 160 +++++++++ .../src/bin/real_world_job_benchmark.rs | 26 +- .../src/bin/real_world_live_adapter.rs | 318 +++++++++++++++--- .../tests/real_world_job_benchmark.rs | 317 +++++++++++++++-- ...-11-competitor-strength-adoption-report.md | 12 +- ...-11-competitor-strength-evidence-matrix.md | 14 +- ...on-direction-from-competitor-benchmarks.md | 34 +- ...elf-qmd-trace-replay-diagnostics-report.md | 30 +- .../2026-06-11-measurement-coverage-audit.md | 17 +- ...1-competitor-strength-adoption-report.json | 193 ++++++++--- ...f-qmd-trace-replay-diagnostics-report.json | 86 ++++- ...2026-06-11-measurement-coverage-audit.json | 102 ++++-- ...-11-xy-897-competitor-strength-matrix.json | 26 +- ...real-world-operator-debug-live-adapters.sh | 129 +++++++ 17 files changed, 1552 insertions(+), 197 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json create mode 100755 scripts/real-world-operator-debug-live-adapters.sh diff --git a/Makefile.toml b/Makefile.toml index 86b24c7d..42b2033c 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -421,6 +421,7 @@ args = [ # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | # | real-world-job-operator-ux-report | command | | +# | real-world-job-operator-ux-live-adapters | command | | # | real-world-memory-retrieval | composite | | # | real-world-memory-retrieval-json | command | | # | real-world-memory-retrieval-report | command | | @@ -668,6 +669,14 @@ args = [ "tmp/real-world-job/real-world-job-operator-ux-report.md", ] 
+[tasks.real-world-job-operator-ux-live-adapters] +workspace = false +command = "bash" +args = [ + "-lc", + "docker compose -f docker-compose.baseline.yml run --build --rm -e ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR -e ELF_OPERATOR_DEBUG_LIVE_FIXTURES -e ELF_OPERATOR_DEBUG_LIVE_WORK_DIR -e ELF_OPERATOR_DEBUG_QMD_DIR baseline-runner bash scripts/real-world-operator-debug-live-adapters.sh", +] + [tasks.real-world-memory-retrieval] workspace = false dependencies = [ diff --git a/README.md b/README.md index f4e15199..8261bf13 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,14 @@ provider-backed ELF evidence was required. 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. The difference is the delete/TTL tombstone case; qmd remains the local retrieval-debug UX reference, and no broad ELF-over-qmd claim is allowed. +- Live operator-debugging slice after XY-932: `cargo make + real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated + `live_real_world` records for ELF and qmd over the operator-debugging fixtures. + ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated + evidence, replay-command availability, and repair-action clarity. qmd ties ELF on + replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, + candidate-drop stage visibility, and the selected-but-not-narrated diagnosis. OpenMemory UI/export and claude-mem viewer + flows remain blocked or not encoded, so this is not a broad viewer-product claim. 
- Expanded adapter-pack coverage after XY-834: the real-world external adapter manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and deeper diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index f5eabf62..2832b202 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -481,6 +481,274 @@ "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." ] }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + 
"command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." + }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." 
+ } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory and claude-mem UI repair paths remain blocked or not encoded.", + 
"command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." 
+ ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." 
+ }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": 
"operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the 
wrong_result status is limited to trace hydration and candidate-drop stage visibility." + ] + }, { "adapter_id": "agentmemory_live_baseline", "project": "agentmemory", diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json new file mode 100644 index 00000000..3f670ac7 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json @@ -0,0 +1,160 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-selected-not-narrated-001", + "suite": "operator_debugging_ux", + "title": "Debug evidence selected but not narrated", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-selected-not-narrated", + "kind": "trace", + "text": "Trace 66666666-6666-4666-8666-666666666666 shows final selection included supersession evidence for the release owner change, but the generated answer narrated only the current owner and omitted the selected historical handoff evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "operator_debugging_ux", + "evidence_id": "trace-selected-not-narrated" + } + }, + "created_at": "2026-06-11T02:30:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The trace selected the supersession evidence, but the answer did not narrate it.", + "claims": [ + { + "claim_id": "root_cause", + "text": "The trace selected the supersession evidence, but the answer did not narrate it.", + "evidence_ids": ["trace-selected-not-narrated"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-selected-not-narrated"], + "latency_ms": 2.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + 
"trace_id": "66666666-6666-4666-8666-666666666666", + "failure_stage": "selection.narration", + "failure_reason": "The selected evidence was present in the final set, but the answer omitted the historical handoff narration.", + "stages": [ + { + "stage_name": "selection.final", + "kept_evidence": ["trace-selected-not-narrated"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Final selection retained the trace that explains the supersession history." + }, + { + "stage_name": "selection.narration", + "kept_evidence": ["trace-selected-not-narrated"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The narration step did not surface the selected historical handoff evidence." + } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "selected-not-narrated-trace", + "ts": "2026-06-11T02:30:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-selected-not-narrated"], + "summary": "The trace captured selected evidence that the final answer failed to narrate." + } + ], + "prompt": { + "role": "user", + "content": "Why did the debug answer miss the release owner handoff even though the trace had the evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "state_repair_action"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "The trace selected the supersession evidence, but the answer did not narrate it." 
+ } + ], + "must_not_include": ["The supersession evidence was absent from final selection."], + "evidence_links": { + "root_cause": ["trace-selected-not-narrated"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-selected-not-narrated", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "final selection included supersession evidence for the release owner change" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Identifies that the evidence was selected but not narrated." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites selected trace evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Names a narration or answer-composition repair action." + }, + "answer_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not claim the evidence was absent." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "selected_but_not_narrated", + "trace_id": "66666666-6666-4666-8666-666666666666", + "viewer_url": "/viewer?trace_id=66666666-6666-4666-8666-666666666666", + "admin_trace_bundle_url": "/v2/admin/traces/66666666-6666-4666-8666-666666666666/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The evidence survived final selection, but answer composition failed to narrate the selected supersession context.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; selected evidence is visible in final results and narration stage details", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Stage Details", "Trace"], + "cli_steps": ["open trace bundle", "inspect final selected evidence", "inspect narration stage", "repair answer composition"], + "trace_evidence": ["trace-selected-not-narrated"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "qmd_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 7f0c74e8..a167d2bd 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -534,6 +534,14 @@ struct OperatorDebugEvidence { dropped_candidate_visibility: String, trace_completeness: String, repair_action_clarity: String, + #[serde(skip_serializing_if = "Option::is_none")] + trace_available: Option, + #[serde(skip_serializing_if = "Option::is_none")] + replay_command_available: Option, + #[serde(skip_serializing_if = 
"Option::is_none")] + replay_command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + replay_artifact: Option, #[serde(default)] viewer_panels: Vec, #[serde(default)] @@ -1787,6 +1795,8 @@ fn validate_operator_debug(job: &RealWorldJob, path: &Path) -> Result<()> { debug.admin_trace_bundle_url.as_deref(), "admin_trace_bundle_url", )?; + validate_optional_debug_field(path, debug.replay_command.as_deref(), "replay_command")?; + validate_optional_debug_field(path, debug.replay_artifact.as_deref(), "replay_artifact")?; validate_non_empty_debug_list(path, &debug.viewer_panels, "viewer_panels")?; validate_non_empty_debug_list(path, &debug.cli_steps, "cli_steps")?; validate_non_empty_debug_list(path, &debug.trace_evidence, "trace_evidence")?; @@ -4598,16 +4608,18 @@ fn render_markdown_operator_debugging(out: &mut String, report: &RealWorldReport return; } - out.push_str("| Job | Failure Mode | Trace Evidence | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps |\n"); - out.push_str("| --- | --- | --- | ---: | --- | --- | --- | --- | --- |\n"); + out.push_str("| Job | Failure Mode | Trace Evidence | Trace Available | Replay Command | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps |\n"); + out.push_str("| --- | --- | --- | --- | --- | ---: | --- | --- | --- | --- | --- |\n"); for job in jobs { if let Some(debug) = &job.operator_debug { out.push_str(&format!( - "| {} | {} | {} | {} | `{}` | {} | `{}` | `{}` | {} |\n", + "| {} | {} | {} | `{}` | `{}` | {} | `{}` | {} | `{}` | `{}` | {} |\n", md_cell(job.job_id.as_str()), md_cell(debug.failure_mode.as_str()), debug_trace_cell(debug), + debug.trace_available.unwrap_or(debug.trace_id.is_some()), + debug.replay_command_available.unwrap_or(debug.replay_command.is_some()), debug.steps_to_root_cause, debug.raw_sql_needed, md_cell(debug.dropped_candidate_visibility.as_str()), @@ -4632,6 +4644,14 @@ fn 
render_markdown_operator_debugging(out: &mut String, report: &RealWorldReport "- CLI steps: `{}`\n", md_inline(debug.cli_steps.join(" -> ").as_str()) )); + + if let Some(command) = &debug.replay_command { + out.push_str(&format!("- Replay command: `{}`\n", md_inline(command.as_str()))); + } + if let Some(artifact) = &debug.replay_artifact { + out.push_str(&format!("- Replay artifact: `{}`\n", md_inline(artifact.as_str()))); + } + out.push_str(&format!( "- Trace evidence: `{}`\n", md_inline(debug.trace_evidence.join(", ").as_str()) diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs index ac30d229..0e6a621f 100644 --- a/apps/elf-eval/src/bin/real_world_live_adapter.rs +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -234,6 +234,17 @@ struct MaterializedJobEvidence { failure: Option, #[serde(skip_serializing_if = "Vec::is_empty")] source_mappings: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + operator_debug: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct OperatorDebugMaterializationEvidence { + trace_available: bool, + replay_command_available: bool, + candidate_drop_visibility: String, + repair_action_clarity: String, + raw_sql_needed: bool, } #[derive(Debug, Serialize)] @@ -282,6 +293,7 @@ struct TraceStageOutput { struct MaterializedJob { response: AdapterResponseOutput, evidence: MaterializedJobEvidence, + operator_debug: Option, } #[derive(Debug)] @@ -294,6 +306,8 @@ struct MaterializedJobInput { trace_id: Option, failure: Option, source_mappings: Vec, + operator_debug: Option, + operator_debug_evidence: Option, } struct MaterializedOutput<'a> { @@ -642,6 +656,14 @@ fn materialize_qmd_job( } let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + let replay_command = qmd_replay_command(&loaded.job.prompt.content, collection.as_str()); + let (operator_debug, operator_debug_evidence) = operator_debug_output( + AdapterKind::QmdCliRuntime, + loaded, 
+ None, + replay_command, + log_path.display().to_string(), + ); Ok(materialized_job( loaded, @@ -655,6 +677,8 @@ fn materialize_qmd_job( trace_id: None, failure: None, source_mappings: Vec::new(), + operator_debug, + operator_debug_evidence, }, )) } @@ -698,6 +722,8 @@ fn lightrag_failure_jobs( trace_id: None, failure: Some(format!("{stage}: {reason}")), source_mappings: Vec::new(), + operator_debug: None, + operator_debug_evidence: None, }, ) }) @@ -978,6 +1004,7 @@ fn materialized_job( }, }, }, + operator_debug: input.operator_debug, evidence: MaterializedJobEvidence { job_id: loaded.job.job_id.clone(), suite: loaded.job.suite.clone(), @@ -991,11 +1018,16 @@ fn materialized_job( trace_id: input.trace_id, failure: input.failure, source_mappings: input.source_mappings, + operator_debug: input.operator_debug_evidence, }, } } fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + if is_operator_debug_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + let status = loaded.job.encoding.status?; let reason = loaded.job.encoding.reason.clone().unwrap_or_else(|| { format!("Fixture declares {} for this live adapter job.", status.as_str()) @@ -1010,6 +1042,10 @@ fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option Option { + if is_operator_debug_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + not_encoded_reason(loaded.job.suite.as_str()).map(|reason| { materialized_declared_status_job( adapter_id, @@ -1020,6 +1056,11 @@ fn not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option bool { + suite == "operator_debugging_ux" + && matches!(adapter_id, "elf_operator_debug_live" | "qmd_operator_debug_live") +} + fn not_encoded_reason(suite: &str) -> Option<&'static str> { match suite { "trust_source_of_truth" @@ -1035,7 +1076,7 @@ fn not_encoded_reason(suite: &str) -> Option<&'static str> { "The live adapter sweep retrieves evidence-linked answers but does not generate derived 
knowledge pages.", ), "operator_debugging_ux" => Some( - "The live adapter sweep does not yet hydrate full operator trace/viewer diagnostics for this suite.", + "The full live adapter sweep keeps operator trace/viewer diagnostics in a focused operator-debug slice.", ), "capture_integration" => Some( "The live adapter sweep does not exercise capture integrations or write-policy redaction boundaries.", @@ -1102,8 +1143,156 @@ fn materialized_declared_status_job( trace_id: None, failure, source_mappings: Vec::new(), + operator_debug: None, + }, + operator_debug: None, + } +} + +fn operator_debug_output( + adapter_kind: AdapterKind, + loaded: &LoadedJob, + trace_id: Option, + replay_command: String, + replay_artifact: String, +) -> (Option, Option) { + if loaded.job.suite != "operator_debugging_ux" { + return (None, None); + } + + let Some(source) = loaded.value.get("operator_debug") else { + return (None, None); + }; + let mut debug = source.clone(); + let Some(object) = debug.as_object_mut() else { + return (None, None); + }; + let trace_available = trace_id.is_some(); + let replay_command_available = !replay_command.trim().is_empty(); + let raw_sql_needed = false; + let repair_action_clarity = if replay_command_available { "clear" } else { "unclear" }; + let candidate_drop_visibility = + operator_debug_candidate_visibility(adapter_kind, object).to_string(); + + object.insert("trace_available".to_string(), Value::Bool(trace_available)); + object.insert("replay_command_available".to_string(), Value::Bool(replay_command_available)); + object.insert("raw_sql_needed".to_string(), Value::Bool(raw_sql_needed)); + object.insert( + "dropped_candidate_visibility".to_string(), + Value::String(candidate_drop_visibility.clone()), + ); + object.insert( + "trace_completeness".to_string(), + Value::String(operator_debug_trace_completeness(adapter_kind, trace_available).to_string()), + ); + object.insert( + "repair_action_clarity".to_string(), + 
Value::String(repair_action_clarity.to_string()), + ); + object.insert("replay_command".to_string(), Value::String(replay_command.clone())); + object.insert("replay_artifact".to_string(), Value::String(replay_artifact)); + + match adapter_kind { + AdapterKind::ElfServiceRuntime => + if let Some(trace_id) = trace_id { + let trace_id = trace_id.to_string(); + + object.insert("trace_id".to_string(), Value::String(trace_id.clone())); + object.insert( + "viewer_url".to_string(), + Value::String(format!("/viewer?trace_id={trace_id}")), + ); + object.insert( + "admin_trace_bundle_url".to_string(), + Value::String(format!( + "/v2/admin/traces/{trace_id}/bundle?mode=full&stage_items_limit=128&candidates_limit=200" + )), + ); + }, + AdapterKind::QmdCliRuntime => { + object.remove("trace_id"); + object.remove("viewer_url"); + object.remove("admin_trace_bundle_url"); + object.insert("viewer_panels".to_string(), serde_json::json!(["qmd JSON Replay Rows"])); }, + AdapterKind::LightragApiContextExport => {}, } + + let mut cli_steps = string_array_from_object(object, "cli_steps"); + + push_unique(&mut cli_steps, replay_command); + + object.insert("cli_steps".to_string(), serde_json::json!(cli_steps)); + + ( + Some(debug), + Some(OperatorDebugMaterializationEvidence { + trace_available, + replay_command_available, + candidate_drop_visibility, + repair_action_clarity: repair_action_clarity.to_string(), + raw_sql_needed, + }), + ) +} + +fn operator_debug_trace_completeness( + adapter_kind: AdapterKind, + trace_available: bool, +) -> &'static str { + match adapter_kind { + AdapterKind::ElfServiceRuntime if trace_available => "complete", + AdapterKind::ElfServiceRuntime => "missing", + AdapterKind::QmdCliRuntime | AdapterKind::LightragApiContextExport => "not_available", + } +} + +fn operator_debug_candidate_visibility( + adapter_kind: AdapterKind, + object: &Map, +) -> &str { + match adapter_kind { + AdapterKind::ElfServiceRuntime => object + .get("dropped_candidate_visibility") + 
.and_then(Value::as_str) + .unwrap_or("visible through trace bundle replay candidates"), + AdapterKind::QmdCliRuntime => + "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed", + AdapterKind::LightragApiContextExport => "not encoded for this adapter", + } +} + +fn string_array_from_object(object: &Map, key: &str) -> Vec { + object + .get(key) + .and_then(Value::as_array) + .map(|items| items.iter().filter_map(Value::as_str).map(ToString::to_string).collect()) + .unwrap_or_default() +} + +fn elf_replay_command(trace_id: Uuid, project_id: &str) -> String { + format!( + "curl -fsS {} -H {} -H {} -H {}", + shell_quote(format!( + "http://127.0.0.1:51891/v2/admin/traces/{trace_id}/bundle?mode=full&stage_items_limit=128&candidates_limit=200" + ) + .as_str()), + shell_quote("X-ELF-Tenant-Id: elf-live-real-world"), + shell_quote(format!("X-ELF-Project-Id: {project_id}").as_str()), + shell_quote("X-ELF-Agent-Id: elf-live-real-world-agent") + ) +} + +fn qmd_replay_command(query: &str, collection: &str) -> String { + format!( + "npx tsx src/cli/qmd.ts query {} -c {} --json --no-rerank --min-score 0 -n 5", + shell_quote(format!("lex: {query}\nvec: {query}").as_str()), + shell_quote(collection) + ) +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) } fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { @@ -1220,6 +1409,8 @@ fn failure_jobs( trace_id: None, failure: Some(format!("{stage}: {reason}")), source_mappings: Vec::new(), + operator_debug: None, + operator_debug_evidence: None, }, ) }) @@ -1247,6 +1438,10 @@ fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Resu value["corpus"]["adapter_response"] = Value::Object(adapter_response); + if let Some(operator_debug) = &materialized.operator_debug { + value["operator_debug"] = operator_debug.clone(); + } + if matches!( materialized.evidence.status, MaterializationStatus::Blocked @@ -1305,6 
+1500,7 @@ fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvid trace_id: evidence.trace_id, failure: evidence.failure.clone(), source_mappings: evidence.source_mappings.clone(), + operator_debug: evidence.operator_debug.clone(), } } @@ -1827,6 +2023,8 @@ async fn materialize_lightrag_job( trace_id: None, failure: None, source_mappings, + operator_debug: None, + operator_debug_evidence: None, }, )) } @@ -2045,7 +2243,75 @@ async fn materialize_elf_job( let corpus = corpus_texts(loaded)?; let project_id = project_id_for_job(&loaded.job.job_id); - for item in &corpus { + ingest_elf_corpus(service, loaded, adapter_id, project_id.as_str(), &corpus).await?; + run_worker(runtime).await?; + + let started_at = Instant::now(); + let response = service + .search_raw(SearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::L2, + read_profile: "private_only".to_string(), + query: loaded.job.prompt.content.clone(), + top_k: Some(5), + candidate_k: Some(20), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .map_err(|err| eyre::eyre!("ELF search_raw failed for {}: {err}", loaded.job.job_id))?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let mut evidence_ids = Vec::new(); + + for item in &response.items { + if let Some(evidence_id) = item.source_ref.get("evidence_id").and_then(Value::as_str) { + push_unique(&mut evidence_ids, evidence_id.to_string()); + } + } + + let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + let replay_command = elf_replay_command(response.trace_id, project_id.as_str()); + let (operator_debug, operator_debug_evidence) = operator_debug_output( + AdapterKind::ElfServiceRuntime, + loaded, + Some(response.trace_id), + replay_command, + format!( + "/v2/admin/traces/{}/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + 
response.trace_id + ), + ); + + Ok(materialized_job( + loaded, + adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + latency_ms, + indexing_latency_ms: None, + returned_count: response.items.len(), + trace_id: Some(response.trace_id), + failure: None, + source_mappings: Vec::new(), + operator_debug, + operator_debug_evidence, + }, + )) +} + +async fn ingest_elf_corpus( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, + corpus: &[CorpusText], +) -> color_eyre::Result<()> { + for item in corpus { let chunks = note_text_chunks(item.text.as_str()); let chunk_count = chunks.len(); @@ -2058,7 +2324,7 @@ async fn materialize_elf_job( let response = service .add_note(AddNoteRequest { tenant_id: TENANT_ID.to_string(), - project_id: project_id.clone(), + project_id: project_id.to_string(), agent_id: AGENT_ID.to_string(), scope: SCOPE.to_string(), notes: vec![AddNoteInput { @@ -2096,51 +2362,7 @@ async fn materialize_elf_job( } } - run_worker(runtime).await?; - - let started_at = Instant::now(); - let response = service - .search_raw(SearchRequest { - tenant_id: TENANT_ID.to_string(), - project_id, - agent_id: AGENT_ID.to_string(), - token_id: None, - payload_level: PayloadLevel::L2, - read_profile: "private_only".to_string(), - query: loaded.job.prompt.content.clone(), - top_k: Some(5), - candidate_k: Some(20), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .map_err(|err| eyre::eyre!("ELF search_raw failed for {}: {err}", loaded.job.job_id))?; - let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; - let mut evidence_ids = Vec::new(); - - for item in &response.items { - if let Some(evidence_id) = item.source_ref.get("evidence_id").and_then(Value::as_str) { - push_unique(&mut evidence_ids, evidence_id.to_string()); - } - } - - let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); - - Ok(materialized_job( - loaded, - 
adapter_id, - MaterializedJobInput { - content: selected.content, - evidence_ids: selected.evidence_ids, - latency_ms, - indexing_latency_ms: None, - returned_count: response.items.len(), - trace_id: Some(response.trace_id), - failure: None, - source_mappings: Vec::new(), - }, - )) + Ok(()) } async fn build_service(runtime: &BaselineRuntime) -> color_eyre::Result { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index fe6da046..a8c7e927 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -255,11 +255,11 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(21) + Some(23) ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(3) + Some(5) ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), @@ -420,7 +420,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(21) + Some(23) ); assert_eq!( report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), @@ -438,7 +438,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(3) + Some(5) ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), @@ -448,13 +448,13 @@ fn assert_external_adapter_manifest_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/pass") .and_then(Value::as_u64), - Some(3) + Some(4) ); assert_eq!( report 
.pointer("/external_adapters/summary/overall_status_counts/wrong_result") .and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!( report @@ -543,7 +543,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") .and_then(Value::as_u64), - Some(1) + Some(4) ); assert_eq!( report @@ -555,7 +555,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/pass") .and_then(Value::as_u64), - Some(9) + Some(16) ); assert_eq!( report @@ -567,13 +567,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/wins") .and_then(Value::as_u64), - Some(2) + Some(8) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_position_counts/ties") .and_then(Value::as_u64), - Some(4) + Some(8) ); assert_eq!( report @@ -591,13 +591,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/win") .and_then(Value::as_u64), - Some(2) + Some(8) ); assert_eq!( report .pointer("/external_adapters/summary/scenario_outcome_counts/tie") .and_then(Value::as_u64), - Some(4) + Some(8) ); assert_eq!( report @@ -629,8 +629,10 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { let adapters = array_at(report, "/external_adapters/adapters")?; let elf = find_by_field(adapters, "/adapter_id", "elf_real_world_memory_fixture")?; let elf_live = find_by_field(adapters, "/adapter_id", "elf_live_real_world")?; + let elf_operator_debug = find_by_field(adapters, "/adapter_id", "elf_operator_debug_live")?; let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?; + let qmd_operator_debug = find_by_field(adapters, "/adapter_id", 
"qmd_operator_debug_live")?; let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; let mem0 = find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?; let memsearch = find_by_field(adapters, "/adapter_id", "memsearch_live_baseline")?; @@ -653,6 +655,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); assert_live_sweep_record(elf_live, "blocked")?; + assert_operator_debug_live_adapter_records(elf_operator_debug, qmd_operator_debug)?; assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); @@ -758,6 +761,111 @@ fn assert_qmd_live_baseline_record(adapter: &Value) { })); } +fn assert_operator_debug_live_adapter_records(elf: &Value, qmd: &Value) -> Result<()> { + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make real-world-job-operator-ux-live-adapters") + ); + assert_eq!( + elf.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(elf.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(elf.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("replay_command_metadata") + ); + assert_eq!(elf.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/3/capability").and_then(Value::as_str), + 
Some("candidate_drop_visibility") + ); + assert_eq!(elf.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("openmemory_or_claude_mem_ui_runner") + ); + assert_eq!(elf.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let elf_scenarios = array_at(elf, "/scenarios")?; + let elf_trace = find_by_field(elf_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let elf_replay = find_by_field(elf_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let elf_candidate = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let elf_repair = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let elf_selected = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; + + assert_eq!(elf_scenarios.len(), 5); + assert_eq!(elf_trace.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(elf_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + 
qmd.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("local_replay_command_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + qmd.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/2/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("candidate_drop_visibility") + ); + assert_eq!(qmd.pointer("/capabilities/3/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let qmd_scenarios = array_at(qmd, "/scenarios")?; + let qmd_trace = find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let qmd_replay = find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let qmd_candidate = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let qmd_repair = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let qmd_selected = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; + + assert_eq!(qmd_scenarios.len(), 5); + assert_eq!(qmd_trace.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_replay.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_candidate.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_repair.pointer("/status").and_then(Value::as_str), 
Some("pass")); + assert_eq!(qmd_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_selected.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert!(array_at(elf, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + assert!(array_at(qmd, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + + Ok(()) +} + fn assert_openviking_deep_profile_gate(adapter: &Value) { let trajectory_evidence = adapter.pointer("/capabilities/1/evidence").and_then(Value::as_str); @@ -1130,6 +1238,40 @@ fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { Ok(()) } +#[test] +fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { + let workspace = workspace_root()?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let script = fs::read_to_string( + workspace.join("scripts").join("real-world-operator-debug-live-adapters.sh"), + )?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let benchmark = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark.rs"))?; + + assert!(makefile.contains("[tasks.real-world-job-operator-ux-live-adapters]")); + assert!(makefile.contains("docker compose -f docker-compose.baseline.yml run --build --rm")); + assert!(makefile.contains("scripts/real-world-operator-debug-live-adapters.sh")); + assert!(script.contains("apps/elf-eval/fixtures/real_world_job/operator_debugging_ux")); + assert!(script.contains("elf_operator_debug_live")); + assert!(script.contains("qmd_operator_debug_live")); + assert!(script.contains("elf.real_world_operator_debug_live_adapter_sweep/v1")); + assert!(script.contains("trace_available")); + 
assert!(script.contains("replay_command_available")); + assert!(live_adapter.contains("fn operator_debug_output(")); + assert!(live_adapter.contains("fn qmd_replay_command(")); + assert!(live_adapter.contains("fn elf_replay_command(")); + assert!( + !live_adapter + .contains("does not yet hydrate full operator trace/viewer diagnostics for this suite") + ); + assert!(benchmark.contains("Replay command:")); + assert!(benchmark.contains("replay_command_available")); + + Ok(()) +} + fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { let suites = array_at(adapter, "/suites")?; let capabilities = array_at(adapter, "/capabilities")?; @@ -1187,24 +1329,25 @@ fn runner_discovers_nested_fixture_layout() -> Result<()> { fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<()> { let report = run_json_report_from(operator_debug_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); assert_eq!( report.pointer("/summary/operator_debug_job_count").and_then(Value::as_u64), - Some(5) + Some(6) ); assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(1) + Some(2) ); let jobs = array_at(&report, "/jobs")?; let dropped = 
find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; + let selected = find_by_field(jobs, "/job_id", "operator-debug-selected-not-narrated-001")?; assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( @@ -1234,6 +1377,15 @@ fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<() "trace-dropped-decoy" )?); assert!(array_contains_str(dropped, "/produced_evidence", "trace-dropped-expected")?); + assert_eq!(selected.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + selected.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("selection.narration") + ); + assert_eq!( + selected.pointer("/operator_debug/failure_mode").and_then(Value::as_str), + Some("selected_but_not_narrated") + ); Ok(()) } @@ -1639,6 +1791,8 @@ fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64), Some(4) ); + assert_eq!(report.pointer("/summary/outcome_counts/win").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/outcome_counts/tie").and_then(Value::as_u64), Some(5)); assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); let scenarios = array_at(report, "/scenario_outcomes")?; @@ -1647,6 +1801,16 @@ fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?; let trace_surface = find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?; + let operator_trace = + find_by_field(scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let operator_replay = + find_by_field(scenarios, "/scenario_id", "operator_debug_replay_command_availability")?; + let operator_candidate = + find_by_field(scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let operator_repair = + 
find_by_field(scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let operator_selected = + find_by_field(scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?; let dense_sparse = find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?; @@ -1658,11 +1822,31 @@ fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { let tombstone = find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?; - assert_eq!(scenarios.len(), 11); + assert_eq!(scenarios.len(), 16); assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie")); assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss")); assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss")); assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!( + operator_trace.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(operator_trace.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_trace.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(operator_replay.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_candidate.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(array_contains_str( + operator_candidate, + "/typed_non_pass_states", + "retrieved_but_dropped" + )?); + assert_eq!(operator_repair.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_selected.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(array_contains_str( + operator_selected, + "/typed_non_pass_states", + "selected_but_not_narrated" + )?); assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); 
assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); @@ -1684,6 +1868,11 @@ fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { "/claim_boundaries", "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay." )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity." + )?); assert!(array_contains_str( report, "/claim_boundaries", @@ -1697,11 +1886,22 @@ fn assert_trace_replay_diagnostics_markdown(markdown: &str) { assert!(markdown.contains("Retrieval correctness is still tied")); assert!(markdown.contains("| Default top-10 candidate artifact |")); assert!(markdown.contains("| Replay command locality |")); + assert!( + markdown + .contains("| Operator-debug trace hydration | `live_real_world` | `pass` | `win` |") + ); + assert!(markdown.contains( + "| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` |" + )); + assert!(markdown.contains( + "| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` |" + )); assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |")); - assert!(markdown.contains("`retrieved_but_dropped` | Defined but `not_tested`")); + assert!(markdown.contains("`retrieved_but_dropped` | Defined globally as `not_tested`")); assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); + assert!(markdown.contains("cargo make real-world-job-operator-ux-live-adapters")); assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); 
assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); } @@ -1712,6 +1912,11 @@ fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { "/scenario_id", "local_debug_replay_ux", )?; + let operator_debug = find_by_field( + array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "operator_debugging_viewer_ux", + )?; assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss")); assert!( @@ -1730,6 +1935,23 @@ fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { "/claim_boundaries/not_allowed", "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win." )?); + assert_eq!(operator_debug.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!( + operator_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("narrow live operator-debug win over qmd")) + ); + assert!(array_contains_str( + operator_debug, + "/command_artifacts", + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + )?); + assert!(array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." 
+ )?); Ok(()) } @@ -1739,6 +1961,12 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { let qmd = find_by_field(projects, "/project", "qmd")?; let mem0 = find_by_field(projects, "/project", "mem0/OpenMemory")?; let openviking = find_by_field(projects, "/project", "OpenViking")?; + let scenarios = array_at(matrix, "/scenario_matrix")?; + let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; + let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; + let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; + + assert_competitor_strength_matrix_manifest_counts(matrix); assert_eq!( qmd.pointer("/current_evidence_class").and_then(Value::as_str), @@ -1750,7 +1978,8 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { Some("not_encoded") ); assert!(qmd.pointer("/benchmark_before_claim").and_then(Value::as_str).is_some_and(|claim| { - claim.contains("before claiming ELF wins, ties, or loses on retrieval debugging") + claim.contains("Keep qmd deep retrieval/debug profiling separate") + && claim.contains("narrow operator-debug live slice") })); assert!( qmd.pointer("/borrow_if_stronger") @@ -1795,11 +2024,6 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { .and_then(Value::as_str) .is_some_and(|claim| claim.contains("evidence-bearing same-corpus output pass")) ); - - let scenarios = array_at(matrix, "/scenario_matrix")?; - let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; - let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; - assert!( retrieval_debug .pointer("/current_state") @@ -1809,6 +2033,24 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { assert!(retrieval_debug.pointer("/current_state").and_then(Value::as_str).is_some_and( |state| state.contains("qmd remains stronger on local debug ergonomics not fully 
scored") )); + assert!( + operator_debug + .pointer("/current_elf_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("narrow live_real_world operator-debug slice")) + ); + assert!( + operator_debug + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd now has a narrow live_real_world")) + ); + assert!( + operator_debug + .pointer("/next_measurement") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export")) + ); assert!( context_trajectory .pointer("/current_state") @@ -1825,6 +2067,29 @@ fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { Ok(()) } +fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { + assert_eq!( + matrix.pointer("/manifest_summary/adapter_records").and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/evidence_class_counts/live_real_world") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + matrix.pointer("/manifest_summary/overall_status_counts/pass").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/overall_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); +} + fn assert_strength_profile_summary(report: &Value) { assert_eq!( report.pointer("/schema").and_then(Value::as_str), @@ -2232,9 +2497,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=1, untested=11`")); + assert!(markdown.contains("ELF scenario positions: `wins=8, ties=8, loses=1, untested=11`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=2, tie=4, loss=1, not_tested=8, blocked=1, non_goal=2`" + "Scenario 
comparison outcomes: `win=8, tie=8, loss=1, not_tested=8, blocked=1, non_goal=2`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index ec2ea8f2..120c6b3d 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -43,7 +43,9 @@ The remaining caveats are material: is measured separately and is an ELF loss on the current correction history scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, - rerank, and candidate-drop diagnosis remain untested. + and rerank remain untested. XY-932 adds a narrow live operator-debug slice where + ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory + UI/export and claude-mem viewer workflows remain blocked or not encoded. ## Evidence Classes @@ -70,6 +72,7 @@ results, or lifecycle failures into one aggregate leaderboard. | --- | --- | --- | | `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` | ELF fixture aggregate covers 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. | | `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs. 
| +| `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | | `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | | `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | @@ -89,7 +92,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. 
| XY-905 | | Consolidation/proposal review | `not_tested` | `fixture_backed`, `not_encoded` | ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded. | XY-926 | | Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded; graphify reaches a tiny scored smoke and remains wrong_result. | XY-926, XY-929 | -| Operator debugging/viewer UX | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded`, `research_gate` | ELF fixture operator-debugging UX passes. mem0 local SDK `get_all` readback is measured, but the XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored. | XY-923, XY-926 | +| Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. OpenMemory UI/export remains blocked and claude-mem UI remains not encoded, so this is not a broad viewer-product superiority claim. | XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_baseline_only`, `blocked`, `not_encoded` | ELF fixture capture/write-policy jobs pass, but live capture integration and agentmemory/claude-mem capture hooks are not comparable yet. 
| XY-925, XY-926 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | | Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | @@ -120,6 +123,9 @@ results, or lifecycle failures into one aggregate leaderboard. evidence among the tracked systems. - ELF ties qmd on encoded live retrieval, work-resume, project-decisions, and personalization slices. +- ELF has a narrow live operator-debug win over qmd for trace hydration, + candidate-drop visibility, and selected-but-not-narrated evidence, with + replay-command availability and repair-action clarity tied. - ELF has a live temporal reconciliation loss against the benchmark expectation: five memory-evolution jobs remain `wrong_result`. - Most competitor strengths outside qmd retrieval are `not_tested`, `blocked`, @@ -134,6 +140,8 @@ results, or lifecycle failures into one aggregate leaderboard. behavior, or graph memory. The local OSS correction-history scenario is currently an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted behavior plus graph memory remain outside measured local OSS evidence. +- Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow + ELF/qmd operator-debug slice. - Do not claim ELF beats OpenViking on staged context trajectory. - Do not claim ELF beats Letta on core-vs-archival memory. - Do not claim graph/RAG parity from smoke-only evidence. 
diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 2043ed37..1f770b67 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -42,10 +42,10 @@ Current boundary: ## Current Ledger Summary -The current manifest has 21 adapter records across 16 external projects plus ELF. -Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 3 -`live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 3 `pass`, -5 `wrong_result`, 1 `lifecycle_fail`, 5 `blocked`, and 7 `not_encoded`. +The current manifest has 23 adapter records across 16 external projects plus ELF. +Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 +`live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, +6 `wrong_result`, 1 `lifecycle_fail`, 5 `blocked`, and 7 `not_encoded`. ## State Taxonomy @@ -72,8 +72,8 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Project | Strongest user-facing scenario | Current evidence | Measured status and proof | Unsupported or blocked status | Required benchmark before ELF claim | Borrow if stronger | | --- | --- | --- | --- | --- | --- | --- | -| ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. | `live_real_world`; supporting `fixture_backed`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`. | Full-suite live pass plus separate private-corpus and credentialed production-ops proof. 
| Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation. | -| qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | qmd deep retrieval/debug profile plus full-suite live replay with trace-level diagnostics. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | +| ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. | `live_real_world`; supporting `fixture_backed`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`. Narrow operator-debug pass: `cargo make real-world-job-operator-ux-live-adapters`, `tmp/real-world-job/operator-ux-live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`; the narrow operator-debug slice now passes. | Full-suite live pass plus separate private-corpus and credentialed production-ops proof. | Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation. | +| qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. 
| `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass; the narrow operator-debug slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | | agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start and real-world adapter coverage are missing. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | | mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. 
| Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | | memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. | `not_encoded`: real-world source-of-truth, retrieval, and memory-evolution prompt adapters are not encoded; TTL/expiry is unsupported by the current CLI path. | Score source-of-truth and retrieval-debug real-world jobs over the canonical Markdown store; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | @@ -101,7 +101,7 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | | Consolidation | Fixture consolidation passes; live consolidation is `not_encoded`. | agentmemory, managed-memory references, llm-wiki. | No manifest project has live consolidation scoring. | Run reviewable consolidation proposal generation with source refs, unsupported-claim flags, and audit transitions. 
| | Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. | llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | -| Operator debugging | Fixture operator_debugging_ux passes; live operator_debugging_ux is `not_encoded`. | qmd, claude-mem, OpenMemory. | qmd has debug strengths but operator_debugging_ux is `not_encoded`; claude-mem and OpenMemory UX are `not_encoded`. | Score trace hydration, stage attribution, raw-SQL avoidance, and repair-action clarity through live artifacts. | +| Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence; claude-mem and OpenMemory UX remain `not_encoded` or `blocked`. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture_integration passes; live capture_integration is `not_encoded`. | agentmemory, claude-mem. | agentmemory capture is `blocked`; claude-mem capture is `not_encoded`. | Run live capture/write-policy jobs proving redaction, exclusion, evidence binding, and no secret leakage. | | Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. 
| qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | | Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory and Letta personalization are `not_encoded`. | Encode scoped preference readback for mem0/OpenMemory and Letta before personalization superiority claims. | diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 5a20aacf..78a00da3 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -26,7 +26,8 @@ The strongest current statement is: evidence. - ELF and qmd are tied on the encoded live retrieval, work-resume, and project-decision slices. ELF does not yet beat qmd's local retrieval-debug - ergonomics. + ergonomics, but ELF now has a narrow live operator-debug win over qmd on trace + hydration and candidate-drop visibility. - Many competitor strengths are still undermeasured: OpenViking context trajectory, mem0/OpenMemory entity history and UI, agentmemory and claude-mem continuity capture, Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, and @@ -76,8 +77,10 @@ Interpretation: - Both pass `trust_source_of_truth`, `work_resume`, `project_decisions`, `retrieval`, and `personalization`. - Both fail most `memory_evolution` live conflict evidence with `wrong_result`. -- Both leave consolidation, knowledge compilation, operator debugging, capture - integration, and production-ops operator boundaries as `not_encoded` or `blocked`. +- Both leave consolidation, knowledge compilation, capture integration, and + production-ops operator boundaries as `not_encoded` or `blocked`. 
Operator + debugging has a separate narrow live slice: ELF passes it, while qmd remains + `wrong_result` for trace hydration and candidate-drop stage visibility. ### Production Evidence @@ -96,21 +99,21 @@ private-corpus quality proof. ### External Adapter Ledger -The current adapter manifest records 21 adapter records across 17 projects: +The current adapter manifest records 23 adapter records across 17 projects: | Evidence class | Count | Meaning | | --- | ---: | --- | | `fixture_backed` | `1` | ELF real-world fixture scoring. | | `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | -| `live_real_world` | `3` | ELF and qmd full-suite live sweeps plus graphify's tiny scored Docker smoke. | +| `live_real_world` | `5` | ELF and qmd full-suite live sweeps, graphify's tiny scored Docker smoke, and the narrow ELF/qmd operator-debug live slice. | | `research_gate` | `11` | Source/setup/resource/output-contract evidence only. | Overall adapter statuses: | Status | Count | | --- | ---: | -| `pass` | `3` | -| `wrong_result` | `5` | +| `pass` | `4` | +| `wrong_result` | `6` | | `lifecycle_fail` | `1` | | `blocked` | `5` | | `not_encoded` | `7` | @@ -130,7 +133,7 @@ one misleading score. | Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. | | Consolidation | ELF fixture passes, but live proposal generation is not encoded. | Build reviewable derived proposals with source refs, confidence, unsupported-claim flags, and apply/defer/discard audit. | | Knowledge pages | ELF fixture pages pass; live knowledge generation is not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, and graphify reports behind rebuild/lint benchmarks. | -| Operator debugging | Fixture UX passes; live trace/viewer scoring is not encoded. | Make viewer/CLI debugging a scored live surface, not just an admin convenience. 
| +| Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture capture boundary passes; live capture is not encoded. | Borrow agentmemory/claude-mem capture hooks while preserving redaction and evidence binding. | | Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | | Personalization | ELF live personalization passes; mem0/OpenMemory and Letta are not encoded. | Add entity-scoped preference history and UI readback before claiming stronger personalization. | @@ -184,11 +187,13 @@ near tie. - Benchmark gate: qmd deep profile plus ELF/qmd trace-level replay report. 3. Live operator debugging UX - - Current state: fixture pass, live `not_encoded`. + - Current state: fixture pass; narrow live ELF/qmd slice scored with ELF `pass` + and qmd `wrong_result`. - Borrow from: claude-mem viewer, OpenMemory inspector, qmd command output. - - Target: no raw SQL needed to explain a bad memory result. - - Benchmark gate: live operator-debugging jobs score trace hydration, stage - attribution, and repair-action clarity. + - Target: no raw SQL needed to explain a bad memory result, across service traces, + CLI replay, and bounded local viewer surfaces. + - Benchmark gate: add OpenMemory and claude-mem UI/export or viewer runners before + claiming broader operator-debug UX superiority. ### P1 - Turn ELF Into A Better Daily Memory Product @@ -253,7 +258,8 @@ Do not claim: fails closed without an operator-owned manifest. - ELF beats OpenViking on context trajectory. That scenario is not encoded. 
- ELF beats mem0/OpenMemory on hosted memory, entity history, UI, or optional graph - memory. Those scenarios are not encoded. + memory. Those scenarios are not encoded; the operator-debug win is only against + qmd on a narrow trace/replay slice. - ELF beats Letta on core-vs-archival memory. That scenario is not encoded. - ELF beats RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, or graphify on graph/RAG navigation. Current evidence is research-gate or blocked except graphify's tiny @@ -278,7 +284,7 @@ The next reporting work should be ordered by decision value: 1. ELF/qmd retrieval-debug deep profile. 2. ELF live memory-evolution repair report. -3. Operator-debugging live trace/viewer report. +3. OpenMemory and claude-mem operator-debug UI/export runners. 4. Capture/write-policy live adapter report. 5. OpenViking context-trajectory report after evidence-bearing retrieval works. 6. RAG/graph adapter pack report after Docker-contained outputs map to evidence ids. diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md index e3a7a7c7..aa6213ae 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md @@ -32,8 +32,12 @@ The resulting narrow position: - Replay command locality: ELF `loss` against qmd. - ELF trace/admin replay surface: `tie` as an available but different replay surface, not a default-artifact win. +- Operator-debug trace hydration and candidate-drop visibility: ELF `win` against qmd + in the narrow XY-932 live slice; replay-command availability and repair-action + clarity are `tie`. - Expansion, dense/sparse contribution, fusion, and candidate-drop diagnostics: - `not_tested` until comparable stage artifacts are emitted. + `not_tested` outside the operator-debug slice until comparable stage artifacts are + emitted. 
- Rerank stage scoring: `non_goal` for the current qmd stress path because it uses `--no-rerank`. - Wrong-result selected-but-not-narrated diagnosis: `tie` on typed non-pass @@ -48,9 +52,11 @@ This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. | ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | | ELF | Admin trace bundle hydration | `curl -fsS 'http://127.0.0.1:51891/v2/admin/traces//bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: ' -H 'X-ELF-Project-Id: ' -H 'X-ELF-Agent-Id: '` | `elf.trace_bundle/v1` response from the admin service | | ELF | Trace ranking replay | `cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id ` | JSON trace compare output over `search_trace_candidates` | +| ELF | Operator-debug live trace slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` and `summary.json` | | qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | | qmd | Per-query CLI replay | `npx tsx src/cli/qmd.ts query 'lex: \nvec: ' -c elfbench --json --no-rerank --min-score 0 -n 10` | JSON top-10 rows with `file`, line/snippet/score fields when qmd returns them | | qmd | Lifecycle replay | `npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... 
--json --no-rerank` | `tmp/live-baseline/qmd-query.json` checks for update, delete, and cold-start recovery | +| qmd | Operator-debug live replay slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` and `summary.json` | ## Scenario Outcomes @@ -60,6 +66,11 @@ This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. | Default top-10 candidate artifact | `live_baseline_only` | `pass` | `loss` | qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report. | | Replay command locality | `live_baseline_only` | `pass` | `loss` | qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids. | | Trace/admin replay surface availability | `implementation_reference` | `not_encoded` | `tie` | ELF has admin trace bundles and `elf-eval` trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality. | +| Operator-debug trace hydration | `live_real_world` | `pass` | `win` | ELF live operator-debug jobs generate trace ids, viewer URLs, admin trace-bundle URLs, and `trace_available=true`; qmd generates local replay commands but no service trace hydration surface. | +| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` | ELF emits admin trace-bundle curl commands and qmd emits local CLI query replay commands for the same operator-debugging scenarios; this scores command availability, not equivalent UI quality. | +| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` | ELF exposes dropped-candidate visibility through generated operator-debug metadata without direct SQL assumptions; qmd exposes top-k replay rows but no intermediate candidate-drop stages in this slice. 
| +| Operator-debug repair-action clarity | `live_real_world` | `pass` | `tie` | Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory and claude-mem UI repair paths remain blocked or not encoded. | +| Operator-debug selected-but-not-narrated evidence | `live_real_world` | `pass` | `win` | The operator-debug slice now scores selected-but-not-narrated evidence as a trace/answer-composition repair surface without direct database inspection. | | Query expansion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows expansion variants or dynamic expansion decisions for both systems. | | Dense/sparse channel attribution | `research_gate` | `not_encoded` | `not_tested` | ELF uses dense plus BM25 and qmd uses structured `lex:` plus `vec:`, but the scored artifacts do not expose comparable per-channel contribution. | | Fusion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows fusion inputs, RRF/weighted-fusion contributions, or fusion-stage candidate drops. | @@ -68,7 +79,7 @@ This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. | Selected-but-not-narrated wrong results | `live_real_world` | `wrong_result` | `tie` | Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing. | | Evidence-absent and tombstone diagnosis | `live_real_world` | `wrong_result` | `win` | ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone. | -Summary: `1` ELF win, `3` ties, `2` ELF losses, `4` not-tested scenarios, `0` +Summary: `4` ELF wins, `5` ties, `2` ELF losses, `4` not-tested scenarios, `0` blocked scenarios, and `1` non-goal scenario. The losses are local-debug artifact losses only. They do not change the retrieval-correctness tie. 
@@ -81,8 +92,9 @@ losses only. They do not change the retrieval-correctness tie. | Sparse retrieval | `not_tested` | qmd `lex:` and ELF BM25 are present in command or service design, but contribution and drops are not scored. | | Fusion | `not_tested` | Fusion candidates and final fusion deltas are not materialized comparably. | | Rerank | `non_goal` | qmd uses `--no-rerank` in the current path; rerank superiority is out of scope for this run. | -| Candidate drops | `not_tested` | No current report can prove retrieved-but-dropped evidence for qmd, and ELF candidate bundles are not hydrated into the stress artifact. | +| Candidate drops | `not_tested` globally; `win` in operator-debug slice | No current stress/default report can prove retrieved-but-dropped evidence for qmd, but the XY-932 operator-debug slice scores ELF candidate-drop visibility without direct SQL assumptions. | | Selected-but-not-narrated | `tie` | Both systems have typed memory-evolution wrong-result rows where evidence is selected or available but not narrated as lifecycle history. | +| Operator-debug selected-but-not-narrated | `win` | The XY-932 operator-debug job proves selected-but-not-narrated evidence is visible as a trace/answer-composition repair surface in ELF but not in qmd's generated service-trace metadata. | | Replay commands | `loss` | qmd's local CLI replay is shorter and directly tied to top-10 JSON output. | ## Typed Non-Pass States @@ -92,8 +104,8 @@ The report preserves the wrong-result classes from the June 11 diagnostics: | Class | Current coverage | | --- | --- | | `evidence_absent` | Observed for qmd on verdict caveat, preference rationale, and delete tombstone misses. | -| `retrieved_but_dropped` | Defined but `not_tested`; current artifacts do not expose enough candidate-stage data. | -| `selected_but_not_narrated` | Observed for both ELF and qmd on supersession and temporal-validity jobs. 
| +| `retrieved_but_dropped` | Defined globally as `not_tested`; observed as an ELF operator-debug visibility win in the narrow XY-932 slice. | +| `selected_but_not_narrated` | Observed for both ELF and qmd on supersession and temporal-validity jobs; additionally scored as an ELF operator-debug visibility win in the narrow XY-932 slice. | | `contradicted_by_lifecycle_evidence` | Observed when current, historical, supersession, or tombstone evidence makes the answer incomplete. | These states are typed evidence, not leaderboard shortcuts. A `wrong_result` with @@ -108,10 +120,14 @@ Allowed: CLI replay. - ELF has useful service trace/admin replay surfaces, but they are not yet hydrated into the default stress report as qmd-like candidate artifacts. +- ELF narrowly wins the live operator-debug trace hydration and candidate-drop + visibility slice against qmd; qmd still ties replay-command and repair-action + clarity. - ELF narrowly wins the memory-evolution evidence-retention slice because qmd misses the delete tombstone and two other required evidence links. - Expansion, dense/sparse contribution, fusion, rerank-on quality, and - retrieved-but-dropped candidate diagnosis remain unproven. + broad retrieved-but-dropped candidate diagnosis outside the operator-debug slice + remain unproven. Not allowed: @@ -122,6 +138,8 @@ Not allowed: benchmark report has qmd-level candidate visibility. - Do not score rerank superiority from a qmd `--no-rerank` run. - Do not collapse `not_tested`, `non_goal`, or `wrong_result` into pass evidence. +- Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win + over OpenMemory or claude-mem; those UI paths remain blocked or not encoded. 
## Follow-Up Gate diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md index 584b3142..e10ce945 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -34,6 +34,9 @@ What is proven today: trajectory, mem0/OpenMemory entity history and UI, Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, graph/RAG navigation, agentmemory and claude-mem capture/continuity, and knowledge-page workflows remain non-claims. + The separate XY-932 operator-debug live slice now scores ELF against qmd for trace + hydration and candidate-drop visibility, but does not cover OpenMemory or + claude-mem UI flows. So the current adoption decision can remain "credible for bounded personal production," but the competitiveness objective remains open. @@ -119,19 +122,19 @@ conflict evidence links for current-vs-historical reasoning. ## External Adapter Ledger -The checked-in manifest records 21 adapter records across 17 unique project names. +The checked-in manifest records 23 adapter records across 17 unique project names. | Evidence class | Adapter records | Meaning | | --- | ---: | --- | | `fixture_backed` | `1` | ELF fixture scoring only. | | `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | -| `live_real_world` | `3` | ELF and qmd live real-world sweeps plus graphify's tiny scored Docker smoke. | +| `live_real_world` | `5` | ELF and qmd live real-world sweeps, graphify's tiny scored Docker smoke, and the narrow ELF/qmd operator-debug live slice. | | `research_gate` | `11` | Setup, source, resource, or output-contract gate only. 
| | Overall status | Adapter records | | --- | ---: | -| `pass` | `3` | -| `wrong_result` | `5` | +| `pass` | `4` | +| `wrong_result` | `6` | | `lifecycle_fail` | `1` | | `blocked` | `5` | | `not_encoded` | `7` | @@ -144,8 +147,8 @@ records `unique_project_names: 17` for the full project list including ELF. | Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | | --- | --- | --- | --- | --- | -| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`. | Full live memory evolution, live consolidation, live knowledge pages, live capture, live production ops. | Memory-evolution diagnostic report, then live operator/capture/consolidation reports. | -| qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is one pass behind ELF because qmd misses the delete/TTL tombstone job; same-corpus baseline passes. | Deep retrieval-debug ergonomics and trace replay. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`; narrow operator-debug live slice passes. | Full live memory evolution, live consolidation, live knowledge pages, live capture, live production ops, and broader operator UI runners. | Memory-evolution diagnostic report, then live capture/consolidation/knowledge reports and OpenMemory/claude-mem UI runners. | +| qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is one pass behind ELF because qmd misses the delete/TTL tombstone job; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. 
| qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | | agentmemory | `live_baseline_only` | `lifecycle_fail`. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | | mem0/OpenMemory | `live_baseline_only` | Basic local smoke now passes; history/UI/hosted/graph behavior remains `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | | memsearch | `live_baseline_only` | Basic canonical Markdown reindex/reload smoke now passes; real-world prompt coverage remains `not_encoded`. | Markdown canonical store and local reindex clarity. | Source-of-truth and retrieval-debug real-world adapter report. | @@ -173,7 +176,7 @@ records `unique_project_names: 17` for the full project list including ELF. | Memory evolution | ELF live fails 5/6 jobs; qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence; fixture aggregate passes. | No broad live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | | Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | | Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | -| Operator debugging | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Trace hydration, stage attribution, dropped-candidate, and repair-action scoring. | +| Operator debugging | Fixture aggregate passes; narrow ELF/qmd live operator-debug slice is scored with ELF `pass` and qmd `wrong_result`. 
| Narrow ELF/qmd live claim only: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; replay-command and repair-action clarity are tied. | OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | | Capture/write policy | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | agentmemory/claude-mem style capture with redaction and evidence binding. | | Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | | Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 906c2659..56ec65a5 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -12,7 +12,7 @@ "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", "Private-corpus production quality is blocked until an operator-owned manifest exists.", "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", - "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. 
The XY-923 follow-up now scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, rerank, and candidate-drop diagnosis remain untested." + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and graph/RAG navigation remain unproven. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export and claude-mem viewer workflows remain blocked or not encoded." ] }, "evidence_class_terms": [ @@ -46,6 +46,11 @@ "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "claim": "ELF live service adapter reports 18 pass, 5 wrong_result, 2 blocked, and 13 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 13 not_encoded jobs." }, + { + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json", + "claim": "The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance." 
+ }, { "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", @@ -82,7 +87,11 @@ "scenario_id": "source_of_truth_rebuild_evidence_writes", "title": "Source-of-truth rebuild and evidence-bound writes", "outcome": "win", - "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only" + ], "measured_claim": "ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust_source_of_truth passes in fixture and live sweeps, and production restore/rebuild proof exists.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", @@ -95,192 +104,296 @@ "scenario_id": "work_resume_coding_agent_continuity", "title": "Work resume and coding-agent continuity", "outcome": "tie", - "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only", "blocked", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "blocked", + "not_encoded" + ], "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. agentmemory, claude-mem, and OpenViking continuity strengths remain blocked or not encoded.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md" ], - "follow_up_issues": ["XY-925", "XY-928"], + "follow_up_issues": [ + "XY-925", + "XY-928" + ], "caveat": "The tie is only for encoded live work_resume behavior, not for broad capture hooks or staged context." 
}, { "scenario_id": "project_decisions_reversals", "title": "Project decisions and reversals", "outcome": "tie", - "evidence_classes": ["fixture_backed", "live_real_world", "research_gate", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "research_gate", + "not_encoded" + ], "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. Letta-style core/archival decision memory is not tested.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" ], - "follow_up_issues": ["XY-927"], + "follow_up_issues": [ + "XY-927" + ], "caveat": "No Letta comparison exists until a contained export path is selected." }, { "scenario_id": "retrieval_quality", "title": "Retrieval quality", "outcome": "tie", - "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only" + ], "measured_claim": "ELF and qmd both pass the encoded live retrieval suite and both pass stress/same-corpus retrieval evidence.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" ], - "follow_up_issues": ["XY-923"], + "follow_up_issues": [ + "XY-923" + ], "caveat": "Retrieval correctness is separate from debug/replay ergonomics." }, { "scenario_id": "local_debug_replay_ux", "title": "Retrieval quality and local debug UX", "outcome": "loss", - "evidence_classes": ["live_baseline_only", "research_gate", "wrong_result", "not_encoded"], + "evidence_classes": [ + "live_baseline_only", + "research_gate", + "wrong_result", + "not_encoded" + ], "measured_claim": "The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. 
ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" ], - "follow_up_issues": ["XY-923"], + "follow_up_issues": [ + "XY-923" + ], "caveat": "The loss is a local-debug artifact loss only; retrieval correctness remains tied and no broad qmd-over-ELF memory-system claim is allowed." }, { "scenario_id": "memory_evolution_temporal_history", "title": "Memory evolution and temporal history", "outcome": "loss", - "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only", "wrong_result", "blocked"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "wrong_result", + "blocked" + ], "measured_claim": "ELF fixture memory_evolution passes, but live ELF passes only the delete/TTL job and reports five wrong_result jobs where evidence is retrieved but current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" ], - "follow_up_issues": ["XY-905"], + "follow_up_issues": [ + "XY-905" + ], "caveat": "Graphiti/Zep remains a temporal-validity reference, but its local provider-backed smoke is blocked by provider_api_key_missing." 
}, { "scenario_id": "consolidation_proposal_review", "title": "Consolidation/proposal review", "outcome": "not_tested", - "evidence_classes": ["fixture_backed", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "not_encoded" + ], "measured_claim": "ELF fixture consolidation passes, but live consolidation proposal generation and review-action scoring are not encoded.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" ], - "follow_up_issues": ["XY-926"], + "follow_up_issues": [ + "XY-926" + ], "caveat": "Fixture evidence cannot be promoted into live proposal-quality proof." }, { "scenario_id": "knowledge_page_compilation", "title": "Knowledge page compilation", "outcome": "not_tested", - "evidence_classes": ["fixture_backed", "live_real_world", "wrong_result", "research_gate", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "wrong_result", + "research_gate", + "not_encoded" + ], "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. graphify reaches a tiny scored smoke and remains wrong_result.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], - "follow_up_issues": ["XY-926", "XY-929"], + "follow_up_issues": [ + "XY-926", + "XY-929" + ], "caveat": "llm-wiki, gbrain, GraphRAG, and graphify remain references until representative citation/lint jobs are scored." }, { "scenario_id": "operator_debugging_viewer_ux", "title": "Operator debugging/viewer UX", - "outcome": "not_tested", - "evidence_classes": ["fixture_backed", "live_baseline_only", "blocked", "not_encoded", "research_gate"], - "measured_claim": "ELF fixture operator-debugging UX passes. 
mem0 local SDK get_all readback is measured, but the XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access and must not be inferred from SDK readback. Live trace/viewer scoring and qmd/OpenMemory/claude-mem UX comparisons remain unscored.", + "outcome": "win", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. OpenMemory UI/export remains blocked and claude-mem UI remains not encoded, so this is not a broad viewer-product superiority claim.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + "tmp/real-world-job/operator-ux-live-adapters/summary.json", + "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + ], + "follow_up_issues": [ + "XY-926" ], - "follow_up_issues": ["XY-923", "XY-926"], - "caveat": "No raw-SQL-avoidance or repair-action live benchmark exists yet." + "caveat": "The live slice compares ELF and qmd only; OpenMemory UI/export and claude-mem viewer workflows remain typed blocked or not_encoded until a bounded local runner exists." 
}, { "scenario_id": "capture_write_policy_redaction", "title": "Capture/write policy and redaction", "outcome": "not_tested", - "evidence_classes": ["fixture_backed", "live_baseline_only", "blocked", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "live_baseline_only", + "blocked", + "not_encoded" + ], "measured_claim": "ELF fixture capture/write-policy jobs pass, but live capture integration remains not encoded and agentmemory/claude-mem capture hooks are not comparable yet.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md" ], - "follow_up_issues": ["XY-925", "XY-926"], + "follow_up_issues": [ + "XY-925", + "XY-926" + ], "caveat": "Future evidence must prove redaction, exclusions, evidence binding, and no secret leakage." }, { "scenario_id": "production_ops_restore_backfill", "title": "Production ops, restore, backfill, and rebuild", "outcome": "win", - "evidence_classes": ["live_baseline_only", "blocked"], + "evidence_classes": [ + "live_baseline_only", + "blocked" + ], "measured_claim": "ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence are checked in.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" ], - "follow_up_issues": ["XY-930"], + "follow_up_issues": [ + "XY-930" + ], "caveat": "Private-corpus and credentialed provider gates remain blocked, so this is not private production quality proof." 
}, { "scenario_id": "private_corpus_provider_boundaries", "title": "Private corpus and provider boundaries", "outcome": "blocked", - "evidence_classes": ["blocked"], + "evidence_classes": [ + "blocked" + ], "measured_claim": "The private production profile fails closed without an operator-owned manifest, and provider-backed production-ops gates require explicit credentials.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" ], - "follow_up_issues": ["XY-930"], + "follow_up_issues": [ + "XY-930" + ], "caveat": "The blocker is an input boundary, not a hidden benchmark pass or loss." }, { "scenario_id": "personalization_scoped_preferences", "title": "Personalization and scoped preferences", "outcome": "tie", - "evidence_classes": ["fixture_backed", "live_real_world", "live_baseline_only", "not_encoded"], + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "not_encoded" + ], "measured_claim": "ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" ], - "follow_up_issues": ["XY-927"], + "follow_up_issues": [ + "XY-927" + ], "caveat": "The tie is scoped to encoded personalization and local OSS entity filters; OpenMemory UI readback and long-term preference evolution remain separate surfaces." 
}, { "scenario_id": "context_trajectory_hierarchical_retrieval", "title": "Context trajectory and hierarchical retrieval", "outcome": "not_tested", - "evidence_classes": ["live_baseline_only", "research_gate", "wrong_result", "not_encoded"], + "evidence_classes": [ + "live_baseline_only", + "research_gate", + "wrong_result", + "not_encoded" + ], "measured_claim": "OpenViking reaches the pinned Docker local embedding path but misses expected same-corpus evidence, and staged trajectory/hierarchy scoring is not encoded.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" ], - "follow_up_issues": ["XY-928"], + "follow_up_issues": [ + "XY-928" + ], "caveat": "ELF only has a narrow precondition win over OpenViking, not a trajectory win." }, { "scenario_id": "core_vs_archival_memory", "title": "Core-vs-archival memory", "outcome": "not_tested", - "evidence_classes": ["research_gate", "not_encoded"], + "evidence_classes": [ + "research_gate", + "not_encoded" + ], "measured_claim": "ELF has core block semantics in the service contract, but comparable core-vs-archival benchmark jobs and a contained Letta export path are not encoded.", "command_artifacts": [ "docs/spec/system_elf_memory_service_v2.md", "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" ], - "follow_up_issues": ["XY-927"], + "follow_up_issues": [ + "XY-927" + ], "caveat": "No ELF-over-Letta claim is allowed." 
}, { "scenario_id": "graph_rag_navigation_citations", "title": "Graph/RAG navigation and citations", "outcome": "not_tested", - "evidence_classes": ["smoke_only", "research_gate", "blocked", "wrong_result", "not_encoded"], + "evidence_classes": [ + "smoke_only", + "research_gate", + "blocked", + "wrong_result", + "not_encoded" + ], "measured_claim": "Graph/RAG smokes now produce scored or typed non-pass adapter reports where possible, but broad graph/RAG navigation and citation quality are not tested.", "command_artifacts": [ "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], - "follow_up_issues": ["XY-929"], + "follow_up_issues": [ + "XY-929" + ], "caveat": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, llm-wiki, and gbrain remain blocked, research_gate, or not_encoded; graphify only has a tiny wrong_result smoke." } ], @@ -352,7 +465,8 @@ "ELF has the strongest measured source-of-truth, rebuild, restore, and backfill evidence among the tracked systems.", "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", - "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate." + "Most competitor strengths outside qmd retrieval are not_tested, blocked, smoke_only, or research_gate.", + "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied." 
 ], "not_allowed": [ "Do not claim ELF broadly beats qmd.", @@ -361,7 +475,8 @@ "Do not claim ELF beats OpenViking on staged context trajectory.", "Do not claim ELF beats Letta on core-vs-archival memory.", "Do not claim graph/RAG parity from smoke-only evidence.", - "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score." + "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." ] } } diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json index ebc095d2..42c22615 100644 --- a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json +++ b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json @@ -35,13 +35,14 @@ "debug_ergonomics": "qmd wins the current default top-10 candidate artifact and short replay-command surfaces.", "elf_trace_position": "ELF has service trace, admin bundle, and trace replay surfaces, but they are not hydrated into the default stress report as qmd-like candidate artifacts.", "outcome_counts": { - "win": 1, - "tie": 3, + "win": 4, + "tie": 5, "loss": 2, "not_tested": 4, "blocked": 0, "non_goal": 1 - } + }, + "operator_debug_live_slice": "XY-932 adds a narrow live_real_world operator-debug slice: ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity; qmd ties replay-command availability and repair-action clarity but remains wrong_result for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence."
}, "commands": [ { @@ -146,6 +147,79 @@ "scripts/live-baseline-benchmark.sh" ] }, + { + "scenario_id": "operator_debug_trace_hydration", + "surface": "operator-debug trace hydration", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF live operator-debug jobs generate trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs; qmd generates local replay commands but no service trace hydration surface.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + ] + }, + { + "scenario_id": "operator_debug_replay_command_availability", + "surface": "operator-debug replay command availability", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "ELF emits admin trace-bundle curl commands and qmd emits local CLI query replay commands for the same operator-debugging scenarios; this scores command availability, not equivalent UI quality.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + ] + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "surface": "operator-debug candidate-drop visibility", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF exposes dropped-candidate visibility through generated operator_debug metadata without direct SQL assumptions; qmd exposes top-k replay rows but no intermediate candidate-drop stages in this slice.", + "typed_non_pass_states": [ + "retrieved_but_dropped" + ], + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + 
] + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "surface": "operator-debug repair-action clarity", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory and claude-mem UI repair paths remain blocked or not encoded.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + ] + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "surface": "operator-debug selected-but-not-narrated evidence", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "The operator-debug slice now scores selected-but-not-narrated evidence as a trace/answer-composition repair surface without direct database inspection.", + "typed_non_pass_states": [ + "selected_but_not_narrated" + ], + "artifacts": [ + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json" + ] + }, { "scenario_id": "query_expansion_attribution", "surface": "query expansion attribution", @@ -286,8 +360,10 @@ "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay.", "ELF trace/admin endpoint availability is not proof that the default benchmark report has qmd-level candidate visibility.", "Rerank superiority is not scored from a qmd --no-rerank run.", - "Expansion, dense/sparse contribution, fusion, and retrieved-but-dropped candidate diagnostics remain not_tested.", "Do not claim qmd beats ELF as a memory system overall.", - "Do not collapse not_tested, non_goal, or wrong_result into pass evidence." 
+ "Do not collapse not_tested, non_goal, or wrong_result into pass evidence.", + "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity.", + "Expansion, dense/sparse contribution, fusion, rerank-on quality, and broad retrieved-but-dropped diagnosis outside the operator-debug slice remain unproven.", + "Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win over OpenMemory or claude-mem; those UI paths remain blocked or not encoded." ] } diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json index d11270f4..ab71c30e 100644 --- a/docs/research/2026-06-11-measurement-coverage-audit.json +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -72,88 +72,136 @@ { "suite": "trust_source_of_truth", "jobs": 1, - "elf_status_counts": {"pass": 1}, - "qmd_status_counts": {"pass": 1} + "elf_status_counts": { + "pass": 1 + }, + "qmd_status_counts": { + "pass": 1 + } }, { "suite": "work_resume", "jobs": 5, - "elf_status_counts": {"pass": 5}, - "qmd_status_counts": {"pass": 5} + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } }, { "suite": "retrieval", "jobs": 5, - "elf_status_counts": {"pass": 5}, - "qmd_status_counts": {"pass": 5} + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } }, { "suite": "project_decisions", "jobs": 5, - "elf_status_counts": {"pass": 5}, - "qmd_status_counts": {"pass": 5} + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } }, { "suite": "personalization", "jobs": 1, - "elf_status_counts": {"pass": 1}, - "qmd_status_counts": {"pass": 1} + "elf_status_counts": { + "pass": 1 + }, + "qmd_status_counts": { + "pass": 1 + } }, { "suite": "memory_evolution", "jobs": 6, - "elf_status_counts": {"pass": 1, "wrong_result": 5}, - "qmd_status_counts": 
{"wrong_result": 6} + "elf_status_counts": { + "pass": 1, + "wrong_result": 5 + }, + "qmd_status_counts": { + "wrong_result": 6 + } }, { "suite": "capture_integration", "jobs": 2, - "elf_status_counts": {"not_encoded": 2}, - "qmd_status_counts": {"not_encoded": 2} + "elf_status_counts": { + "not_encoded": 2 + }, + "qmd_status_counts": { + "not_encoded": 2 + } }, { "suite": "consolidation", "jobs": 4, - "elf_status_counts": {"not_encoded": 4}, - "qmd_status_counts": {"not_encoded": 4} + "elf_status_counts": { + "not_encoded": 4 + }, + "qmd_status_counts": { + "not_encoded": 4 + } }, { "suite": "knowledge_compilation", "jobs": 2, - "elf_status_counts": {"not_encoded": 2}, - "qmd_status_counts": {"not_encoded": 2} + "elf_status_counts": { + "not_encoded": 2 + }, + "qmd_status_counts": { + "not_encoded": 2 + } }, { "suite": "operator_debugging_ux", "jobs": 1, - "elf_status_counts": {"not_encoded": 1}, - "qmd_status_counts": {"not_encoded": 1} + "elf_status_counts": { + "not_encoded": 1 + }, + "qmd_status_counts": { + "not_encoded": 1 + } }, { "suite": "production_ops", "jobs": 6, - "elf_status_counts": {"blocked": 2, "not_encoded": 4}, - "qmd_status_counts": {"blocked": 2, "not_encoded": 4} + "elf_status_counts": { + "blocked": 2, + "not_encoded": 4 + }, + "qmd_status_counts": { + "blocked": 2, + "not_encoded": 4 + } } ], "adapter_ledger": { - "adapter_records": 21, + "adapter_records": 23, "unique_project_names": 17, "external_project_count_note": "The generated report field external_project_count reports unique non-ELF project names after the XY-900 runner repair; the manifest has 16 external projects and 17 total project names including ELF.", "evidence_class_counts": { "fixture_backed": 1, "live_baseline_only": 6, - "live_real_world": 3, + "live_real_world": 5, "research_gate": 11 }, "overall_status_counts": { - "pass": 3, - "wrong_result": 5, + "pass": 4, + "wrong_result": 6, "lifecycle_fail": 1, "blocked": 5, "not_encoded": 7 }, - "xy900_update_note": "XY-900 
promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven." + "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", + "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured." }, "claim_boundary": { "elf_vs_qmd": "near_tie_with_narrow_delete_ttl_elf_lead_not_overall_win", diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json index a741778a..f67d9d5f 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -20,20 +20,20 @@ "operator_boundary": "Private corpus and credentialed production-ops checks remain blocked until operator-owned inputs are supplied." 
}, "manifest_summary": { - "adapter_records": 21, + "adapter_records": 23, "project_count": 17, "evidence_class_counts": { "fixture_backed": 1, "live_baseline_only": 6, - "live_real_world": 3, + "live_real_world": 5, "research_gate": 11 }, "overall_status_counts": { "lifecycle_fail": 1, "blocked": 5, "not_encoded": 7, - "pass": 3, - "wrong_result": 5 + "pass": 4, + "wrong_result": 6 } }, "state_taxonomy": [ @@ -90,12 +90,12 @@ "measured_status": "wrong_result", "proof": { "command": "cargo make real-world-memory-live-adapters", - "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md; tmp/real-world-job/operator-ux-live-adapters/elf-report.md" }, "unsupported_or_blocked_status": { "state": "blocked", "typed_reason": "private_manifest_and_provider_credentials", - "details": "Fixture production-ops keeps private corpus and provider credential gates blocked; live sweep keeps broader non-retrieval suites typed non-pass." + "details": "Fixture production-ops keeps private corpus and provider credential gates blocked; the full live sweep keeps broader non-retrieval suites typed non-pass, while the narrow operator-debug slice now passes." }, "benchmark_before_claim": "A full-suite live_real_world pass plus separate private-corpus and credentialed production-ops evidence is required before broad live parity or production proof claims.", "borrow_if_stronger": "Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, and graph/RAG navigation patterns where they remain stronger." 
@@ -112,14 +112,14 @@ "measured_status": "wrong_result", "proof": { "command": "cargo make real-world-memory-live-adapters", - "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md; tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" }, "unsupported_or_blocked_status": { "state": "not_encoded", "typed_reason": "deep_profile_and_non_retrieval_suites_not_encoded", - "details": "The full live sweep passes targeted retrieval suites but keeps memory_evolution wrong_result and several broader suites not_encoded or blocked." + "details": "The full live sweep passes targeted retrieval suites but keeps memory_evolution wrong_result and several broader suites not_encoded or blocked; the narrow operator-debug slice ties replay commands but is wrong_result for trace hydration and candidate-drop visibility." }, - "benchmark_before_claim": "Run qmd deep retrieval/debug profile and full-suite live real-world replay with trace-level diagnostics before claiming ELF wins, ties, or loses on retrieval debugging.", + "benchmark_before_claim": "Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist.", "borrow_if_stronger": "Borrow transparent local knobs for query rewriting, weighted fusion, rerank explanation, and command-line replay." 
}, { @@ -491,11 +491,11 @@ { "scenario_id": "operator_debugging", "scenario": "operator debugging", - "current_elf_evidence": "ELF fixture-backed operator_debugging_ux passes, but ELF live_real_world operator_debugging_ux is not_encoded.", + "current_elf_evidence": "ELF fixture-backed operator_debugging_ux passes, and the narrow live_real_world operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity.", "strongest_competitor_or_reference": "qmd, claude-mem, OpenMemory", - "current_competitor_evidence": "qmd has local debug strengths but operator_debugging_ux is not_encoded in live sweeps; claude-mem and OpenMemory UX are not_encoded.", - "current_state": "Operator debugging remains mostly product/UX evidence, not comparable live benchmark evidence.", - "next_measurement": "Score trace hydration, candidate-stage attribution, raw-SQL avoidance, and repair-action clarity through live viewer or CLI artifacts." + "current_competitor_evidence": "qmd now has a narrow live_real_world operator-debug slice: replay-command availability and repair-action clarity pass, but trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence are wrong_result. claude-mem and OpenMemory UX remain not_encoded or blocked.", + "current_state": "ELF has a narrow comparable live win over qmd for trace hydration and candidate-drop visibility, while OpenMemory and claude-mem UI workflows remain unmeasured.", + "next_measurement": "Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim." 
}, { "scenario_id": "capture_write_policy", diff --git a/scripts/real-world-operator-debug-live-adapters.sh b/scripts/real-world-operator-debug-live-adapters.sh new file mode 100755 index 00000000..f027fe4d --- /dev/null +++ b/scripts/real-world-operator-debug-live-adapters.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-job/operator-ux-live-adapters}" +FIXTURE_DIR="${ELF_OPERATOR_DEBUG_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +WORK_DIR="${ELF_OPERATOR_DEBUG_LIVE_WORK_DIR:-/bench/operator-debug-live-adapters}" +QMD_DIR="${ELF_OPERATOR_DEBUG_QMD_DIR:-/bench/repos/qmd}" + +if [[ ! -f "/.dockerenv" && "${ELF_OPERATOR_DEBUG_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run operator-debug live adapters outside Docker. Use cargo make real-world-job-operator-ux-live-adapters." >&2 + exit 1 +fi + +for cmd in bash cargo git jq npm npx; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in operator-debug live adapter runner." 
>&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${WORK_DIR}" +rm -rf "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/qmd-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/qmd-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/qmd-report.json" \ + "${REPORT_DIR:?}/qmd-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml \ + --adapter-id elf_operator_debug_live + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-operator-debug-live-elf \ + --adapter-id elf_operator_debug_live \ + --adapter-name "ELF live operator-debug service adapter" \ + --adapter-behavior live_operator_debug_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, search_raw trace ids, and operator-debug trace metadata." 
+ +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/elf-report.json" \ + --out "${REPORT_DIR}/elf-report.md" + +cargo run -p elf-eval --bin real_world_live_adapter -- qmd \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/qmd-fixtures" \ + --evidence-out "${REPORT_DIR}/qmd-materialization.json" \ + --qmd-dir "${QMD_DIR}" \ + --work-dir "${WORK_DIR}/qmd" \ + --adapter-id qmd_operator_debug_live + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/qmd-fixtures" \ + --out "${REPORT_DIR}/qmd-report.json" \ + --run-id real-world-operator-debug-live-qmd \ + --adapter-id qmd_operator_debug_live \ + --adapter-name "qmd live operator-debug CLI adapter" \ + --adapter-behavior live_operator_debug_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, query --json, and local replay command metadata; ELF trace/viewer surfaces are not inferred." 
+ +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/qmd-report.json" \ + --out "${REPORT_DIR}/qmd-report.md" + +jq -n \ + --slurpfile elf_materialization "${REPORT_DIR}/elf-materialization.json" \ + --slurpfile qmd_materialization "${REPORT_DIR}/qmd-materialization.json" \ + --slurpfile elf_report "${REPORT_DIR}/elf-report.json" \ + --slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \ + '{ + schema: "elf.real_world_operator_debug_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR // "tmp/real-world-job/operator-ux-live-adapters"), + fixture_dir: (env.ELF_OPERATOR_DEBUG_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux"), + adapters: [ + { + adapter_id: "elf_operator_debug_live", + evidence_class: "live_real_world", + materialization: $elf_materialization[0], + report: { + json: "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + markdown: "tmp/real-world-job/operator-ux-live-adapters/elf-report.md", + summary: $elf_report[0].summary, + suites: $elf_report[0].suites + } + }, + { + adapter_id: "qmd_operator_debug_live", + evidence_class: "live_real_world", + materialization: $qmd_materialization[0], + report: { + json: "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + markdown: "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md", + summary: $qmd_report[0].summary, + suites: $qmd_report[0].suites + } + } + ], + scenario_dimensions: [ + "trace_available", + "replay_command_available", + "candidate_drop_visibility", + "repair_action_clarity", + "raw_sql_needed" + ], + boundary: "This narrow sweep scores operator-debugging fixtures only. It does not change core ranking, launch OpenMemory or claude-mem UI flows, or convert fixture-only UX evidence into broad product superiority." 
+ }' >"${REPORT_DIR}/summary.json" + +echo "Operator-debug live adapter reports:" +echo " ${REPORT_DIR}/elf-report.json" +echo " ${REPORT_DIR}/elf-report.md" +echo " ${REPORT_DIR}/qmd-report.json" +echo " ${REPORT_DIR}/qmd-report.md" +echo " ${REPORT_DIR}/summary.json"