diff --git a/Makefile.toml b/Makefile.toml index 838c9a33..21568da1 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -356,14 +356,17 @@ args = [ # Real-world job benchmark smoke -# | task | type | cwd | -# | --------------------------- | --------- | --- | -# | real-world-job-smoke | composite | | -# | real-world-job-smoke-json | command | | -# | real-world-job-smoke-report | command | | -# | real-world-memory | composite | | -# | real-world-memory-json | command | | -# | real-world-memory-report | command | | +# | task | type | cwd | +# | -------------------------------- | --------- | --- | +# | real-world-job-smoke | composite | | +# | real-world-job-smoke-json | command | | +# | real-world-job-smoke-report | command | | +# | real-world-memory | composite | | +# | real-world-memory-json | command | | +# | real-world-memory-report | command | | +# | real-world-job-operator-ux | composite | | +# | real-world-job-operator-ux-json | command | | +# | real-world-job-operator-ux-report | command | | [tasks.real-world-job-smoke] workspace = false @@ -457,6 +460,55 @@ args = [ "tmp/real-world-memory/real-world-memory-report.md", ] +[tasks.real-world-job-operator-ux] +workspace = false +dependencies = [ + "real-world-job-operator-ux-report", +] + +[tasks.real-world-job-operator-ux-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.json", + "--run-id", + "real-world-job-operator-ux", + "--adapter-id", + "fixture_operator_ux", + "--adapter-name", + "ELF operator UX fixture", +] + +[tasks.real-world-job-operator-ux-report] +workspace = false +dependencies = [ + "real-world-job-operator-ux-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + 
"tmp/real-world-job/real-world-job-operator-ux-report.json", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.md", +] + # Meta # | task | type | cwd | diff --git a/apps/elf-api/src/routes.rs b/apps/elf-api/src/routes.rs index 2f6e6516..3887ba2d 100644 --- a/apps/elf-api/src/routes.rs +++ b/apps/elf-api/src/routes.rs @@ -2969,6 +2969,8 @@ mod tests { assert!(html.contains("Providers And Ranking")); assert!(html.contains("Relation Context")); assert!(html.contains("directTraceId")); + assert!(html.contains("trace_id")); + assert!(html.contains("loadInitialTrace")); assert!(!html.contains("method: \"PATCH\"")); assert!(!html.contains("method: \"PUT\"")); assert!(!html.contains("method: \"DELETE\"")); diff --git a/apps/elf-api/static/viewer.html b/apps/elf-api/static/viewer.html index f25cb956..05de83af 100644 --- a/apps/elf-api/static/viewer.html +++ b/apps/elf-api/static/viewer.html @@ -1506,6 +1506,30 @@

Recent Traces

$$(".nav button").forEach((node) => node.classList.toggle("active", node.dataset.tab === tabId)); } + function initialTraceId() { + const params = new URLSearchParams(window.location.search); + const queryTrace = params.get("trace_id") || params.get("traceId"); + if (queryTrace && queryTrace.trim()) { + return queryTrace.trim(); + } + const hash = window.location.hash.replace(/^#/, ""); + if (!hash) { + return ""; + } + const hashParams = new URLSearchParams(hash.includes("=") ? hash : `trace_id=${hash}`); + const hashTrace = hashParams.get("trace_id") || hashParams.get("traceId"); + return hashTrace ? hashTrace.trim() : ""; + } + + async function loadInitialTrace() { + const traceId = initialTraceId(); + if (!traceId) { + return; + } + showTab("tracesView"); + await loadTraceBundle(traceId, $("#traceBundleDetail")); + } + async function refreshActive() { if (state.activeTab === "searchView") { if (state.session) { @@ -1537,6 +1561,7 @@

Recent Traces

loadContext(); bindEvents(); + loadInitialTrace(); diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json new file mode 100644 index 00000000..32daf4f8 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json @@ -0,0 +1,124 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-dropped-evidence-001", + "suite": "operator_debugging_ux", + "title": "Debug expected evidence dropped after recall filtering", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-dropped-expected", + "kind": "trace", + "text": "Trace 11111111-1111-4111-8111-111111111111 shows the expected note present in recall.candidates before service-side filtering and absent after the read-profile scope filter.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-expected"}}, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "trace-dropped-decoy", + "kind": "note", + "text": "Decoy note: the auth retry policy note ranked first but does not explain the missing expected deployment evidence.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-decoy"}}, + "created_at": "2026-06-09T02:01:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The auth retry policy note is the root cause; no expected deployment evidence was dropped.", + "claims": [ + { + "claim_id": "wrong_root_cause", + "text": "No expected evidence was dropped.", + "evidence_ids": ["trace-dropped-decoy"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-dropped-decoy"], + 
"latency_ms": 2.4, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "expected-evidence-recalled", + "ts": "2026-06-09T02:00:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-dropped-expected"], + "summary": "The trace captured recall-stage visibility for the expected evidence before filtering." + } + ], + "prompt": { + "role": "user", + "content": "Why did the memory result miss the expected deployment evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "avoid_repeating_completed_work"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "The expected evidence was dropped after recall by the read-profile filter." + } + ], + "must_not_include": ["No expected deployment evidence was dropped."], + "evidence_links": {"root_cause": ["trace-dropped-expected"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-dropped-expected", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "present in recall.candidates before service-side filtering and absent after the read-profile scope filter" + } + ], + "negative_traps": [ + { + "trap_id": "decoy-top-auth-note", + "type": "decoy_evidence", + "evidence_ids": ["trace-dropped-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Identifies the trace stage that dropped expected evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Uses trace evidence rather than the decoy top note."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Names a concrete repair action."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports the correct root cause."} + 
}, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "expected_evidence_dropped", + "trace_id": "11111111-1111-4111-8111-111111111111", + "viewer_url": "/viewer?trace_id=11111111-1111-4111-8111-111111111111", + "admin_trace_bundle_url": "/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The expected candidate survived recall but was removed by the read-profile scope filter before final selection.", + "steps_to_root_cause": 4, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in Retrieval Funnel and Replay Candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Retrieval Funnel", "Replay Candidates", "Stage Details"], + "cli_steps": ["open viewer trace link", "compare recall before and after filter", "inspect replay candidates", "repair read profile or grant"], + "trace_evidence": ["trace-dropped-expected"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json new file mode 100644 index 00000000..c1562e83 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json @@ -0,0 +1,107 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-provider-latency-001", + "suite": "operator_debugging_ux", + "title": "Debug provider latency degrading retrieval quality", + "corpus": { + 
"corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-provider-timeout", + "kind": "trace", + "text": "Trace 33333333-3333-4333-8333-333333333333 records provider metadata with embedding provider latency near timeout and expansion fallback to the original query only.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-provider-timeout"}}, + "created_at": "2026-06-09T02:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "Provider latency caused expansion fallback to the original query only, which reduced candidate recall.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Provider latency caused expansion fallback to the original query only.", + "evidence_ids": ["trace-provider-timeout"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-provider-timeout"], + "latency_ms": 4.8, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "provider-timeout-recorded", + "ts": "2026-06-09T02:10:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-provider-timeout"], + "summary": "Provider metadata and stage details recorded degraded expansion behavior." + } + ], + "prompt": { + "role": "user", + "content": "Why did recall get worse during the slow provider window?", + "job_mode": "debug", + "constraints": ["cite_evidence", "state_blockers"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Provider latency caused expansion fallback to the original query only." 
+ } + ], + "must_not_include": ["The corpus did not contain the expected evidence."], + "evidence_links": {"root_cause": ["trace-provider-timeout"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-provider-timeout", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "embedding provider latency near timeout and expansion fallback to the original query only" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses provider and stage metadata."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites trace provider metadata."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests timeout or provider health repair."}, + "latency_resource": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports latency as part of the root cause."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "provider_latency_or_failure", + "trace_id": "33333333-3333-4333-8333-333333333333", + "viewer_url": "/viewer?trace_id=33333333-3333-4333-8333-333333333333", + "admin_trace_bundle_url": "/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "Provider latency forced fallback behavior, shrinking expanded-query recall.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible as low recall counts rather than a post-recall drop", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + 
"viewer_panels": ["Providers And Ranking", "Stage Summary", "Stage Details"], + "cli_steps": ["open trace bundle", "inspect provider metadata", "compare expanded queries", "raise timeout or repair provider health"], + "trace_evidence": ["trace-provider-timeout"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "agentmemory_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json new file mode 100644 index 00000000..abd8c048 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json @@ -0,0 +1,135 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-rebuild-changed-results-001", + "suite": "operator_debugging_ux", + "title": "Debug result changes after Qdrant rebuild", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-before-rebuild", + "kind": "trace", + "text": "Before rebuild, trace 44444444-4444-4444-8444-444444444440 returned an orphan Qdrant candidate that no longer had an active source-of-truth note.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-before-rebuild"}}, + "created_at": "2026-06-09T02:15:00Z" + }, + { + "evidence_id": "trace-after-rebuild", + "kind": "trace", + "text": "After rebuild, trace 44444444-4444-4444-8444-444444444444 shows the orphan candidate removed and the active Postgres-backed note selected.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-after-rebuild"}}, + "created_at": "2026-06-09T02:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": 
"Rebuild changed results because a stale derived-index candidate was removed and the active Postgres-backed note became top result.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Qdrant rebuild removed a stale derived-index candidate and selected the active source-of-truth note.", + "evidence_ids": ["trace-before-rebuild", "trace-after-rebuild"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-before-rebuild", "trace-after-rebuild"], + "latency_ms": 3.3, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "before-rebuild-trace", + "ts": "2026-06-09T02:15:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-before-rebuild"], + "summary": "The pre-rebuild trace included a stale derived-index candidate." + }, + { + "event_id": "after-rebuild-trace", + "ts": "2026-06-09T02:20:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-after-rebuild"], + "summary": "The post-rebuild trace selected only source-of-truth-backed evidence." + } + ], + "prompt": { + "role": "user", + "content": "Why did search change after rebuild?", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Qdrant rebuild removed a stale derived-index candidate and selected the active source-of-truth note." 
+ } + ], + "must_not_include": ["Postgres source-of-truth changed during rebuild."], + "evidence_links": {"root_cause": ["trace-before-rebuild", "trace-after-rebuild"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-before-rebuild", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "orphan Qdrant candidate that no longer had an active source-of-truth note" + }, + { + "evidence_id": "trace-after-rebuild", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "orphan candidate removed and the active Postgres-backed note selected" + } + ], + "negative_traps": [ + { + "trap_id": "treat-qdrant-as-source-of-truth", + "type": "unsupported_prior", + "evidence_ids": ["trace-before-rebuild"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.3, "max_points": 1.0, "criteria": "Compares before and after trace evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Uses both rebuild traces."}, + "workflow_helpfulness": {"weight": 0.25, "max_points": 1.0, "criteria": "Explains source-of-truth versus derived index repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Does not claim Postgres changed."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rebuild_changed_results", + "trace_id": "44444444-4444-4444-8444-444444444444", + "viewer_url": "/viewer?trace_id=44444444-4444-4444-8444-444444444444", + "admin_trace_bundle_url": 
"/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "Rebuild removed stale derived-index state and restored source-of-truth-backed ranking.", + "steps_to_root_cause": 5, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible by comparing before and after trace candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Replay Candidates", "Selected Final Results"], + "cli_steps": ["open before trace", "open after trace", "compare replay candidates", "confirm active note selected", "keep Qdrant rebuild as repair"], + "trace_evidence": ["trace-before-rebuild", "trace-after-rebuild"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json new file mode 100644 index 00000000..8bdc01e5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json @@ -0,0 +1,121 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-relation-context-mislead-001", + "suite": "operator_debugging_ux", + "title": "Debug relation context that misleads search", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-relation-context", + "kind": "trace", + "text": "Trace 55555555-5555-4555-8555-555555555555 includes relation_context with deprecated predicate deployment_owner pointing to a stale owner, while the selected note text says the current owner is release engineering.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-relation-context"}}, + "created_at": 
"2026-06-09T02:25:00Z" + }, + { + "evidence_id": "stale-relation-fact", + "kind": "adapter_state", + "text": "Stale graph fact: deployment_owner points to the old infra group and should not drive the current answer.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "stale-relation-fact"}}, + "created_at": "2026-06-08T02:25:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "Relation context misled the search because a deprecated deployment_owner fact conflicted with the selected note text.", + "claims": [ + { + "claim_id": "root_cause", + "text": "A deprecated relation_context fact conflicted with the selected note text.", + "evidence_ids": ["trace-relation-context"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-relation-context"], + "latency_ms": 2.9, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "relation-context-trace", + "ts": "2026-06-09T02:25:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-relation-context"], + "summary": "The trace captured relation_context and selected note text for the misleading result." + } + ], + "prompt": { + "role": "user", + "content": "Why did graph context point to the wrong owner?", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "A deprecated relation_context fact conflicted with the selected note text." 
+ } + ], + "must_not_include": ["The old infra group is the current owner."], + "evidence_links": {"root_cause": ["trace-relation-context"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-relation-context", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "relation_context with deprecated predicate deployment_owner pointing to a stale owner" + } + ], + "negative_traps": [ + { + "trap_id": "trust-stale-relation", + "type": "stale_fact", + "evidence_ids": ["stale-relation-fact"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses relation context panel evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites trace relation_context evidence."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests relation invalidation or predicate repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Does not trust the stale owner."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact", "use of a negative trap marked failure_if_used = true"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "relation_context_misled_search", + "trace_id": "55555555-5555-4555-8555-555555555555", + "viewer_url": "/viewer?trace_id=55555555-5555-4555-8555-555555555555", + "admin_trace_bundle_url": "/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "A deprecated graph relation remained visible in relation_context and conflicted with the selected note text.", + 
"steps_to_root_cause": 4, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; misleading context is visible on selected result", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Relation Context", "Stage Details"], + "cli_steps": ["open trace link", "inspect selected result relation count", "open Relation Context", "invalidate stale relation fact"], + "trace_evidence": ["trace-relation-context"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "claude_mem_reference", "openmemory_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json new file mode 100644 index 00000000..5be298b7 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json @@ -0,0 +1,121 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-rerank-bad-candidate-001", + "suite": "operator_debugging_ux", + "title": "Debug rerank promotion of a bad candidate", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-rerank-promotion", + "kind": "trace", + "text": "Trace 22222222-2222-4222-8222-222222222222 shows the correct candidate at retrieval rank 2 and the decoy at retrieval rank 5, then rerank.score promotes the decoy above the correct candidate.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-rerank-promotion"}}, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "rerank-decoy-note", + "kind": "note", + "text": "Decoy note: deployment retry discussion shares query terms but belongs to a different project.", + "source_ref": {"schema": "source_ref/v1", "resolver": 
"real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "rerank-decoy-note"}}, + "created_at": "2026-06-09T02:06:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The wrong result came from rerank.score promoting a cross-project decoy over the correct retrieval candidate.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Rerank promoted a cross-project decoy above the correct retrieval candidate.", + "evidence_ids": ["trace-rerank-promotion"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-rerank-promotion"], + "latency_ms": 2.1, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "rerank-trace-captured", + "ts": "2026-06-09T02:05:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-rerank-promotion"], + "summary": "The trace captured retrieval ranks and rerank scores for the correct and decoy candidates." + } + ], + "prompt": { + "role": "user", + "content": "Explain why the wrong note ranked first.", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Rerank promoted a cross-project decoy above the correct retrieval candidate." 
+ } + ], + "must_not_include": ["The correct candidate was missing from retrieval."], + "evidence_links": {"root_cause": ["trace-rerank-promotion"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-rerank-promotion", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "rerank.score promotes the decoy above the correct candidate" + } + ], + "negative_traps": [ + { + "trap_id": "accept-decoy-as-answer", + "type": "decoy_evidence", + "evidence_ids": ["rerank-decoy-note"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses rerank and replay candidate evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites the trace rather than the decoy note."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests rerank or scope repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Names rerank promotion as the cause."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact", "use of a negative trap marked failure_if_used = true"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rerank_promoted_bad_candidate", + "trace_id": "22222222-2222-4222-8222-222222222222", + "viewer_url": "/viewer?trace_id=22222222-2222-4222-8222-222222222222", + "admin_trace_bundle_url": "/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The correct item was in the candidate set, but rerank.score elevated a cross-project decoy.", + "steps_to_root_cause": 3, + 
"raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; visible with lower final rank in Replay Candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Replay Candidates", "Providers And Ranking"], + "cli_steps": ["open trace bundle", "compare retrieval rank with final rank", "inspect rerank score", "tighten scope or rerank inputs"], + "trace_evidence": ["trace-rerank-promotion"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "qmd_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 2f92dd55..59ee9bd2 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -105,6 +105,7 @@ struct RealWorldJob { negative_traps: Vec, scoring_rubric: ScoringRubric, allowed_uncertainty: AllowedUncertainty, + operator_debug: Option, #[serde(default)] tags: Vec, } @@ -314,6 +315,39 @@ struct CostReport { output_tokens: Option, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct OperatorDebugEvidence { + failure_mode: String, + #[serde(skip_serializing_if = "Option::is_none")] + trace_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + viewer_url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + admin_trace_bundle_url: Option, + root_cause: String, + steps_to_root_cause: u32, + raw_sql_needed: bool, + dropped_candidate_visibility: String, + trace_completeness: String, + repair_action_clarity: String, + #[serde(default)] + viewer_panels: Vec, + #[serde(default)] + cli_steps: Vec, + #[serde(default)] + trace_evidence: Vec, + #[serde(default)] + ux_gaps: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct OperatorUxGap { + gap_id: String, + severity: String, + description: String, + follow_up_issue: String, +} + #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, 
PartialOrd, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] enum TypedStatus { @@ -402,6 +436,14 @@ struct ReportSummary { qdrant_rebuild_case_count: usize, #[serde(default)] qdrant_rebuild_pass_count: usize, + #[serde(default)] + operator_debug_job_count: usize, + #[serde(default)] + raw_sql_needed_count: usize, + #[serde(default)] + trace_incomplete_count: usize, + #[serde(default)] + operator_ux_gap_count: usize, } #[derive(Debug, Deserialize, Serialize)] @@ -457,6 +499,8 @@ struct JobReport { redaction_leak_count: usize, #[serde(default)] qdrant_rebuild_case: bool, + #[serde(skip_serializing_if = "Option::is_none")] + operator_debug: Option, } #[derive(Debug, Deserialize, Serialize)] @@ -509,6 +553,10 @@ struct FailureCounts { missing_evidence: usize, trap_uses: usize, unsupported_claims: usize, + operator_debug_missing: usize, + operator_debug_raw_sql: usize, + operator_debug_trace_gaps: usize, + operator_debug_repair_unclear: usize, } #[derive(Debug, Default)] @@ -627,6 +675,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_required_evidence(job, path)?; validate_scoring_rubric(job, path)?; validate_allowed_uncertainty(job, path)?; + validate_operator_debug(job, path)?; Ok(()) } @@ -854,6 +903,68 @@ fn validate_allowed_uncertainty(job: &RealWorldJob, path: &Path) -> Result<()> { Ok(()) } +fn validate_operator_debug(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(debug) = &job.operator_debug else { + if job.suite == "operator_debugging_ux" { + return Err(eyre::eyre!( + "{} operator_debugging_ux job must include operator_debug.", + path.display() + )); + } + + return Ok(()); + }; + + if debug.failure_mode.trim().is_empty() + || debug.root_cause.trim().is_empty() + || debug.dropped_candidate_visibility.trim().is_empty() + || debug.trace_completeness.trim().is_empty() + || debug.repair_action_clarity.trim().is_empty() + || debug.steps_to_root_cause == 0 + { + return Err(eyre::eyre!("{} has incomplete 
operator_debug evidence.", path.display())); + } + + validate_optional_debug_field(path, debug.trace_id.as_deref(), "trace_id")?; + validate_optional_debug_field(path, debug.viewer_url.as_deref(), "viewer_url")?; + validate_optional_debug_field( + path, + debug.admin_trace_bundle_url.as_deref(), + "admin_trace_bundle_url", + )?; + validate_non_empty_debug_list(path, &debug.viewer_panels, "viewer_panels")?; + validate_non_empty_debug_list(path, &debug.cli_steps, "cli_steps")?; + validate_non_empty_debug_list(path, &debug.trace_evidence, "trace_evidence")?; + + for gap in &debug.ux_gaps { + if gap.gap_id.trim().is_empty() + || gap.severity.trim().is_empty() + || gap.description.trim().is_empty() + || gap.follow_up_issue.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete operator_debug ux_gaps.", path.display())); + } + } + + Ok(()) +} + +fn validate_optional_debug_field(path: &Path, value: Option<&str>, field: &str) -> Result<()> { + if value.is_some_and(|value| value.trim().is_empty()) { + return Err(eyre::eyre!("{} has empty operator_debug {field}.", path.display())); + } + + Ok(()) +} + +fn validate_non_empty_debug_list(path: &Path, values: &[String], field: &str) -> Result<()> { + if values.iter().any(|value| value.trim().is_empty()) { + return Err(eyre::eyre!("{} has empty operator_debug {field} entry.", path.display())); + } + + Ok(()) +} + fn validate_required_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { if OffsetDateTime::parse(value, &Rfc3339).is_err() { return Err(eyre::eyre!("{} has invalid RFC3339 timestamp for {}.", path.display(), id)); @@ -933,6 +1044,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { let missing_evidence = missing_required_evidence(job, &produced_evidence); let trap_ids_used = trap_ids_used(job, &produced_evidence); let mut unsupported_claims = unsupported_claims(job, answer); + let operator_counts = operator_debug_failure_counts(job); let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, 
&trap_ids_used); let counts = FailureCounts { missing_claims: missing_claims.len(), @@ -940,13 +1052,21 @@ fn score_job(job: &RealWorldJob) -> JobScoring { missing_evidence: missing_evidence.len(), trap_uses: trap_ids_used.len(), unsupported_claims: unsupported_claims.len(), + operator_debug_missing: operator_counts.operator_debug_missing, + operator_debug_raw_sql: operator_counts.operator_debug_raw_sql, + operator_debug_trace_gaps: operator_counts.operator_debug_trace_gaps, + operator_debug_repair_unclear: operator_counts.operator_debug_repair_unclear, }; let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); let wrong_result_count = counts.missing_claims + counts.forbidden_claims + counts.missing_evidence - + counts.trap_uses; + + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear; let status = job_status( normalized_score, job.scoring_rubric.pass_threshold, @@ -972,6 +1092,22 @@ fn score_job(job: &RealWorldJob) -> JobScoring { } } +fn operator_debug_failure_counts(job: &RealWorldJob) -> FailureCounts { + let Some(debug) = &job.operator_debug else { + return FailureCounts { + operator_debug_missing: usize::from(job.suite == "operator_debugging_ux"), + ..FailureCounts::default() + }; + }; + + FailureCounts { + operator_debug_raw_sql: usize::from(debug.raw_sql_needed), + operator_debug_trace_gaps: usize::from(debug.trace_completeness != "complete"), + operator_debug_repair_unclear: usize::from(debug.repair_action_clarity != "clear"), + ..FailureCounts::default() + } +} + fn produced_answer(job: &RealWorldJob) -> &ProducedAnswer { job.corpus .adapter_response @@ -1152,12 +1288,20 @@ fn dimension_scores(job: &RealWorldJob, counts: &FailureCounts) -> Vec f64 { let failed = match dimension_id { "answer_correctness" | "workflow_helpfulness" => - counts.missing_claims > 0 || counts.forbidden_claims > 
0, + counts.missing_claims > 0 + || counts.forbidden_claims > 0 + || counts.operator_debug_repair_unclear > 0, "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0, "trap_avoidance" => counts.trap_uses > 0, "uncertainty_handling" => counts.unsupported_claims > 0, "lifecycle_behavior" => false, - "debuggability" | "latency_resource" | "personalization_fit" => + "debuggability" => + counts.missing_claims > 0 + || counts.unsupported_claims > 0 + || counts.operator_debug_missing > 0 + || counts.operator_debug_raw_sql > 0 + || counts.operator_debug_trace_gaps > 0, + "latency_resource" | "personalization_fit" => counts.missing_claims > 0 || counts.unsupported_claims > 0, _ => counts.missing_claims > 0 || counts.unsupported_claims > 0 || counts.trap_uses > 0, }; @@ -1203,6 +1347,10 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.forbidden_claims + counts.missing_evidence + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear ), TypedStatus::WrongResult => format!( "Job produced {} wrong-result signal(s) and normalized_score {normalized_score:.3}.", @@ -1210,6 +1358,10 @@ fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64 + counts.forbidden_claims + counts.missing_evidence + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear ), _ => "Job did not reach a runnable scoring state.".to_string(), } @@ -1248,6 +1400,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { scope_violation_count: metrics.scope_violation_count, redaction_leak_count: metrics.redaction_leak_count, qdrant_rebuild_case: metrics.qdrant_rebuild_case, + operator_debug: job.operator_debug.clone(), } } @@ -1472,6 +1625,22 @@ fn report_summary(jobs: &[JobReport], 
suites: &[SuiteReport]) -> ReportSummary { .iter() .filter(|job| job.qdrant_rebuild_case && job.status == TypedStatus::Pass) .count(), + operator_debug_job_count: jobs.iter().filter(|job| job.operator_debug.is_some()).count(), + raw_sql_needed_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .filter(|debug| debug.raw_sql_needed) + .count(), + trace_incomplete_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .filter(|debug| debug.trace_completeness != "complete") + .count(), + operator_ux_gap_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .map(|debug| debug.ux_gaps.len()) + .sum(), ..ReportSummary::default() }; @@ -1586,6 +1755,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_header(&mut out, report, report_path.as_str()); render_markdown_suites(&mut out, report); render_markdown_jobs(&mut out, report); + render_markdown_operator_debugging(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_semantics(&mut out, report); @@ -1661,6 +1831,16 @@ fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_pat optional_f64(report.summary.mean_latency_ms, " ms") )); out.push_str(&format!("- Cost: `{}`\n", cost_display(report.summary.total_cost.as_ref()))); + out.push_str(&format!( + "- Operator-debug jobs: `{}`\n", + report.summary.operator_debug_job_count + )); + out.push_str(&format!("- Raw SQL needed: `{}`\n", report.summary.raw_sql_needed_count)); + out.push_str(&format!( + "- Trace-incomplete debug jobs: `{}`\n", + report.summary.trace_incomplete_count + )); + out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); out.push_str(&format!( "- Private corpus redaction: `{}`\n\n", md_inline(report.private_corpus_redaction.policy.as_str()) @@ -1722,6 +1902,94 @@ fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { out.push('\n'); } +fn 
render_markdown_operator_debugging(out: &mut String, report: &RealWorldReport) { + let jobs = report.jobs.iter().filter(|job| job.operator_debug.is_some()).collect::>(); + + out.push_str("## Operator Debugging UX\n\n"); + + if jobs.is_empty() { + out.push_str("No encoded job reported operator debugging evidence.\n\n"); + + return; + } + + out.push_str("| Job | Failure Mode | Trace Evidence | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps |\n"); + out.push_str("| --- | --- | --- | ---: | --- | --- | --- | --- | --- |\n"); + + for job in jobs { + if let Some(debug) = &job.operator_debug { + out.push_str(&format!( + "| {} | {} | {} | {} | `{}` | {} | `{}` | `{}` | {} |\n", + md_cell(job.job_id.as_str()), + md_cell(debug.failure_mode.as_str()), + debug_trace_cell(debug), + debug.steps_to_root_cause, + debug.raw_sql_needed, + md_cell(debug.dropped_candidate_visibility.as_str()), + md_inline(debug.trace_completeness.as_str()), + md_inline(debug.repair_action_clarity.as_str()), + ux_gap_cell(debug.ux_gaps.as_slice()) + )); + } + } + + out.push_str("\n### Operator Debug Details\n\n"); + + for job in report.jobs.iter().filter(|job| job.operator_debug.is_some()) { + if let Some(debug) = &job.operator_debug { + out.push_str(&format!("#### `{}`\n\n", md_inline(job.job_id.as_str()))); + out.push_str(&format!("- Root cause: {}\n", md_cell(debug.root_cause.as_str()))); + out.push_str(&format!( + "- Viewer panels: `{}`\n", + md_inline(debug.viewer_panels.join(", ").as_str()) + )); + out.push_str(&format!( + "- CLI steps: `{}`\n", + md_inline(debug.cli_steps.join(" -> ").as_str()) + )); + out.push_str(&format!( + "- Trace evidence: `{}`\n", + md_inline(debug.trace_evidence.join(", ").as_str()) + )); + out.push('\n'); + } + } +} + +fn debug_trace_cell(debug: &OperatorDebugEvidence) -> String { + let trace = debug.trace_id.as_deref().unwrap_or("-"); + let viewer = debug + .viewer_url + .as_deref() + .map(|url| format!("[viewer]({})", 
md_url(url))) + .unwrap_or_else(|| "viewer: -".to_string()); + let bundle = debug + .admin_trace_bundle_url + .as_deref() + .map(|url| format!("[bundle]({})", md_url(url))) + .unwrap_or_else(|| "bundle: -".to_string()); + + format!("`{}`
{}
{}", md_inline(trace), viewer, bundle) +} + +fn ux_gap_cell(gaps: &[OperatorUxGap]) -> String { + if gaps.is_empty() { + return "`none`".to_string(); + } + + gaps.iter() + .map(|gap| { + format!( + "`{}`: {} ({})", + md_inline(gap.gap_id.as_str()), + md_cell(gap.description.as_str()), + md_inline(gap.follow_up_issue.as_str()) + ) + }) + .collect::>() + .join("
") +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -1838,6 +2106,10 @@ fn md_cell(value: &str) -> String { md_inline(value).replace('|', "\\|") } +fn md_url(value: &str) -> String { + value.replace(')', "%29").replace(' ', "%20") +} + fn round3(value: f64) -> f64 { (value * 1_000.0).round() / 1_000.0 } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 512da9f1..8c53299c 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -23,6 +23,10 @@ fn real_world_memory_fixture_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") } +fn operator_debug_fixture_dir() -> PathBuf { + fixture_root().join("operator_debugging_ux") +} + fn run_json_report_from(fixtures: PathBuf) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -99,7 +103,47 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + + let suites = array_at(&report, "/suites")?; + let operator_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + + assert_eq!(operator_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + Ok(()) +} + +#[test] +fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<()> { + let report = run_json_report_from(operator_debug_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + report.pointer("/summary/operator_debug_job_count").and_then(Value::as_u64), + Some(5) + ); + 
assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + let jobs = array_at(&report, "/jobs")?; + let dropped = find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; + + assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + dropped.pointer("/operator_debug/raw_sql_needed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + dropped.pointer("/operator_debug/dropped_candidate_visibility").and_then(Value::as_str), + Some("visible in Retrieval Funnel and Replay Candidates") + ); + assert_eq!( + dropped.pointer("/operator_debug/viewer_url").and_then(Value::as_str), + Some("/viewer?trace_id=11111111-1111-4111-8111-111111111111") + ); Ok(()) } @@ -135,6 +179,7 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("# Real-World Job Benchmark Report")); assert!(markdown.contains("work_resume")); assert!(markdown.contains("issue-xy812-resume")); + assert!(markdown.contains("## Operator Debugging UX")); assert!(markdown.contains("Existing live-baseline reports remain valid")); Ok(()) @@ -188,3 +233,41 @@ fn real_world_memory_fixtures_report_trust_and_personalization_metrics() -> Resu Ok(()) } + +#[test] +fn operator_debug_json_report_renders_markdown_links() -> Result<()> { + let report = run_json_report_from(operator_debug_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-job-operator-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("operator.json"); + let markdown_path = 
temp_dir.join("operator.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("operator-debug-dropped-evidence-001")); + assert!(markdown.contains("/viewer?trace_id=11111111-1111-4111-8111-111111111111")); + assert!(markdown.contains("Raw SQL")); + assert!(markdown.contains("Replay Candidates")); + assert!(markdown.contains("Root cause")); + + Ok(()) +} diff --git a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md new file mode 100644 index 00000000..ac2415fe --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md @@ -0,0 +1,132 @@ +# Real-World Job Benchmark Report + +Goal: Publish a Markdown summary for one generated real_world_job benchmark report. +Read this when: You need a durable smoke report for real-world agent memory job fixtures. +Inputs: `tmp/real-world-job/real-world-job-operator-ux-report.json`. +Depends on: `apps/elf-eval/fixtures/real_world_job/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Verification: Compare this Markdown summary with the source JSON before committing. 
+ +## Summary + +- Run ID: `real-world-job-operator-ux` +- Generated at: `2026-06-09T14:52:05.906877Z` +- Runner version: `0.2.0-9b60dee3de54705a71a683d9a36b48d94ce8e752-aarch64-apple-darwin` +- Corpus profile: `synthetic` +- Adapter: `fixture_operator_ux` (offline_fixture_response) +- Jobs: `5` +- Encoded suites: `1` +- Not-encoded suites: `10` +- Status summary: `4` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `0` blocked, `1` unsupported_claim +- Unsupported claim count: `1` +- Wrong-result count: `3` +- Mean score: `0.800` +- Mean latency: `3.100 ms` +- Cost: `0.000 USD` +- Operator-debug jobs: `5` +- Raw SQL needed: `0` +- Trace-incomplete debug jobs: `0` +- Operator UX gaps: `0` +- Private corpus redaction: `no_private_corpus` + +## Suites + +| Suite | Status | Jobs | Score | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| knowledge_compilation | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| operator_debugging_ux | `unsupported_claim` | 5 | `0.800` | 1 | 3 | At least one encoded job produced an unsupported claim. 
| +| capture_integration | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| personalization | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | + +## Jobs + +| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | ---: | --- | --- | ---: | ---: | ---: | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | `unsupported_claim` | `0.000` | `trace-dropped-expected` | `trace-dropped-decoy` | 1 | 3 | `2.400 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-provider-latency-001 | `pass` | `1.000` | `trace-provider-timeout` | `trace-provider-timeout` | 0 | 0 | `4.800 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-rebuild-changed-results-001 | `pass` | `1.000` | `trace-before-rebuild, trace-after-rebuild` | `trace-after-rebuild, trace-before-rebuild` | 0 | 0 | `3.300 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-relation-context-mislead-001 | `pass` | `1.000` | `trace-relation-context` | `trace-relation-context` | 0 | 0 | `2.900 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-rerank-bad-candidate-001 | `pass` | `1.000` | `trace-rerank-promotion` | `trace-rerank-promotion` | 0 | 0 | `2.100 ms` | `0.000 USD` | + +## Operator Debugging UX + +| Job | Failure Mode | Trace Evidence | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps | +| --- | --- | --- | ---: | --- | --- | --- | --- | --- | +| operator-debug-dropped-evidence-001 | expected_evidence_dropped | `11111111-1111-4111-8111-111111111111`
[viewer](/viewer?trace_id=11111111-1111-4111-8111-111111111111)
[bundle](/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | visible in Retrieval Funnel and Replay Candidates | `complete` | `clear` | `none` | +| operator-debug-provider-latency-001 | provider_latency_or_failure | `33333333-3333-4333-8333-333333333333`
[viewer](/viewer?trace_id=33333333-3333-4333-8333-333333333333)
[bundle](/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | visible as low recall counts rather than a post-recall drop | `complete` | `clear` | `none` | +| operator-debug-rebuild-changed-results-001 | rebuild_changed_results | `44444444-4444-4444-8444-444444444444`
[viewer](/viewer?trace_id=44444444-4444-4444-8444-444444444444)
[bundle](/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 5 | `false` | visible by comparing before and after trace candidates | `complete` | `clear` | `none` | +| operator-debug-relation-context-mislead-001 | relation_context_misled_search | `55555555-5555-4555-8555-555555555555`
[viewer](/viewer?trace_id=55555555-5555-4555-8555-555555555555)
[bundle](/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | not dropped; misleading context is visible on selected result | `complete` | `clear` | `none` | +| operator-debug-rerank-bad-candidate-001 | rerank_promoted_bad_candidate | `22222222-2222-4222-8222-222222222222`
[viewer](/viewer?trace_id=22222222-2222-4222-8222-222222222222)
[bundle](/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | not dropped; visible with lower final rank in Replay Candidates | `complete` | `clear` | `none` | + +### Operator Debug Details + +#### `operator-debug-dropped-evidence-001` + +- Root cause: The expected candidate survived recall but was removed by the read-profile scope filter before final selection. +- Viewer panels: `Trace, Retrieval Funnel, Replay Candidates, Stage Details` +- CLI steps: `open viewer trace link -> compare recall before and after filter -> inspect replay candidates -> repair read profile or grant` +- Trace evidence: `trace-dropped-expected` + +#### `operator-debug-provider-latency-001` + +- Root cause: Provider latency forced fallback behavior, shrinking expanded-query recall. +- Viewer panels: `Providers And Ranking, Stage Summary, Stage Details` +- CLI steps: `open trace bundle -> inspect provider metadata -> compare expanded queries -> raise timeout or repair provider health` +- Trace evidence: `trace-provider-timeout` + +#### `operator-debug-rebuild-changed-results-001` + +- Root cause: Rebuild removed stale derived-index state and restored source-of-truth-backed ranking. +- Viewer panels: `Trace, Replay Candidates, Selected Final Results` +- CLI steps: `open before trace -> open after trace -> compare replay candidates -> confirm active note selected -> keep Qdrant rebuild as repair` +- Trace evidence: `trace-before-rebuild, trace-after-rebuild` + +#### `operator-debug-relation-context-mislead-001` + +- Root cause: A deprecated graph relation remained visible in relation_context and conflicted with the selected note text. 
+- Viewer panels: `Selected Final Results, Relation Context, Stage Details` +- CLI steps: `open trace link -> inspect selected result relation count -> open Relation Context -> invalidate stale relation fact` +- Trace evidence: `trace-relation-context` + +#### `operator-debug-rerank-bad-candidate-001` + +- Root cause: The correct item was in the candidate set, but rerank.score elevated a cross-project decoy. +- Viewer panels: `Selected Final Results, Replay Candidates, Providers And Ranking` +- CLI steps: `open trace bundle -> compare retrieval rank with final rank -> inspect rerank score -> tighten scope or rerank inputs` +- Trace evidence: `trace-rerank-promotion` + +## Unsupported Claims + +| Suite | Job | Claim | Evidence | Reason | +| --- | --- | --- | --- | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | No expected evidence was dropped. | `trace-dropped-decoy` | claim_id is not present in expected_answer.evidence_links | + +## Result Semantics + +This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms. +It is a real-world job fixture report, not a Docker live-baseline report. +Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. + +- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. +- `wrong_result`: a job completed but missed required answer or evidence expectations. +- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. +- `not_encoded`: a suite has no checked-in real_world_job fixture, so no pass/fail claim is allowed. 
+ +## Not-Encoded Suites + +- `trust_source_of_truth` +- `work_resume` +- `project_decisions` +- `retrieval` +- `memory_evolution` +- `consolidation` +- `knowledge_compilation` +- `capture_integration` +- `production_ops` +- `personalization` diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 6f1a606a..06e89da5 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -33,6 +33,9 @@ cleanup, use `docs/guide/single_user_production.md`. - `2026-06-09-production-adoption-gate-report.md`: XY-836 production adoption decision report with fresh provider-backed synthetic, stress, backfill, restore, and external adapter evidence. +- `2026-06-09-operator-debugging-ux-report.md`: checked-in real-world job + operator-debugging UX report with trace/viewer links, raw-SQL avoidance, root-cause + step counts, dropped-candidate visibility, and repair-action clarity. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy and typed report states. diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 6cc18971..b354af1d 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -150,6 +150,30 @@ count, and Qdrant rebuild case/pass counts. The fixtures include negative traps unsupported prior claims, stale deleted facts, cross-project preference leakage, and private/redacted text leakage. +Operator debugging UX increment: + +```sh +cargo make real-world-job-operator-ux +``` + +Artifacts: + +```text +tmp/real-world-job/real-world-job-operator-ux-report.json +tmp/real-world-job/real-world-job-operator-ux-report.md +``` + +The operator UX fixtures live under +`apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/`. 
They cover dropped +expected evidence, rerank promotion of a bad candidate, provider latency or failure, +Qdrant rebuild result changes, and misleading relation context. Reports include direct +viewer and admin trace bundle links, steps to root cause, whether raw SQL was needed, +dropped-candidate visibility, trace completeness, repair-action clarity, and any +encoded UX gaps. + +Checked-in evidence snapshot: +`docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md`. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index fa94656f..5b65c0d0 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -66,6 +66,7 @@ runner execution. "negative_traps": [], "scoring_rubric": {}, "allowed_uncertainty": {}, + "operator_debug": {}, "tags": [] } ``` @@ -86,6 +87,7 @@ runner execution. | `negative_traps` | array | Distractors, stale facts, or misleading memories that must not drive the answer. | | `scoring_rubric` | object | Dimensions, weights, thresholds, and hard-fail rules for this job. | | `allowed_uncertainty` | object | Explicit uncertainty language and fallback behavior accepted for the job. | +| `operator_debug` | object or null | Optional for most suites; required for `operator_debugging_ux` jobs. Records trace/viewer evidence and operator workflow scoring inputs. | | `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. | ### `corpus` @@ -192,6 +194,38 @@ Trap types: Each trap MUST include `trap_id`, `type`, `evidence_ids`, and `failure_if_used`. 
+### `operator_debug` + +`operator_debug` is required when `suite = "operator_debugging_ux"` and optional +elsewhere. It records whether a human operator can identify the root cause through +viewer, trace, or CLI readback without raw SQL. + +Fields (all required except those marked "when available", which may be omitted): + +- `failure_mode`: stable label such as `expected_evidence_dropped`, + `rerank_promoted_bad_candidate`, `provider_latency_or_failure`, + `rebuild_changed_results`, or `relation_context_misled_search`. +- `trace_id`: trace handle when available. +- `viewer_url`: read-only viewer path that opens the trace evidence when available. +- `admin_trace_bundle_url`: direct admin trace bundle path when available. +- `root_cause`: concise expected diagnosis. +- `steps_to_root_cause`: number of viewer or CLI steps needed to reach the diagnosis. +- `raw_sql_needed`: must be `false` for a pass under this suite. +- `dropped_candidate_visibility`: whether dropped, retained, or misleading candidates + are visible through trace/viewer evidence. +- `trace_completeness`: `complete`, `partial`, or `missing`. +- `repair_action_clarity`: `clear`, `partial`, or `missing`. +- `viewer_panels`: viewer panels used, such as `Replay Candidates`, `Stage Details`, + `Providers And Ranking`, or `Relation Context`. +- `cli_steps`: equivalent CLI or endpoint steps. +- `trace_evidence`: evidence ids used for the diagnosis. +- `ux_gaps`: array of focused follow-up pointers when a needed panel or endpoint is + absent. + +Each `ux_gaps[]` entry MUST include `gap_id`, `severity`, `description`, and +`follow_up_issue`. If a fixture requires a missing panel, the report must encode the +gap instead of hiding it behind a wrong-result score. + ### `scoring_rubric` The rubric MUST be job-specific but use the shared dimensions below.