Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 60 additions & 8 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -356,14 +356,17 @@ args = [


# Real-world job benchmark smoke
# | task | type | cwd |
# | --------------------------- | --------- | --- |
# | real-world-job-smoke | composite | |
# | real-world-job-smoke-json | command | |
# | real-world-job-smoke-report | command | |
# | real-world-memory | composite | |
# | real-world-memory-json | command | |
# | real-world-memory-report | command | |
# | task | type | cwd |
# | -------------------------------- | --------- | --- |
# | real-world-job-smoke | composite | |
# | real-world-job-smoke-json | command | |
# | real-world-job-smoke-report | command | |
# | real-world-memory | composite | |
# | real-world-memory-json | command | |
# | real-world-memory-report | command | |
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
# | real-world-job-operator-ux-report | command | |

[tasks.real-world-job-smoke]
workspace = false
Expand Down Expand Up @@ -457,6 +460,55 @@ args = [
"tmp/real-world-memory/real-world-memory-report.md",
]

# Composite entry point for the operator debugging UX benchmark.
# It has no command of its own; it only pulls in the report task below,
# which in turn depends on the JSON task.
[tasks.real-world-job-operator-ux]
workspace = false
dependencies = [
"real-world-job-operator-ux-report",
]

# Runs the `real_world_job_benchmark` binary from the `elf-eval` package with
# the `run` subcommand against the operator_debugging_ux fixture directory and
# writes the JSON report under tmp/real-world-job/.
# The run id, adapter id and adapter name are fixed strings passed as flags.
[tasks.real-world-job-operator-ux-json]
workspace = false
command = "cargo"
args = [
"run",
"-p",
"elf-eval",
"--bin",
"real_world_job_benchmark",
"--",
"run",
"--fixtures",
"apps/elf-eval/fixtures/real_world_job/operator_debugging_ux",
"--out",
"tmp/real-world-job/real-world-job-operator-ux-report.json",
"--run-id",
"real-world-job-operator-ux",
"--adapter-id",
"fixture_operator_ux",
"--adapter-name",
"ELF operator UX fixture",
]

# Runs the same binary with the `publish` subcommand, passing the JSON report
# written by `real-world-job-operator-ux-json` as `--report` and a Markdown
# path in the same directory as `--out`.
# The dependency on the JSON task ensures the `--report` input exists first.
[tasks.real-world-job-operator-ux-report]
workspace = false
dependencies = [
"real-world-job-operator-ux-json",
]
command = "cargo"
args = [
"run",
"-p",
"elf-eval",
"--bin",
"real_world_job_benchmark",
"--",
"publish",
"--report",
"tmp/real-world-job/real-world-job-operator-ux-report.json",
"--out",
"tmp/real-world-job/real-world-job-operator-ux-report.md",
]


# Meta
# | task | type | cwd |
Expand Down
2 changes: 2 additions & 0 deletions apps/elf-api/src/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2969,6 +2969,8 @@ mod tests {
assert!(html.contains("Providers And Ranking"));
assert!(html.contains("Relation Context"));
assert!(html.contains("directTraceId"));
assert!(html.contains("trace_id"));
assert!(html.contains("loadInitialTrace"));
assert!(!html.contains("method: \"PATCH\""));
assert!(!html.contains("method: \"PUT\""));
assert!(!html.contains("method: \"DELETE\""));
Expand Down
25 changes: 25 additions & 0 deletions apps/elf-api/static/viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,30 @@ <h2>Recent Traces</h2>
$$(".nav button").forEach((node) => node.classList.toggle("active", node.dataset.tab === tabId));
}

/**
 * Resolve the trace id requested by the current page URL.
 *
 * Lookup order:
 *   1. query string: `?trace_id=…`, then `?traceId=…`
 *   2. fragment:     `#trace_id=…`, then `#traceId=…`, or a bare `#<id>`
 *
 * Every candidate is trimmed before it is considered, so a blank or
 * whitespace-only value never hides a usable value under the other key.
 *
 * @returns {string} The trimmed trace id, or "" when the URL names none.
 */
function initialTraceId() {
  const traceKeys = ["trace_id", "traceId"];

  // First non-blank value among the accepted keys, or "" if there is none.
  const pickTraceId = (params) => {
    for (const key of traceKeys) {
      const raw = params.get(key);
      const value = raw ? raw.trim() : "";
      if (value) {
        return value;
      }
    }
    return "";
  };

  // The query string takes precedence over the fragment.
  const fromQuery = pickTraceId(new URLSearchParams(window.location.search));
  if (fromQuery) {
    return fromQuery;
  }

  const hash = window.location.hash.replace(/^#/, "");
  if (!hash) {
    return "";
  }

  // A fragment without "=" is treated as the id itself; wrapping it as
  // `trace_id=<hash>` lets URLSearchParams apply the same decoding rules
  // as the query-string path. Note that any bare fragment is therefore
  // interpreted as a trace id.
  const hashQuery = hash.includes("=") ? hash : `trace_id=${hash}`;
  return pickTraceId(new URLSearchParams(hashQuery));
}

/**
 * Open the trace named in the page URL, if there is one.
 *
 * When `initialTraceId()` yields a non-empty id, the traces tab is shown and
 * the trace bundle is loaded into the `#traceBundleDetail` element.
 * When it yields an empty string nothing happens.
 *
 * @returns {Promise<void>} Settles once the bundle load has finished.
 */
async function loadInitialTrace() {
  const requestedTrace = initialTraceId();
  if (requestedTrace) {
    // Switch tabs first so the detail pane is the active view while loading.
    showTab("tracesView");
    const detailPane = $("#traceBundleDetail");
    await loadTraceBundle(requestedTrace, detailPane);
  }
}

async function refreshActive() {
if (state.activeTab === "searchView") {
if (state.session) {
Expand Down Expand Up @@ -1537,6 +1561,7 @@ <h2>Recent Traces</h2>

// Page start-up: the helpers below run in this order when the script loads.
loadContext();
bindEvents();
// loadInitialTrace is async and its promise is not awaited here, so a
// rejection would surface as an unhandled rejection.
// NOTE(review): confirm loadTraceBundle reports its own failures.
loadInitialTrace();
</script>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "operator-debug-dropped-evidence-001",
"suite": "operator_debugging_ux",
"title": "Debug expected evidence dropped after recall filtering",
"corpus": {
"corpus_id": "operator-debugging-ux-2026-06-09",
"profile": "synthetic",
"items": [
{
"evidence_id": "trace-dropped-expected",
"kind": "trace",
"text": "Trace 11111111-1111-4111-8111-111111111111 shows the expected note present in recall.candidates before service-side filtering and absent after the read-profile scope filter.",
"source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-expected"}},
"created_at": "2026-06-09T02:00:00Z"
},
{
"evidence_id": "trace-dropped-decoy",
"kind": "note",
"text": "Decoy note: the auth retry policy note ranked first but does not explain the missing expected deployment evidence.",
"source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-decoy"}},
"created_at": "2026-06-09T02:01:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_operator_ux",
"answer": {
"content": "The auth retry policy note is the root cause; no expected deployment evidence was dropped.",
"claims": [
{
"claim_id": "wrong_root_cause",
"text": "No expected evidence was dropped.",
"evidence_ids": ["trace-dropped-decoy"],
"confidence": "high"
}
],
"evidence_ids": ["trace-dropped-decoy"],
"latency_ms": 2.4,
"cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0}
}
}
},
"timeline": [
{
"event_id": "expected-evidence-recalled",
"ts": "2026-06-09T02:00:00Z",
"actor": "system",
"action": "captured_trace",
"evidence_ids": ["trace-dropped-expected"],
"summary": "The trace captured recall-stage visibility for the expected evidence before filtering."
}
],
"prompt": {
"role": "user",
"content": "Why did the memory result miss the expected deployment evidence?",
"job_mode": "debug",
"constraints": ["cite_evidence", "avoid_repeating_completed_work"]
},
"expected_answer": {
"must_include": [
{
"claim_id": "root_cause",
"text": "The expected evidence was dropped after recall by the read-profile filter."
}
],
"must_not_include": ["No expected deployment evidence was dropped."],
"evidence_links": {"root_cause": ["trace-dropped-expected"]},
"answer_type": "debug_report",
"accepted_alternates": [],
"requires_caveat": false,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "trace-dropped-expected",
"claim_id": "root_cause",
"requirement": "explain",
"quote": "present in recall.candidates before service-side filtering and absent after the read-profile scope filter"
}
],
"negative_traps": [
{
"trap_id": "decoy-top-auth-note",
"type": "decoy_evidence",
"evidence_ids": ["trace-dropped-decoy"],
"failure_if_used": true
}
],
"scoring_rubric": {
"dimensions": {
"debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Identifies the trace stage that dropped expected evidence."},
"evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Uses trace evidence rather than the decoy top note."},
"workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Names a concrete repair action."},
"answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports the correct root cause."}
},
"pass_threshold": 0.8,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": ["The fixture does not provide that evidence."],
"fallback_action": "state_blocker"
},
"operator_debug": {
"failure_mode": "expected_evidence_dropped",
"trace_id": "11111111-1111-4111-8111-111111111111",
"viewer_url": "/viewer?trace_id=11111111-1111-4111-8111-111111111111",
"admin_trace_bundle_url": "/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200",
"root_cause": "The expected candidate survived recall but was removed by the read-profile scope filter before final selection.",
"steps_to_root_cause": 4,
"raw_sql_needed": false,
"dropped_candidate_visibility": "visible in Retrieval Funnel and Replay Candidates",
"trace_completeness": "complete",
"repair_action_clarity": "clear",
"viewer_panels": ["Trace", "Retrieval Funnel", "Replay Candidates", "Stage Details"],
"cli_steps": ["open viewer trace link", "compare recall before and after filter", "inspect replay candidates", "repair read profile or grant"],
"trace_evidence": ["trace-dropped-expected"],
"ux_gaps": []
},
"tags": ["synthetic", "operator_debugging_ux", "no_live_claim"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "operator-debug-provider-latency-001",
"suite": "operator_debugging_ux",
"title": "Debug provider latency degrading retrieval quality",
"corpus": {
"corpus_id": "operator-debugging-ux-2026-06-09",
"profile": "synthetic",
"items": [
{
"evidence_id": "trace-provider-timeout",
"kind": "trace",
"text": "Trace 33333333-3333-4333-8333-333333333333 records provider metadata with embedding provider latency near timeout and expansion fallback to the original query only.",
"source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-provider-timeout"}},
"created_at": "2026-06-09T02:10:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_operator_ux",
"answer": {
"content": "Provider latency caused expansion fallback to the original query only, which reduced candidate recall.",
"claims": [
{
"claim_id": "root_cause",
"text": "Provider latency caused expansion fallback to the original query only.",
"evidence_ids": ["trace-provider-timeout"],
"confidence": "high"
}
],
"evidence_ids": ["trace-provider-timeout"],
"latency_ms": 4.8,
"cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0}
}
}
},
"timeline": [
{
"event_id": "provider-timeout-recorded",
"ts": "2026-06-09T02:10:00Z",
"actor": "system",
"action": "captured_trace",
"evidence_ids": ["trace-provider-timeout"],
"summary": "Provider metadata and stage details recorded degraded expansion behavior."
}
],
"prompt": {
"role": "user",
"content": "Why did recall get worse during the slow provider window?",
"job_mode": "debug",
"constraints": ["cite_evidence", "state_blockers"]
},
"expected_answer": {
"must_include": [
{
"claim_id": "root_cause",
"text": "Provider latency caused expansion fallback to the original query only."
}
],
"must_not_include": ["The corpus did not contain the expected evidence."],
"evidence_links": {"root_cause": ["trace-provider-timeout"]},
"answer_type": "debug_report",
"accepted_alternates": [],
"requires_caveat": false,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "trace-provider-timeout",
"claim_id": "root_cause",
"requirement": "explain",
"quote": "embedding provider latency near timeout and expansion fallback to the original query only"
}
],
"negative_traps": [],
"scoring_rubric": {
"dimensions": {
"debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses provider and stage metadata."},
"evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites trace provider metadata."},
"workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests timeout or provider health repair."},
"latency_resource": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports latency as part of the root cause."}
},
"pass_threshold": 0.8,
"hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": ["The fixture does not provide that evidence."],
"fallback_action": "state_blocker"
},
"operator_debug": {
"failure_mode": "provider_latency_or_failure",
"trace_id": "33333333-3333-4333-8333-333333333333",
"viewer_url": "/viewer?trace_id=33333333-3333-4333-8333-333333333333",
"admin_trace_bundle_url": "/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200",
"root_cause": "Provider latency forced fallback behavior, shrinking expanded-query recall.",
"steps_to_root_cause": 3,
"raw_sql_needed": false,
"dropped_candidate_visibility": "visible as low recall counts rather than a post-recall drop",
"trace_completeness": "complete",
"repair_action_clarity": "clear",
"viewer_panels": ["Providers And Ranking", "Stage Summary", "Stage Details"],
"cli_steps": ["open trace bundle", "inspect provider metadata", "compare expanded queries", "raise timeout or repair provider health"],
"trace_evidence": ["trace-provider-timeout"],
"ux_gaps": []
},
"tags": ["synthetic", "operator_debugging_ux", "agentmemory_reference", "no_live_claim"]
}
Loading