From 79b66157aa6621a69d4aef0384880eb548a56767 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 19 Jun 2026 13:31:12 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Materialize OpenViking trajectory benchmark evidence","authority":"XY-983"} --- Makefile.toml | 52 +++++ README.md | 8 + ...ing-trajectory-materialization-report.json | 163 +++++++++++++++ .../tests/real_world_job_benchmark.rs | 192 ++++++++++++++++++ ...iking-trajectory-materialization-report.md | 118 +++++++++++ docs/evidence/benchmarking/index.md | 1 + docs/log.md | 9 + 7 files changed, 543 insertions(+) create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-19-openviking-trajectory-materialization-report.json create mode 100644 docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md diff --git a/Makefile.toml b/Makefile.toml index 0f76e427..b9b93a83 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -28,6 +28,9 @@ # | real-world-memory-core-archival | composite | | # | real-world-memory-core-archival-json | command | | # | real-world-memory-core-archival-report | command | | +# | real-world-memory-context-trajectory | composite | | +# | real-world-memory-context-trajectory-json | command | | +# | real-world-memory-context-trajectory-report | command | | # | real-world-memory-evolution | composite | | # | real-world-memory-evolution-json | command | | # | real-world-memory-evolution-report | command | | @@ -362,6 +365,55 @@ args = [ "tmp/real-world-memory/core-archival/report.md", ] +[tasks.real-world-memory-context-trajectory] +workspace = false +dependencies = [ + "real-world-memory-context-trajectory-report", +] + +[tasks.real-world-memory-context-trajectory-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/context_trajectory", + "--out", + "tmp/real-world-memory/context-trajectory/report.json", + "--run-id", + "real-world-memory-context-trajectory", + "--adapter-id", + "fixture_context_trajectory", + "--adapter-name", + "ELF context trajectory fixture", +] + +[tasks.real-world-memory-context-trajectory-report] +workspace = false +dependencies = [ + "real-world-memory-context-trajectory-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/context-trajectory/report.json", + "--out", + "tmp/real-world-memory/context-trajectory/report.md", +] + [tasks.real-world-memory-evolution] workspace = false dependencies = [ diff --git a/README.md b/README.md index 0cd8316a..55bf95e2 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,13 @@ provider-backed ELF evidence was required. intermediate candidate-drop stages are not exposed. This confirms ELF's narrow trace/stage visibility wins without erasing qmd's default top-k JSON and short CLI replay advantage. +- OpenViking trajectory materialization after XY-983: the June 19 context-trajectory + follow-up now has a dedicated repo task, + `cargo make real-world-memory-context-trajectory`, and a checked-in report + snapshot. The slice materializes 3 OpenViking trajectory/hierarchy/recursive jobs + as 0 pass, 0 wrong_result, and 3 typed blockers with 9/9 evidence coverage. This + improves auditability but does not remove the OpenViking context-trajectory gap or + support any ELF win, tie, or loss claim on those strengths. - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original @@ -293,6 +300,7 @@ Detailed evidence and interpretation: - [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) - [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md) - [qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026](docs/evidence/benchmarking/2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md) +- [OpenViking Trajectory Materialization Report - June 19, 2026](docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-19-openviking-trajectory-materialization-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-openviking-trajectory-materialization-report.json new file mode 100644 index 00000000..99282725 --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-openviking-trajectory-materialization-report.json @@ -0,0 +1,163 @@ +{ + "schema": "elf.openviking_trajectory_materialization_report/v1", + "report_id": "2026-06-19-openviking-trajectory-materialization", + "authority": "XY-983", + "created_at": "2026-06-19T05:23:25Z", + "purpose": "Materialize the OpenViking context-trajectory follow-up into scored benchmark evidence without converting blocked fixture rows into ELF win, tie, or loss claims.", + "source_baseline": { + "strength_profile_report": "apps/elf-eval/fixtures/report_snapshots/2026-06-11-qmd-openviking-strength-profile-report.json", + "dreaming_retest_report": "apps/elf-eval/fixtures/report_snapshots/2026-06-17-dreaming-competitor-strength-retest-report.json", + "context_trajectory_fixtures": "apps/elf-eval/fixtures/real_world_memory/context_trajectory/", + "june_17_follow_up_reference": "XY-928", + "current_follow_up_authority": "XY-983" + }, + "summary": { + "overall_judgment": "materialized_blocked_context_trajectory_evidence", + "broader_superiority": "not_proven", + "blockers_removed_count": 0, + "blocked_scenario_count": 3, + "pass_count": 0, + "wrong_result_count": 0, + "incomplete_count": 0, + "regressed_scenario_count": 0, + "unchanged_scenario_count": 3, + "evidence_coverage": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0, + "expected_evidence_recall": 1.0, + "unsupported_claim_count": 0, + "unsupported_claims_rejected": [ + "ELF does not beat OpenViking staged retrieval trajectory from fixture-only blocked rows.", + "ELF does not tie or beat OpenViking hierarchy selection until selected hierarchy nodes and evidence ids are materialized.", + "ELF does not tie or beat OpenViking recursive/context expansion until expansion paths and same-corpus evidence ids are materialized.", + "OpenViking setup reachability and same-corpus wrong_result evidence do not score its context-trajectory strengths." + ] + }, + "commands": [ + { + "command": "cargo make real-world-memory-context-trajectory", + "status": "pass", + "artifact_json": "tmp/real-world-memory/context-trajectory/report.json", + "artifact_markdown": "tmp/real-world-memory/context-trajectory/report.md", + "summary": { + "schema": "elf.real_world_job_report/v1", + "run_id": "real-world-memory-context-trajectory", + "job_count": 3, + "encoded_suite_count": 1, + "pass": 0, + "wrong_result": 0, + "blocked": 3, + "not_encoded": 0, + "evidence_required_count": 9, + "evidence_covered_count": 9, + "source_ref_required_count": 9, + "source_ref_covered_count": 9, + "quote_required_count": 9, + "quote_covered_count": 9 + } + } + ], + "scenario_materialization": [ + { + "scenario_id": "openviking_staged_retrieval_trajectory", + "job_id": "context-trajectory-openviking-staged-retrieval-001", + "surface": "staged retrieval trajectory", + "previous_status": "blocked", + "current_status": "blocked", + "judgment": "unchanged", + "blocker": "same_corpus_output_and_comparable_stage_artifacts_missing", + "materialized_artifact": "tmp/real-world-memory/context-trajectory/report.json", + "produced_evidence": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked", + "elf-comparison-requires-comparable-trajectory" + ], + "required_next_artifacts": [ + "OpenViking expected_evidence_ids, matched_evidence_ids, and missing_evidence_ids per same-corpus query.", + "OpenViking stage-level context trajectory output.", + "Equivalent ELF trace or search-session trajectory output for the same scenario." + ], + "claim_boundary": "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario." + }, + { + "scenario_id": "openviking_hierarchy_selection", + "job_id": "context-trajectory-openviking-hierarchy-selection-001", + "surface": "hierarchy selection", + "previous_status": "blocked", + "current_status": "blocked", + "judgment": "unchanged", + "blocker": "selected_hierarchy_nodes_and_evidence_ids_missing", + "materialized_artifact": "tmp/real-world-memory/context-trajectory/report.json", + "produced_evidence": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy", + "hierarchy-comparison-requires-elf-equivalent" + ], + "required_next_artifacts": [ + "Selected parent context.", + "Selected child context.", + "Final resource evidence ids.", + "Rejected sibling or decoy context." + ], + "claim_boundary": "OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists." + }, + { + "scenario_id": "openviking_recursive_context_expansion", + "job_id": "context-trajectory-openviking-recursive-expansion-001", + "surface": "recursive/context expansion", + "previous_status": "blocked", + "current_status": "blocked", + "judgment": "unchanged", + "blocker": "expansion_paths_and_same_corpus_evidence_ids_missing", + "materialized_artifact": "tmp/real-world-memory/context-trajectory/report.json", + "produced_evidence": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate", + "recursive-elf-comparison-gate" + ], + "required_next_artifacts": [ + "Seed context.", + "Expanded child contexts.", + "Final evidence ids.", + "Pruned branches." + ], + "claim_boundary": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario." + } + ], + "improvement_regression_readback": { + "improved": 0, + "regressed": 0, + "unchanged": 3, + "blocked": 3, + "conclusion": "The follow-up improved auditability by materializing the context-trajectory benchmark slice, but it did not improve the competitive status because all three OpenViking context-trajectory strengths remain blocked." + }, + "claim_boundaries": { + "allowed": [ + "The context-trajectory slice is now reproducible through cargo make real-world-memory-context-trajectory.", + "The three OpenViking context-trajectory fixtures preserve typed blocked states with 9/9 evidence coverage.", + "The current result is unchanged versus the June 11 and June 17 reports: OpenViking trajectory, hierarchy, and recursive/context expansion strengths are still not scored." + ], + "not_allowed": [ + "Do not claim ELF beats OpenViking on staged retrieval trajectory.", + "Do not claim ELF ties or beats OpenViking hierarchy selection.", + "Do not claim ELF ties or beats OpenViking recursive/context expansion.", + "Do not convert OpenViking same-corpus wrong_result evidence into a context-trajectory comparison win." + ] + }, + "next_optimization_direction": { + "recommended_lane": "OpenViking live adapter materializer", + "required_fields": [ + "expected_evidence_ids", + "matched_evidence_ids", + "missing_evidence_ids", + "stage_outputs", + "selected_parent_context", + "selected_child_context", + "final_resource_evidence_ids", + "rejected_or_pruned_contexts", + "expansion_path" + ], + "success_condition": "At least one OpenViking context-trajectory job moves from blocked to pass, wrong_result, or incomplete based on materialized staged, hierarchy, or expansion artifacts.", + "non_goal": "No ELF product change or superiority claim is authorized by this materialization-only report." + } +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a6fa7b0d..982aa8a7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -230,6 +230,18 @@ fn qmd_debug_ergonomics_dreaming_retest_report_markdown_path() -> Result Result { + report_snapshot_path("2026-06-19-openviking-trajectory-materialization-report.json") +} + +fn openviking_trajectory_materialization_report_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("evidence") + .join("benchmarking") + .join("2026-06-19-openviking-trajectory-materialization-report.md")) +} + fn live_temporal_reconciliation_report_json_path() -> Result { report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") } @@ -3062,6 +3074,186 @@ fn assert_qmd_debug_retest_markdown_and_indexes( assert!(readme.contains("keeps the qmd edge unchanged")); } +#[test] +fn openviking_trajectory_materialization_report_preserves_blocked_gates() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + openviking_trajectory_materialization_report_json_path()?, + )?)?; + let markdown = + fs::read_to_string(openviking_trajectory_materialization_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_openviking_trajectory_materialization_summary(&report)?; + assert_openviking_trajectory_materialization_command(&report)?; + assert_openviking_trajectory_materialization_scenarios(&report)?; + assert_openviking_trajectory_materialization_boundaries(&report)?; + assert_openviking_trajectory_materialization_markdown_and_indexes( + &markdown, + &benchmarking_index, + &readme, + ); + + Ok(()) +} + +fn assert_openviking_trajectory_materialization_summary(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.openviking_trajectory_materialization_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-983")); + assert_eq!( + report.pointer("/summary/overall_judgment").and_then(Value::as_str), + Some("materialized_blocked_context_trajectory_evidence") + ); + assert_eq!( + report.pointer("/summary/broader_superiority").and_then(Value::as_str), + Some("not_proven") + ); + assert_eq!(report.pointer("/summary/blockers_removed_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked_scenario_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/regressed_scenario_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert!(array_contains_str( + report, + "/summary/unsupported_claims_rejected", + "ELF does not beat OpenViking staged retrieval trajectory from fixture-only blocked rows." + )?); + + Ok(()) +} + +fn assert_openviking_trajectory_materialization_command(report: &Value) -> Result<()> { + let command = find_by_field( + array_at(report, "/commands")?, + "/command", + "cargo make real-world-memory-context-trajectory", + )?; + let summary = + command.pointer("/summary").ok_or_else(|| eyre::eyre!("missing command summary"))?; + + assert_eq!(command.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + command.pointer("/artifact_json").and_then(Value::as_str), + Some("tmp/real-world-memory/context-trajectory/report.json") + ); + assert_eq!(summary.pointer("/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(summary.pointer("/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(summary.pointer("/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(summary.pointer("/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(summary.pointer("/evidence_covered_count").and_then(Value::as_u64), Some(9)); + assert_eq!(summary.pointer("/source_ref_covered_count").and_then(Value::as_u64), Some(9)); + assert_eq!(summary.pointer("/quote_covered_count").and_then(Value::as_u64), Some(9)); + + Ok(()) +} + +fn assert_openviking_trajectory_materialization_scenarios(report: &Value) -> Result<()> { + let scenarios = array_at(report, "/scenario_materialization")?; + let staged = + find_by_field(scenarios, "/scenario_id", "openviking_staged_retrieval_trajectory")?; + let hierarchy = find_by_field(scenarios, "/scenario_id", "openviking_hierarchy_selection")?; + let recursive = + find_by_field(scenarios, "/scenario_id", "openviking_recursive_context_expansion")?; + + assert_eq!(scenarios.len(), 3); + + for scenario in [staged, hierarchy, recursive] { + assert_eq!(scenario.pointer("/previous_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scenario.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scenario.pointer("/judgment").and_then(Value::as_str), Some("unchanged")); + } + + assert!(array_contains_str( + staged, + "/produced_evidence", + "openviking-evidence-id-output-contract" + )?); + assert!(array_contains_str( + hierarchy, + "/produced_evidence", + "hierarchy-selection-output-contract" + )?); + assert!(array_contains_str( + recursive, + "/produced_evidence", + "recursive-expansion-output-contract" + )?); + assert_eq!( + staged.pointer("/claim_boundary").and_then(Value::as_str), + Some( + "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario." + ) + ); + assert_eq!( + hierarchy.pointer("/blocker").and_then(Value::as_str), + Some("selected_hierarchy_nodes_and_evidence_ids_missing") + ); + assert_eq!( + recursive.pointer("/blocker").and_then(Value::as_str), + Some("expansion_paths_and_same_corpus_evidence_ids_missing") + ); + + Ok(()) +} + +fn assert_openviking_trajectory_materialization_boundaries(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/improvement_regression_readback/improved").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/improvement_regression_readback/blocked").and_then(Value::as_u64), + Some(3) + ); + assert!(array_contains_str( + report, + "/claim_boundaries/allowed", + "The context-trajectory slice is now reproducible through cargo make real-world-memory-context-trajectory." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries/not_allowed", + "Do not claim ELF beats OpenViking on staged retrieval trajectory." + )?); + assert!(array_contains_str( + report, + "/next_optimization_direction/required_fields", + "expansion_path" + )?); + assert_eq!( + report.pointer("/next_optimization_direction/non_goal").and_then(Value::as_str), + Some( + "No ELF product change or superiority claim is authorized by this materialization-only report." + ) + ); + + Ok(()) +} + +fn assert_openviking_trajectory_materialization_markdown_and_indexes( + markdown: &str, + benchmarking_index: &str, + readme: &str, +) { + assert!(markdown.contains("The OpenViking trajectory follow-up is now materialized")); + assert!(markdown.contains("3 encoded jobs, 0 pass, 3 blocked, 9/9 evidence coverage")); + assert!(markdown.contains("Do not claim ELF beats OpenViking on staged retrieval trajectory.")); + assert!(markdown.contains("OpenViking context-trajectory job can move from `blocked`")); + assert!( + benchmarking_index.contains("2026-06-19-openviking-trajectory-materialization-report.md") + ); + assert!(readme.contains("OpenViking Trajectory Materialization Report - June 19, 2026")); + assert!(readme.contains("cargo make real-world-memory-context-trajectory")); + assert!(readme.contains("3 typed blockers with 9/9 evidence coverage")); +} + fn assert_xy955_commands(report: &Value) -> Result<()> { let commands = array_at(report, "/commands")?; let aggregate = find_by_field(commands, "/command", "cargo make real-world-memory")?; diff --git a/docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md b/docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md new file mode 100644 index 00000000..7513738c --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md @@ -0,0 +1,118 @@ +--- +type: Evidence +title: "OpenViking Trajectory Materialization Report - June 19, 2026" +description: "Checked-in benchmark evidence record: OpenViking Trajectory Materialization Report - June 19, 2026." +resource: docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-19 +tags: + - docs + - evidence + - benchmarking +--- +# OpenViking Trajectory Materialization Report - June 19, 2026 + +Goal: Close XY-983 by materializing the OpenViking context-trajectory follow-up +into reproducible benchmark evidence without turning blocked fixture rows into +ELF win, tie, or loss claims. +Read this when: You need to know whether OpenViking staged retrieval trajectory, +hierarchy selection, or recursive/context expansion blockers were removed after the +Dreaming competitor-strength retest. +Inputs: +`apps/elf-eval/fixtures/report_snapshots/2026-06-19-openviking-trajectory-materialization-report.json`, +`apps/elf-eval/fixtures/real_world_memory/context_trajectory/`, +`docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md`, +and `docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md`. +Outputs: Scenario-level materialization readback for OpenViking staged retrieval, +hierarchy selection, and recursive/context expansion gates. + +## Executive Judgment + +The OpenViking trajectory follow-up is now materialized as a dedicated benchmark +slice, but the competitive status is unchanged. + +`cargo make real-world-memory-context-trajectory` runs the three checked-in +`context_trajectory` fixtures and publishes: + +- 3 jobs. +- 0 pass. +- 0 wrong_result. +- 3 blocked. +- 9/9 expected evidence matched. +- 9/9 source refs covered. +- 9/9 required quotes covered. + +This improves auditability because the OpenViking trajectory blocker now has a +small, named repo task and a checked-in report snapshot. It does not remove the +blocker. ELF still has no scored win, tie, or loss against OpenViking staged +retrieval trajectory, hierarchy selection, or recursive/context expansion. + +## Command Evidence + +| Command | Status | Artifact | Result | +| --- | --- | --- | --- | +| `cargo make real-world-memory-context-trajectory` | `pass` | `tmp/real-world-memory/context-trajectory/report.json` and `tmp/real-world-memory/context-trajectory/report.md` | 3 encoded jobs, 0 pass, 3 blocked, 9/9 evidence coverage. | + +No dedicated live OpenViking trajectory adapter was run in this lane. The command +uses the checked-in fixture contract to preserve the exact blocker and the artifact +shape required before a live comparison can be scored. + +## Scenario Materialization + +| Scenario | Previous status | Current status | Judgment | Required next artifact | +| --- | --- | --- | --- | --- | +| OpenViking staged retrieval trajectory | `blocked` | `blocked` | `unchanged` | Same-corpus expected/matched/missing evidence ids plus stage-level trajectory output for the same prompt. | +| OpenViking hierarchy selection | `blocked` | `blocked` | `unchanged` | Selected parent context, selected child context, final resource evidence ids, and rejected sibling or decoy context. | +| OpenViking recursive/context expansion | `blocked` | `blocked` | `unchanged` | Seed context, expanded child contexts, final evidence ids, and pruned branches. | + +## Improvement and Regression Readback + +| Bucket | Count | Meaning | +| --- | --- | --- | +| `improved` | 0 | No OpenViking context-trajectory strength moved from blocked to pass, wrong_result, or incomplete. | +| `regressed` | 0 | No checked scenario moved backward. | +| `unchanged` | 3 | All three trajectory/hierarchy/recursive scenarios remain typed blocked. | +| `blocked` | 3 | Every encoded OpenViking context-trajectory job still waits on materialized staged output. | + +The useful improvement is operational, not competitive: future agents can now run a +single repo task to reproduce the exact blocked slice instead of rediscovering the +fixture directory and long runner command. + +## Claim Boundaries + +Allowed: + +- The OpenViking context-trajectory slice is reproducible through + `cargo make real-world-memory-context-trajectory`. +- The three OpenViking trajectory fixtures preserve typed blocked states with + full evidence, source-ref, and quote coverage. +- The current result is unchanged versus the June 11 and June 17 reports. + +Not allowed: + +- Do not claim ELF beats OpenViking on staged retrieval trajectory. +- Do not claim ELF ties or beats OpenViking hierarchy selection. +- Do not claim ELF ties or beats OpenViking recursive/context expansion. +- Do not convert OpenViking same-corpus wrong_result evidence into a + context-trajectory comparison win. + +## Next Optimization Direction + +The next useful lane is a real OpenViking live adapter materializer. It needs to emit: + +1. `expected_evidence_ids`. +2. `matched_evidence_ids`. +3. `missing_evidence_ids`. +4. Stage outputs. +5. Selected parent and child contexts. +6. Final resource evidence ids. +7. Rejected or pruned contexts. +8. Expansion paths. + +The success condition is not "ELF wins." The success condition is that at least one +OpenViking context-trajectory job can move from `blocked` to `pass`, +`wrong_result`, or `incomplete` based on comparable staged, hierarchy, or expansion +artifacts. Until then, the correct conclusion remains: OpenViking-style context +trajectory is an unresolved evidence gap for ELF. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 609f4cf0..836a6f2e 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -36,4 +36,5 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-16-proactive-brief-scoring-report.md`: Proactive Brief Scoring Report - June 16, 2026. - `2026-06-16-scheduled-memory-task-scoring-report.md`: Real-World Job Benchmark Report. - `2026-06-17-dreaming-competitor-strength-retest-report.md`: Dreaming Competitor-Strength Retest Report - June 17, 2026. +- `2026-06-19-openviking-trajectory-materialization-report.md`: OpenViking Trajectory Materialization Report - June 19, 2026; materializes the context-trajectory fixture slice through a dedicated repo task while preserving staged retrieval, hierarchy selection, and recursive/context expansion as typed blockers. - `2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md`: qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026; confirms qmd's default top-k/replay edge is unchanged while ELF keeps the narrow operator-debug trace/stage visibility wins. diff --git a/docs/log.md b/docs/log.md index 8f352cad..a56ef411 100644 --- a/docs/log.md +++ b/docs/log.md @@ -32,3 +32,12 @@ logs. - Moved retained plan artifacts from the legacy plans top-level lane to `docs/reference/plans/` so the top-level docs directories match the Decodex docs lane set. + +## 2026-06-19 + +- Added the OpenViking trajectory materialization evidence report and snapshot for + XY-983, preserving staged retrieval, hierarchy selection, and recursive/context + expansion as typed blockers until comparable staged artifacts exist. +- Added `cargo make real-world-memory-context-trajectory` as the reproducible + context-trajectory benchmark entrypoint and linked the new report from the + benchmarking evidence index and README.